Imports

In [1]:
import os
import sys

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA

# Add parent directory and import utils
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import utils

# Table Loading

In [2]:
games = utils.df_from_table('rocket_league.games', 'date')
# games_players = utils.df_from_table('rocket_league.games_players')

# Variable distribution

There will be a couple of methods to explore the distributions of different variables. First, we want to see all the possible variables.

<table style="text-align: center; margin: 1rem auto">
<thead>
  <tr>
    <th colspan="3" style="text-align: center;">Game Variables</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td>created_at</td>
    <td>id</td>
    <td>overtime</td>
  </tr>
  <tr>
    <td>updated_at</td>
    <td>octane_id</td>
    <td>flip_ballchasing</td>
  </tr>
  <tr>
    <td>event_id</td>
    <td>number</td>
    <td>ballchasing</td>
  </tr>
  <tr>
    <td>stage_id</td>
    <td>duration</td>
    <td>map_id</td>
  </tr>
  <tr>
    <td>match_id</td>
    <td>date</td>
    <td>map_name</td>
  </tr>
</tbody>

<table style="text-align: center; margin: 1rem auto">
<thead>
  <tr>
    <th colspan="3" style="text-align: center;">Team Variables</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td>assists</td>
    <td>boost_count_stolen_big</td>
    <td>movement_time_slow_speed</td>
  </tr>
  <tr>
    <td>ball_possession_time</td>
    <td>boost_count_stolen_small</td>
    <td>movement_time_supersonic_speed</td>
  </tr>
  <tr>
    <td>ball_time_in_side</td>
    <td>boost_time_boost0_to25</td>
    <td>movement_total_distance</td>
  </tr>
  <tr>
    <td>boost_amount_collected</td>
    <td>boost_time_boost25_to50</td>
    <td>positioning_time_behind_ball</td>
  </tr>
  <tr>
    <td>boost_amount_collected_big</td>
    <td>boost_time_boost50_to75</td>
    <td>positioning_time_defensive_half</td>
  </tr>
  <tr>
    <td>boost_amount_collected_small</td>
    <td>boost_time_boost75_to100</td>
    <td>positioning_time_defensive_third</td>
  </tr>
  <tr>
    <td>boost_amount_overfill</td>
    <td>boost_time_full_boost</td>
    <td>positioning_time_infront_ball</td>
  </tr>
  <tr>
    <td>boost_amount_overfill_stolen</td>
    <td>boost_time_zero_boost</td>
    <td>positioning_time_neutral_third</td>
  </tr>
  <tr>
    <td>boost_amount_stolen</td>
    <td>demo_inflicted</td>
    <td>positioning_time_offensive_half</td>
  </tr>
  <tr>
    <td>boost_amount_stolen_big</td>
    <td>demo_taken</td>
    <td>positioning_time_offensive_third</td>
  </tr>
  <tr>
    <td>boost_amount_stolen_small</td>
    <td>goals</td>
    <td>saves</td>
  </tr>
  <tr>
    <td>boost_amount_used_while_supersonic</td>
    <td>movement_count_powerslide</td>
    <td>score</td>
  </tr>
  <tr>
    <td>boost_avg_amount</td>
    <td>movement_time_boost_speed</td>
    <td>shooting_percentage</td>
  </tr>
  <tr>
    <td>boost_bcpm</td>
    <td>movement_time_ground</td>
    <td>shots</td>
  </tr>
  <tr>
    <td>boost_bpm</td>
    <td>movement_time_high_air</td>
    <td>team_id</td>
  </tr>
  <tr>
    <td>boost_count_collected_big</td>
    <td>movement_time_low_air</td>
    <td>winner</td>
  </tr>
  <tr>
    <td>boost_count_collected_small</td>
    <td>movement_time_powerslide</td>
    <td></td>
  </tr>
</tbody>
</table>

In [3]:
game_variables = list()
team_variables = set()

for item in games.columns:
    str_list = item.split('_')
    if str_list[0] in {'blue', 'orange'}:
        team_variables.add('_'.join(str_list[1:]))
    else:
        game_variables.append(item)

team_variables = list(team_variables)

print('Game Variables:\n\t', game_variables)
print('Team Variables:\n\t', team_variables)

Game Variables:
	 ['created_at', 'updated_at', 'event_id', 'stage_id', 'match_id', 'id', 'octane_id', 'number', 'duration', 'date', 'overtime', 'flip_ballchasing', 'ballchasing', 'map_id', 'map_name']
Team Variables:
	 ['positioning_time_defensive_third', 'boost_time_full_boost', 'boost_time_boost75_to100', 'boost_amount_collected_big', 'positioning_time_offensive_half', 'ball_time_in_side', 'positioning_time_infront_ball', 'boost_time_boost0_to25', 'boost_amount_stolen', 'boost_count_stolen_small', 'movement_time_ground', 'score', 'boost_time_boost25_to50', 'saves', 'boost_amount_used_while_supersonic', 'shooting_percentage', 'boost_avg_amount', 'goals', 'movement_time_powerslide', 'positioning_time_neutral_third', 'movement_count_powerslide', 'positioning_time_behind_ball', 'boost_count_collected_small', 'boost_time_boost50_to75', 'positioning_time_defensive_half', 'boost_bpm', 'demo_taken', 'boost_amount_stolen_big', 'positioning_time_offensive_third', 'boost_amount_collected', 'sho

## Unify Two Teams

In [4]:
blue = games[game_variables + ['blue_' + item for item in team_variables]].copy()
blue.rename(columns={'blue_' + item: item for item in team_variables}, inplace=True)
blue['color'] = 'blue'

orange = games[game_variables + ['orange_' + item for item in team_variables]].copy()
orange.rename(columns={'orange_' + item: item for item in team_variables}, inplace=True)
orange['color'] = 'orange'

unified = pd.concat([blue, orange]).reset_index(drop=True)
del blue
del orange

unified

Unnamed: 0,created_at,updated_at,event_id,stage_id,match_id,id,octane_id,number,duration,date,...,movement_time_low_air,movement_time_high_air,boost_bcpm,movement_time_slow_speed,boost_amount_stolen_small,movement_total_distance,boost_count_collected_big,boost_time_zero_boost,ball_possession_time,color
0,2022-11-28 11:41:33.407230,2022-11-28 11:41:33.407230,5f35882d53fbbb5894b43035,0,6043147591504896348ebd1f,6043147c91504896348ec0ed,860101.0,4,300.0,2015-09-08,...,,,,,,,,,,blue
1,2022-11-28 11:41:33.407230,2022-11-28 11:41:33.407230,5f35882d53fbbb5894b43035,0,6043147591504896348ebd20,6043148391504896348ec527,860102.0,5,300.0,2015-09-08,...,,,,,,,,,,blue
2,2022-11-28 11:41:38.805554,2022-11-28 11:41:38.805554,5f35882d53fbbb5894b43035,0,6043147591504896348ebd36,6043149791504896348ed008,860105.0,5,300.0,2015-09-08,...,,,,,,,,,,blue
3,2022-11-28 11:41:38.805554,2022-11-28 11:41:38.805554,5f35882d53fbbb5894b43035,0,6043147591504896348ebd36,6043149791504896348ed01b,860105.0,6,300.0,2015-09-08,...,,,,,,,,,,blue
4,2022-11-28 11:41:33.407230,2022-11-28 11:41:33.407230,5f35882d53fbbb5894b43035,0,6043147591504896348ebd20,6043148391504896348ec529,860102.0,6,300.0,2015-09-08,...,,,,,,,,,,blue
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199089,2022-11-28 11:57:37.839562,2022-11-28 11:57:37.839562,61f1a1b3da9d7ca1c7ba659e,1,61f1a320c437fde7e02ce0ce,61f1a4f4da9d7ca1c7ba65a0,,2,300.0,NaT,...,,,,,,,,,,orange
199090,2022-11-28 11:57:37.839562,2022-11-28 11:57:37.839562,61f1a1b3da9d7ca1c7ba659e,1,61f1a320c437fde7e02ce0ce,61f1a4d6c437fde7e02ce0dc,,1,300.0,NaT,...,,,,,,,,,,orange
199091,2022-11-28 11:57:37.839562,2022-11-28 11:57:37.839562,61ed595eda9d7ca1c7ba5c7c,0,61ed59a1da9d7ca1c7ba5c82,61f1a100da9d7ca1c7ba6597,,5,300.0,NaT,...,,,,,,,,,,orange
199092,2022-11-28 11:57:37.839562,2022-11-28 11:57:37.839562,61ed595eda9d7ca1c7ba5c7c,0,61ed59a1da9d7ca1c7ba5c82,61f1a0cfda9d7ca1c7ba6590,,4,300.0,NaT,...,,,,,,,,,,orange


## Histograms by team variables

In [19]:

variable = 'saves'
title = f'Histogram of {variable} by team color.'

fig = go.Figure(layout=utils.layout_dict | dict(title=title))

fig.add_trace(go.Histogram(x=games['blue_' + variable], name='Blue', histnorm='probability'))
fig.add_trace(go.Histogram(x=games['orange_' + variable], name='Orange', histnorm='probability'))
fig.add_trace(go.Histogram(x=unified[variable], name='Unified', histnorm='probability'))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.5)
fig.show()

## Correlations

In [24]:
corr = unified[team_variables].corr(numeric_only=True)
fig = px.imshow(corr, zmin=-1, zmax=1)
fig.show(renderer='browser')

## Fix Missing Values

In [14]:
# TODO: There are approx 42 nullable cols. Set a max null threshold and delete rows above it. Then fill the remaining rows.

nullable = {
    'octane_id', 'flip_ballchasing', 'duration', 'date', 'overtime',
    'ballchasing', 'map_id', 'map_name', 'winner'
}

complete = unified.copy()
complete['winner'].fillna(False, inplace=True)
teams = complete['team_id'].unique()
w = 10
for team in teams:
    team_df = complete.loc[complete['team_id'] == team]
    for i in range(len(team_df)):
        elem = team_df.iloc[i]
        for item in list(set(elem[elem.isna()].keys()) - nullable):
            elem[item] = team_df.iloc[max(0, i - w):min(len(team_df), i + w)][item].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  elem[item] = team_df.iloc[max(0, i - w):min(len(team_df), i + w)][item].mean()


KeyboardInterrupt: 

In [8]:
nullable = {
    'octane_id', 'flip_ballchasing', 'duration', 'date', 'overtime',
    'ballchasing', 'map_id', 'map_name', 'winner'
}

complete = unified.copy()
complete['winner'].fillna(False, inplace=True)
teams = complete['team_id'].unique()
w = 10
team = teams[2]
# for team in teams:
# team_np = complete.loc[complete['team_id'] == team, team_variables].to_numpy()
team_np = complete.loc[complete['team_id'] == team]
team_np

Unnamed: 0,created_at,updated_at,event_id,stage_id,match_id,id,octane_id,number,duration,date,...,movement_time_low_air,movement_time_high_air,boost_bcpm,movement_time_slow_speed,boost_amount_stolen_small,movement_total_distance,boost_count_collected_big,boost_time_zero_boost,ball_possession_time,color
6,2022-11-28 11:41:38.805554,2022-11-28 11:41:38.805554,5f35882d53fbbb5894b43035,0,6043147591504896348ebd34,6043149091504896348eccdc,860104.0,6,300.0,2015-09-08,...,,,,,,,,,,blue
13,2022-11-28 11:41:38.805554,2022-11-28 11:41:38.805554,5f35882d53fbbb5894b43035,0,6043147591504896348ebd34,6043149091504896348eccdb,860104.0,5,300.0,2015-09-08,...,,,,,,,,,,blue
14,2022-11-28 11:41:38.805554,2022-11-28 11:41:38.805554,5f35882d53fbbb5894b43035,0,6043147591504896348ebd34,6043149091504896348eccc2,860104.0,4,300.0,2015-09-08,...,,,,,,,,,,blue
16,2022-11-28 11:41:38.805554,2022-11-28 11:41:38.805554,5f35882d53fbbb5894b43035,0,6043147591504896348ebd34,6043149091504896348eccc0,860104.0,3,300.0,2015-09-08,...,,,,,,,,,,blue
17,2022-11-28 11:41:38.805554,2022-11-28 11:41:38.805554,5f35882d53fbbb5894b43035,0,6043147591504896348ebd34,6043149091504896348eccbe,860104.0,2,300.0,2015-09-08,...,,,,,,,,,,blue
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109394,2022-11-28 11:44:27.811755,2022-11-28 11:44:27.811755,5f35882d53fbbb5894b430bb,0,60431853a09e7fba40d4319a,60431880a09e7fba40d44275,2930107.0,5,300.0,2019-02-24,...,,,,,,,,,,orange
109395,2022-11-28 11:44:27.811755,2022-11-28 11:44:27.811755,5f35882d53fbbb5894b430bb,0,60431853a09e7fba40d4319a,60431880a09e7fba40d44255,2930107.0,4,300.0,2019-02-24,...,,,,,,,,,,orange
109409,2022-11-28 11:44:27.811755,2022-11-28 11:44:27.811755,5f35882d53fbbb5894b430bb,0,60431853a09e7fba40d4319a,60431880a09e7fba40d4423b,2930107.0,3,423.0,2019-02-24,...,,,,,,,,,,orange
109410,2022-11-28 11:44:27.811755,2022-11-28 11:44:27.811755,5f35882d53fbbb5894b430bb,0,60431853a09e7fba40d4319a,60431880a09e7fba40d44238,2930107.0,2,300.0,2019-02-24,...,,,,,,,,,,orange


In [31]:
missing = unified[team_variables + ['color']].groupby('team_id', as_index=False).count()
missing[list(set(team_variables) - {'team_id'})] = missing[list(set(team_variables) - {'team_id'})].div(missing['color'], axis=0)
missing

Unnamed: 0,team_id,positioning_time_defensive_third,boost_time_full_boost,boost_time_boost75_to100,boost_amount_collected_big,positioning_time_offensive_half,ball_time_in_side,positioning_time_infront_ball,boost_time_boost0_to25,boost_amount_stolen,...,movement_time_low_air,movement_time_high_air,boost_bcpm,movement_time_slow_speed,boost_amount_stolen_small,movement_total_distance,boost_count_collected_big,boost_time_zero_boost,ball_possession_time,color
0,6020bc70f1e4807cc7002386,0.698732,0.698732,0.698732,0.698732,0.698732,0.698203,0.698732,0.698732,0.698732,...,0.698732,0.698732,0.698732,0.698732,0.698732,0.698732,0.698732,0.698732,0.698203,1892
1,6020bc70f1e4807cc7002387,0.440678,0.440678,0.440678,0.440678,0.440678,0.440678,0.440678,0.440678,0.440678,...,0.440678,0.440678,0.440678,0.440678,0.440678,0.440678,0.440678,0.440678,0.440678,118
2,6020bc70f1e4807cc7002389,0.698848,0.698848,0.698848,0.698848,0.698848,0.698848,0.698848,0.698848,0.698848,...,0.698848,0.698848,0.698848,0.698848,0.698848,0.698848,0.698848,0.698848,0.698848,1823
3,6020bc70f1e4807cc700239c,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,...,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,30
4,6020bc70f1e4807cc700239d,0.674033,0.674033,0.674033,0.674033,0.674033,0.674033,0.674033,0.674033,0.674033,...,0.674033,0.674033,0.674033,0.674033,0.674033,0.674033,0.674033,0.674033,0.674033,724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3787,638c607df73a2c40baeef536,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,9
3788,638c6083f73a2c40baeef537,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,9
3789,638cb1da5a20c5676abeea0b,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,14
3790,638cb1e45a20c5676abeea0c,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,3


In [50]:
# unified[list(set(team_variables) - {'team_id', 'winner', 'color'})].isna().all(axis=1).astype(int).sum()
(unified[list(set(team_variables) - {'team_id', 'winner', 'color'})].isna().astype(int).sum(axis=1) >= 42).astype(int).sum()

44984

In [7]:
missing = unified.count() / len(unified)
for key in missing.keys():
    print(f'{key}: {missing[key] * 100:.2f}%')

created_at: 100.00%
updated_at: 100.00%
event_id: 100.00%
stage_id: 100.00%
match_id: 100.00%
id: 100.00%
octane_id: 53.88%
number: 100.00%
duration: 99.99%
date: 98.91%
overtime: 20.34%
flip_ballchasing: 19.85%
ballchasing: 77.41%
map_id: 77.41%
map_name: 99.59%
boost_amount_overfill_stolen: 77.41%
boost_amount_used_while_supersonic: 77.41%
team_id: 100.00%
boost_count_collected_big: 77.41%
demo_inflicted: 77.41%
boost_amount_collected_big: 77.41%
boost_amount_collected: 77.41%
shots: 100.00%
positioning_time_offensive_third: 77.41%
boost_avg_amount: 77.41%
boost_time_boost75_to100: 77.41%
boost_amount_stolen_small: 77.41%
score: 100.00%
movement_time_ground: 77.41%
boost_time_boost50_to75: 77.41%
saves: 100.00%
demo_taken: 77.41%
boost_time_boost0_to25: 77.41%
boost_amount_collected_small: 77.41%
movement_time_slow_speed: 77.41%
positioning_time_defensive_half: 77.41%
movement_time_low_air: 77.41%
positioning_time_behind_ball: 77.41%
ball_possession_time: 76.70%
boost_bcpm: 77.41%
mo

## Variable Groupings

In [50]:
groups = {
    'attack': {
        'vars': ['assists', 'ball_possession_time', 'goals', 'positioning_time_behind_ball', 
        'positioning_time_infront_ball', 'positioning_time_offensive_half', 'positioning_time_offensive_third',
        'shooting_percentage', 'shots'],
        'neg': ['positioning_time_infront_ball']
    },
    'defense': {
        'vars': ['ball_possession_time', 'ball_time_in_side', 'positioning_time_behind_ball', 'positioning_time_infront_ball',
		    'positioning_time_defensive_half', 'positioning_time_defensive_third', 'saves'],
        'neg': []
    },
    'movement': {
        'vars': ['boost_avg_amount',  'boost_bcpm', 'boost_bpm', 'boost_time_boost0_to25',
		    'boost_time_boost25_to50', 'boost_time_boost50_to75', 'boost_time_boost75_to100', 
		    'boost_time_full_boost', 'boost_time_zero_boost', 'movement_count_powerslide', 
		    'movement_time_boost_speed', 'movement_time_ground', 'movement_time_high_air', 
		    'movement_time_low_air', 'movement_time_powerslide', 'movement_time_slow_speed', 
		    'movement_time_supersonic_speed', 'movement_total_distance'],
        'neg': []
    },
    'boost_management': {
        'vars': ['boost_amount_collected', 'boost_amount_collected_big', 'boost_amount_collected_small',
            'boost_amount_overfill', 'boost_amount_overfill_stolen', 'boost_amount_stolen', 
            'boost_amount_stolen_big', 'boost_amount_stolen_small', 'boost_amount_used_while_supersonic', 
            'boost_avg_amount', 'boost_bcpm', 'boost_bpm', 'boost_count_collected_big',
            'boost_count_collected_small', 'boost_count_stolen_big', 'boost_count_stolen_small', 
            'boost_time_boost0_to25', 'boost_time_boost25_to50', 'boost_time_boost50_to75', 
            'boost_time_boost75_to100', 'boost_time_full_boost', 'boost_time_zero_boost'],
        'neg': []
    },
    'aggression': {
        'vars': ['boost_amount_stolen', 'boost_amount_stolen_big', 'boost_amount_stolen_small', 
            'boost_count_stolen_big', 'boost_count_stolen_small', 'demo_inflicted', 'demo_taken', 
            'positioning_time_offensive_half', 'positioning_time_offensive_third'],
        'neg': []
    }
}


pca = PCA(n_components=1)
attribute = 'attack'
# for attribute in groups.keys():
X = unified[['team_id'] + groups[attribute]['vars']].copy()
X = X.groupby('team_id', as_index=False).transform(lambda x: x.fillna(x.mean()))
X.dropna(inplace=True)
X
# for item in X.columns:
#     X[item] = X[['team_id', item]].groupby('team_id').transform(lambda x: x.fillna(x.mean()))
# for item in groups[attribute]['neg']:
#     X[item] = -X[item]
# unified[attribute] = pca.fit_transform(X)
# pca.fit_transform(X.drop(columns=['team_id']))

# unified

Unnamed: 0,team_id,assists,ball_possession_time,goals,positioning_time_behind_ball,positioning_time_infront_ball,positioning_time_offensive_half,positioning_time_offensive_third,shooting_percentage,shots
0,6020bc70f1e4807cc700242f,1,139.39,1,719.36000,231.12000,413.66998,236.28000,16.666667,6
1,6020bc70f1e4807cc700242f,0,143.65,0,705.83997,305.69998,339.83002,224.70999,0.000000,8
2,6020bc70f1e4807cc700242e,0,138.73,0,618.28000,323.31000,314.83002,185.38000,0.000000,7
3,6020bc70f1e4807cc700242f,2,128.06,2,674.35000,385.56000,389.96000,231.00000,50.000000,4
4,6020bc70f1e4807cc700242e,2,193.32,2,918.31006,365.66998,567.18000,347.70000,16.666667,12
...,...,...,...,...,...,...,...,...,...,...
199089,6020bc70f1e4807cc7002386,1,156.11,2,765.73004,241.31000,423.95000,256.49002,25.000000,8
199090,6020bc70f1e4807cc7002386,1,193.07,2,929.79990,437.59003,455.82000,269.20000,50.000000,4
199091,6020bc70f1e4807cc7002408,1,136.02,2,707.99000,289.90000,360.20000,202.98000,22.222222,9
199092,6020bc70f1e4807cc7002408,2,124.83,2,766.25000,312.96002,357.77997,204.07000,18.181818,11
