Imports

In [1]:
import os
import sys

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression as LR
from sklearn.decomposition import PCA

# Add parent directory and import utils
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import utils

# Table Loading

In [2]:
games = utils.df_from_table('rocket_league.games', 'date')
# games_players = utils.df_from_table('rocket_league.games_players')

# Variable distribution

There will be a couple of methods to explore the distributions of different variables. First, we want to see all the possible variables.

<table style="text-align: center; margin: 1rem auto">
<thead>
  <tr>
    <th colspan="3" style="text-align: center;">Game Variables</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td>created_at</td>
    <td>id</td>
    <td>overtime</td>
  </tr>
  <tr>
    <td>updated_at</td>
    <td>octane_id</td>
    <td>flip_ballchasing</td>
  </tr>
  <tr>
    <td>event_id</td>
    <td>number</td>
    <td>ballchasing</td>
  </tr>
  <tr>
    <td>stage_id</td>
    <td>duration</td>
    <td>map_id</td>
  </tr>
  <tr>
    <td>match_id</td>
    <td>date</td>
    <td>map_name</td>
  </tr>
</tbody>

<table style="text-align: center; margin: 1rem auto">
<thead>
  <tr>
    <th colspan="3" style="text-align: center;">Team Variables</th>
  </tr>
</thead>
<tbody>
  <tr>
    <td>assists</td>
    <td>boost_count_stolen_big</td>
    <td>movement_time_slow_speed</td>
  </tr>
  <tr>
    <td>ball_possession_time</td>
    <td>boost_count_stolen_small</td>
    <td>movement_time_supersonic_speed</td>
  </tr>
  <tr>
    <td>ball_time_in_side</td>
    <td>boost_time_boost0_to25</td>
    <td>movement_total_distance</td>
  </tr>
  <tr>
    <td>boost_amount_collected</td>
    <td>boost_time_boost25_to50</td>
    <td>positioning_time_behind_ball</td>
  </tr>
  <tr>
    <td>boost_amount_collected_big</td>
    <td>boost_time_boost50_to75</td>
    <td>positioning_time_defensive_half</td>
  </tr>
  <tr>
    <td>boost_amount_collected_small</td>
    <td>boost_time_boost75_to100</td>
    <td>positioning_time_defensive_third</td>
  </tr>
  <tr>
    <td>boost_amount_overfill</td>
    <td>boost_time_full_boost</td>
    <td>positioning_time_infront_ball</td>
  </tr>
  <tr>
    <td>boost_amount_overfill_stolen</td>
    <td>boost_time_zero_boost</td>
    <td>positioning_time_neutral_third</td>
  </tr>
  <tr>
    <td>boost_amount_stolen</td>
    <td>demo_inflicted</td>
    <td>positioning_time_offensive_half</td>
  </tr>
  <tr>
    <td>boost_amount_stolen_big</td>
    <td>demo_taken</td>
    <td>positioning_time_offensive_third</td>
  </tr>
  <tr>
    <td>boost_amount_stolen_small</td>
    <td>goals</td>
    <td>saves</td>
  </tr>
  <tr>
    <td>boost_amount_used_while_supersonic</td>
    <td>movement_count_powerslide</td>
    <td>score</td>
  </tr>
  <tr>
    <td>boost_avg_amount</td>
    <td>movement_time_boost_speed</td>
    <td>shooting_percentage</td>
  </tr>
  <tr>
    <td>boost_bcpm</td>
    <td>movement_time_ground</td>
    <td>shots</td>
  </tr>
  <tr>
    <td>boost_bpm</td>
    <td>movement_time_high_air</td>
    <td>team_id</td>
  </tr>
  <tr>
    <td>boost_count_collected_big</td>
    <td>movement_time_low_air</td>
    <td>winner</td>
  </tr>
  <tr>
    <td>boost_count_collected_small</td>
    <td>movement_time_powerslide</td>
    <td></td>
  </tr>
</tbody>
</table>

In [3]:
game_variables = list()
team_variables = set()

for item in games.columns:
    str_list = item.split('_')
    if str_list[0] in {'blue', 'orange'}:
        team_variables.add('_'.join(str_list[1:]))
    else:
        game_variables.append(item)

team_variables = list(team_variables)

print('Game Variables:\n\t', game_variables)
print('Team Variables:\n\t', team_variables)

Game Variables:
	 ['created_at', 'updated_at', 'event_id', 'stage_id', 'match_id', 'id', 'octane_id', 'number', 'duration', 'date', 'overtime', 'flip_ballchasing', 'ballchasing', 'map_id', 'map_name']
Team Variables:
	 ['boost_time_boost25_to50', 'boost_count_collected_small', 'boost_count_stolen_big', 'positioning_time_offensive_third', 'boost_amount_stolen_small', 'boost_time_boost0_to25', 'movement_time_ground', 'movement_count_powerslide', 'team_id', 'goals', 'positioning_time_behind_ball', 'boost_bpm', 'shooting_percentage', 'boost_count_collected_big', 'movement_time_supersonic_speed', 'boost_time_zero_boost', 'score', 'positioning_time_infront_ball', 'boost_amount_stolen_big', 'boost_amount_overfill_stolen', 'boost_amount_collected_small', 'movement_time_boost_speed', 'boost_count_stolen_small', 'positioning_time_neutral_third', 'ball_time_in_side', 'boost_amount_collected_big', 'boost_amount_collected', 'positioning_time_offensive_half', 'winner', 'boost_time_boost50_to75', 'po

## Unify Two Teams

In [7]:
blue = games[game_variables + ['blue_' + item for item in team_variables]].copy()
blue.rename(columns={'blue_' + item: item for item in team_variables}, inplace=True)
blue['color'] = 'blue'

orange = games[game_variables + ['orange_' + item for item in team_variables]].copy()
orange.rename(columns={'orange_' + item: item for item in team_variables}, inplace=True)
orange['color'] = 'orange'

unified = pd.concat([blue, orange]).reset_index(drop=True)
del blue
del orange

unified

Unnamed: 0,created_at,updated_at,event_id,stage_id,match_id,id,octane_id,number,duration,date,...,boost_time_full_boost,movement_total_distance,movement_time_low_air,movement_time_high_air,positioning_time_defensive_third,movement_time_powerslide,shots,demo_taken,boost_bcpm,color
0,2022-11-28 11:41:33.407230,2022-11-28 11:41:33.407230,5f35882d53fbbb5894b43035,0,6043147591504896348ebd1f,6043147c91504896348ec0ed,860101.0,4,300.0,2015-09-08,...,,,,,,,10,,,blue
1,2022-11-28 11:41:33.407230,2022-11-28 11:41:33.407230,5f35882d53fbbb5894b43035,0,6043147591504896348ebd20,6043148391504896348ec527,860102.0,5,300.0,2015-09-08,...,,,,,,,11,,,blue
2,2022-11-28 11:41:38.805554,2022-11-28 11:41:38.805554,5f35882d53fbbb5894b43035,0,6043147591504896348ebd36,6043149791504896348ed008,860105.0,5,300.0,2015-09-08,...,,,,,,,3,,,blue
3,2022-11-28 11:41:38.805554,2022-11-28 11:41:38.805554,5f35882d53fbbb5894b43035,0,6043147591504896348ebd36,6043149791504896348ed01b,860105.0,6,300.0,2015-09-08,...,,,,,,,10,,,blue
4,2022-11-28 11:41:33.407230,2022-11-28 11:41:33.407230,5f35882d53fbbb5894b43035,0,6043147591504896348ebd20,6043148391504896348ec529,860102.0,6,300.0,2015-09-08,...,,,,,,,13,,,blue
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199089,2022-11-28 12:01:10.821867,2022-11-28 12:01:10.821867,621e6ee1da9d7ca1c7baa36c,0,6283e365da9d7ca1c7baf678,629efe6ada9d7ca1c7bb2246,,1,300.0,NaT,...,,,,,,,7,,,orange
199090,2022-11-28 12:01:10.821867,2022-11-28 12:01:10.821867,621e6ee1da9d7ca1c7baa36c,0,6283e365da9d7ca1c7baf678,629efe96da9d7ca1c7bb224d,,2,300.0,NaT,...,,,,,,,14,,,orange
199091,2022-11-28 12:01:10.821867,2022-11-28 12:01:10.821867,621e6ee1da9d7ca1c7baa36c,0,6283e365da9d7ca1c7baf678,629efec0da9d7ca1c7bb2254,,3,306.0,NaT,...,,,,,,,9,,,orange
199092,2022-11-28 12:00:22.187915,2022-11-28 12:00:22.187915,6284fe19c437fde7e02d7531,0,6284fe35c437fde7e02d7533,6284fee1c437fde7e02d7534,,1,300.0,NaT,...,,,,,,,13,,,orange


## Fix Missing Values

In [None]:
def fix_missing_col(team, miss_col):
    # Create df for complete and missing data
    team_df = unified.loc[unified['team_id']==team, :]
    complete = team_df.loc[team_df[miss_col].notna(), :]
    missing = team_df.loc[team_df[miss_col].isna(), :]

    # Exit if there are no complete rows for the team
    if len(complete) == 0: return None

    # Determine healthy variables to perform the fill
    X_cols = []
    for col in list(set(team_variables) - {miss_col, 'team_id', 'color'}):
        if team_df[col].count() == len(team_df): X_cols.append(col)

    # Fill missing values with linear regression
    reg = LR().fit(X=complete[X_cols], y=complete[miss_col])
    unified.loc[(unified['team_id']==team) & (team_df[miss_col].isna()), miss_col] = reg.predict(missing[X_cols])

In [33]:
# Columns to check
null_cols = list(set(team_variables) - {'team_id', 'color', 'winner'})

# Dispose rows with high missing rates
unified['winner'] = unified['winner'].fillna(False)
unified.loc[:, 'missing_count'] = unified[null_cols].isna().sum(axis=1)
print(f"{(unified['missing_count'] >= 41).sum()} rows deleted. Too many missing variables.")
unified = unified[unified['missing_count'] < 41]

# Perform the fill of missing values by team
for col in null_cols:
    if unified[col].count() == len(unified): continue
    teams = unified.loc[unified[col].isna(), 'team_id'].unique()
    for team in teams:
        fix_missing_col(team, col)

# Dispose of teams with no complete rows
unified.loc[:, 'missing_count'] = unified[null_cols].isna().sum(axis=1)
print(f"{(unified['missing_count'] > 0).sum()} rows for teams without a single complete rows.")
unified = unified[unified['missing_count'] == 0]

0 rows deleted. Too many missing variables.
0 rows for teams without a single complete rows.


## Histograms by team variables

In [26]:

variable = 'ball_possession_time'
title = f'Histogram of {variable} by team color.'

fig = go.Figure(layout=utils.layout_dict | dict(title=title))

fig.add_trace(go.Histogram(x=games['blue_' + variable], name='Blue', histnorm='probability'))
fig.add_trace(go.Histogram(x=games['orange_' + variable], name='Orange', histnorm='probability'))
fig.add_trace(go.Histogram(x=unified[variable], name='Unified', histnorm='probability'))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.5)
fig.show()

## Correlations

In [24]:
corr = unified[team_variables].corr(numeric_only=True)
fig = px.imshow(corr, zmin=-1, zmax=1)
fig.show(renderer='browser')

## Variable Groupings

In [50]:
groups = {
    'attack': {
        'vars': ['assists', 'ball_possession_time', 'goals', 'positioning_time_behind_ball', 
        'positioning_time_infront_ball', 'positioning_time_offensive_half', 'positioning_time_offensive_third',
        'shooting_percentage', 'shots'],
        'neg': ['positioning_time_infront_ball']
    },
    'defense': {
        'vars': ['ball_possession_time', 'ball_time_in_side', 'positioning_time_behind_ball', 'positioning_time_infront_ball',
		    'positioning_time_defensive_half', 'positioning_time_defensive_third', 'saves'],
        'neg': []
    },
    'movement': {
        'vars': ['boost_avg_amount',  'boost_bcpm', 'boost_bpm', 'boost_time_boost0_to25',
		    'boost_time_boost25_to50', 'boost_time_boost50_to75', 'boost_time_boost75_to100', 
		    'boost_time_full_boost', 'boost_time_zero_boost', 'movement_count_powerslide', 
		    'movement_time_boost_speed', 'movement_time_ground', 'movement_time_high_air', 
		    'movement_time_low_air', 'movement_time_powerslide', 'movement_time_slow_speed', 
		    'movement_time_supersonic_speed', 'movement_total_distance'],
        'neg': []
    },
    'boost_management': {
        'vars': ['boost_amount_collected', 'boost_amount_collected_big', 'boost_amount_collected_small',
            'boost_amount_overfill', 'boost_amount_overfill_stolen', 'boost_amount_stolen', 
            'boost_amount_stolen_big', 'boost_amount_stolen_small', 'boost_amount_used_while_supersonic', 
            'boost_avg_amount', 'boost_bcpm', 'boost_bpm', 'boost_count_collected_big',
            'boost_count_collected_small', 'boost_count_stolen_big', 'boost_count_stolen_small', 
            'boost_time_boost0_to25', 'boost_time_boost25_to50', 'boost_time_boost50_to75', 
            'boost_time_boost75_to100', 'boost_time_full_boost', 'boost_time_zero_boost'],
        'neg': []
    },
    'aggression': {
        'vars': ['boost_amount_stolen', 'boost_amount_stolen_big', 'boost_amount_stolen_small', 
            'boost_count_stolen_big', 'boost_count_stolen_small', 'demo_inflicted', 'demo_taken', 
            'positioning_time_offensive_half', 'positioning_time_offensive_third'],
        'neg': []
    }
}


pca = PCA(n_components=1)
attribute = 'attack'
# for attribute in groups.keys():
X = unified[['team_id'] + groups[attribute]['vars']].copy()
X = X.groupby('team_id', as_index=False).transform(lambda x: x.fillna(x.mean()))
X.dropna(inplace=True)
X
# for item in X.columns:
#     X[item] = X[['team_id', item]].groupby('team_id').transform(lambda x: x.fillna(x.mean()))
# for item in groups[attribute]['neg']:
#     X[item] = -X[item]
# unified[attribute] = pca.fit_transform(X)
# pca.fit_transform(X.drop(columns=['team_id']))

# unified

Unnamed: 0,team_id,assists,ball_possession_time,goals,positioning_time_behind_ball,positioning_time_infront_ball,positioning_time_offensive_half,positioning_time_offensive_third,shooting_percentage,shots
0,6020bc70f1e4807cc700242f,1,139.39,1,719.36000,231.12000,413.66998,236.28000,16.666667,6
1,6020bc70f1e4807cc700242f,0,143.65,0,705.83997,305.69998,339.83002,224.70999,0.000000,8
2,6020bc70f1e4807cc700242e,0,138.73,0,618.28000,323.31000,314.83002,185.38000,0.000000,7
3,6020bc70f1e4807cc700242f,2,128.06,2,674.35000,385.56000,389.96000,231.00000,50.000000,4
4,6020bc70f1e4807cc700242e,2,193.32,2,918.31006,365.66998,567.18000,347.70000,16.666667,12
...,...,...,...,...,...,...,...,...,...,...
199089,6020bc70f1e4807cc7002386,1,156.11,2,765.73004,241.31000,423.95000,256.49002,25.000000,8
199090,6020bc70f1e4807cc7002386,1,193.07,2,929.79990,437.59003,455.82000,269.20000,50.000000,4
199091,6020bc70f1e4807cc7002408,1,136.02,2,707.99000,289.90000,360.20000,202.98000,22.222222,9
199092,6020bc70f1e4807cc7002408,2,124.83,2,766.25000,312.96002,357.77997,204.07000,18.181818,11
