In [102]:
# Package Import
import pandas as pd
import numpy as np
from plotly import express as px

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV




In [103]:
# Data Import
matches21 = pd.read_csv("Data/2021_esports.csv", low_memory=False)
matches22 = pd.read_csv("Data/2022_esports.csv", low_memory=False)
matches23 = pd.read_csv("Data/2023_esports.csv", low_memory=False)

In [104]:
%%capture --no-stdout
# Data Cleaning

def clean(df):

    # Select valid observations
    df = df[df['position'] == 'team']

    # Select features
    df = df[['gameid', 'position', 'league', 'side', 'teamname', \
             'goldat10', 'xpat10', 'csat10', 'killsat10', \
             'opp_goldat10', 'opp_xpat10', 'opp_csat10', 'opp_killsat10', \
             'goldat15', 'xpat15', 'csat15', 'killsat15', \
             'opp_goldat15', 'opp_xpat15', 'opp_csat15', 'opp_killsat15', 'firstdragon', 'result']]
    
    # Compute new features
    df['goldat10pct'] = (df['goldat10'] - df['opp_goldat10']) / df['goldat10']
    df['xpat10pct'] = (df['xpat10'] - df['opp_xpat10']) / df['xpat10']
    df['csat10pct'] = (df['csat10'] - df['opp_csat10']) / df['csat10']
    df['goldat15pct'] = (df['goldat15'] - df['opp_goldat15']) / df['goldat15']
    df['xpat15pct'] = (df['xpat15'] - df['opp_xpat15']) / df['xpat15']
    df['csat15pct'] = (df['csat15'] - df['opp_csat15']) / df['csat15']
    
    return df

matches21 = clean(matches21)
matches22 = clean(matches22)
matches23 = clean(matches23)


In [130]:
# Interesting findings & graphs
plot_xp10pct = px.scatter(x=matches21['xpat10pct'], y=matches21['result'], title='xp10pct vs result')
plot_xp15pct = px.scatter(x=matches21['xpat15pct'], y=matches21['result'], title='xp15pct vs result')
plot_gold10 = px.scatter(x=matches21['goldat10'], y=matches21['result'], title='goldat10 vs result')
plot_gold15 = px.scatter(x=matches21['goldat15'], y=matches21['result'], title='goldat15 vs result')

dragons = matches21.dropna()
dragons = dragons[dragons['firstdragon'] == 1][['firstdragon', 'result']]
plot_dragon = px.pie(dragons, values='firstdragon', names='result', title='winrate with first dragon')

plot_xp10pct.write_html('xp10pct.html', include_plotlyjs='cdn')
plot_xp15pct.write_html('xp15pct.html', include_plotlyjs='cdn')
plot_gold10.write_html('gold10.html', include_plotlyjs='cdn')
plot_gold15.write_html('gold15.html', include_plotlyjs='cdn')
plot_dragon.write_html('dragon.html', include_plotlyjs='cdn')

In [49]:
# Combine dataframes
train_df = pd.concat([matches21, matches22, matches23[matches23['league'] != 'WLDS']], ignore_index=True).dropna()
test_df = matches23[matches23['league'] == 'WLDs'].dropna()

# Drop insignificant observations
game_cnt = train_df.groupby("teamname").count()['gameid']
teams_to_drop = game_cnt[game_cnt < 15].index
def keep(team):
    if team in teams_to_drop:
        return False
    return True
train_df['keep'] = train_df['teamname'].apply(keep)
train_df = train_df[train_df['keep']].drop(columns=['keep'])

# Train Test Split
train_X = train_df.drop(columns=['result'])
train_y = train_df['result']

test_X = test_df.drop(columns=['result'])
test_y = test_df['result']

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y, random_state=42)

In [84]:
# Define custom transformer for dropping columns
# Drop teams with less than 10 games
class Dropper():
    def __init__(self, columns):
        self.columns = columns

    def transform(self, X):
        return X.drop(self.columns, axis=1)

    def fit(self, X, y=None):
        return self 
    
# Define baseline model
baseline = Pipeline([
    ('ColumnDrop', Dropper(['side', 'gameid', 'position', 'league', 'teamname'])),
    ('Model', LogisticRegression(max_iter=3000, C=0.1, solver='saga', random_state=42))
])


In [85]:
# Model Training
baseline.fit(X_train, y_train)


The max_iter was reached which means the coef_ did not converge



In [86]:
# Test set scoring
baseline.score(X_test, y_test)

0.7525904992598573

In [53]:
# Worlds set scoring
baseline.score(test_X, test_y)

0.7258064516129032

In [91]:
# Ensemble model construction
model = Pipeline([
    ('ColumnDrop', Dropper(['side', 'gameid', 'position', 'league', 'teamname'])),
    ('Model', RandomForestClassifier(random_state=42))
])

hyper_param = {
    'Model__n_estimators': list(range(50, 200, 50)),
    'Model__max_depth': list(range(7, 20, 2))
}

In [92]:
# Best hyperparam search
cv = GridSearchCV(model, hyper_param, cv=5)
cv.fit(X_train, y_train)

In [121]:
cv.best_params_

{'Model__max_depth': 7, 'Model__n_estimators': 150}

In [122]:
model = cv.best_estimator_

In [123]:
# Test set scoring
model.score(X_test, y_test)

0.7548782128919391

In [124]:
# Worlds set scoring
model.score(test_X, test_y)

0.7338709677419355

In [97]:
# Define column transformer for encoding and dropping columns
encoder = ColumnTransformer(
    transformers=[
        ('OneHot', OneHotEncoder(), ['teamname'])
    ], 
    remainder='passthrough'
)

# Final model construction
model_best = Pipeline([
    ('ColumnDrop', Dropper(['side', 'gameid', 'position', 'league'])),
    ('TextEncoding', encoder),
    ('Model', RandomForestClassifier(random_state=42))
])

model_best.fit(X_train, y_train)

In [98]:
# Test set scoring
model_best.score(X_test, y_test)

0.7531287848203472

In [99]:
# Worlds set scoring
model_best.score(test_X, test_y)

0.9475806451612904