In [2]:
# baseline -> num counter picks, num counter bans, side
# final -> num counter picks, num counter bans, pga, side, champ wr, team wr


In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

import pandas as pd

In [4]:
# Load the prepared dataset; the path is relative to the notebook's working directory.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns shown in the output look
# like index columns written out by earlier to_csv calls — confirm and drop upstream.
csv_path = 'complete_df.csv'
df = pd.read_csv(csv_path)

# Bare last expression so the notebook renders the frame.
df


Unnamed: 0.1,Unnamed: 0,year,gameid,league,teamname,side,ban1,ban2,ban3,ban4,...,pick3,pick4,pick5,num_counters_picked,num_counters_banned,PGA,higher_PGA,mean_champ_wr,mean_team_wr,result
0,0,2017,1506-1540,LPL,I May,Blue,Syndra,Malzahar,Ashe,Karma,...,Cassiopeia,Varus,Tahm Kench,0,2,2,1,0.508666,0.438776,1
1,1,2017,1506-1540,LPL,Royal Never Give Up,Red,Camille,Rengar,Zyra,Elise,...,Ryze,Caitlyn,Nautilus,0,0,0,0,0.493086,0.598582,0
2,2,2017,1506-1541,LPL,I May,Blue,Syndra,Malzahar,Ashe,Rek'Sai,...,Corki,Caitlyn,Thresh,0,0,0,0,0.510833,0.438776,1
3,3,2017,1506-1541,LPL,Royal Never Give Up,Red,Rengar,Camille,Varus,Cassiopeia,...,Ryze,Jhin,Zyra,1,0,1,1,0.492558,0.598582,0
4,4,2017,1507-1544,LPL,Invictus Gaming,Blue,Jayce,Elise,Malzahar,Kha'Zix,...,LeBlanc,Varus,Tahm Kench,0,1,1,0,0.492380,0.523126,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88345,88345,2025,LOLTMNT06_96854,LFL,Vitality.Bee,Red,Skarner,Viktor,Maokai,Rell,...,Sylas,Varus,Nautilus,1,1,2,1,0.497897,0.614958,1
88346,88346,2025,LOLTMNT06_96867,LFL2,Zerance,Blue,Aurora,Karthus,Kayn,Galio,...,Sylas,Corki,Nautilus,1,2,3,1,0.501714,0.282609,0
88347,88347,2025,LOLTMNT06_96867,LFL2,Project Conquerors,Red,Viktor,K'Sante,Ivern,Jayce,...,Ambessa,Varus,Rell,2,0,2,0,0.509128,0.511111,1
88348,88348,2025,LOLTMNT06_96906,LFL2,IZI Dream,Blue,Akali,Rell,Varus,Skarner,...,Corki,Kai'Sa,Rakan,1,2,3,1,0.516646,0.504587,1


# Baseline Model = Decision Tree on Num counters, Num counters_banned, Side

In [8]:
# Baseline: decision tree on the two counter counts plus the team's side.
baseline_features = ['num_counters_picked', 'num_counters_banned', 'side']

# Seeded split (default train/test proportions) so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[baseline_features],
    df['result'],
    random_state=1,
)

# 'side' is the only column that gets encoded; every other selected column is
# forwarded to the classifier unchanged via remainder='passthrough'.
side_encoding = ('side', OneHotEncoder(), ['side'])
preproc = ColumnTransformer(transformers=[side_encoding], remainder='passthrough')

pipeline = Pipeline(steps=[
    ('preprocessor', preproc),
    # The criterion set here is only a starting value; the grid below searches over it.
    ('classifier', DecisionTreeClassifier(criterion='entropy', random_state=1)),
])

# Keys use the '<step>__<param>' form so they reach the classifier step.
hyperparameters = dict(
    classifier__max_depth=[2, 3, 4, 5, 7, 10, 13, 15, 18, None],
    classifier__min_samples_split=[2, 5, 10, 20, 50, 100, 200],
    classifier__criterion=['entropy', 'gini'],
)

# 5-fold cross-validated grid search, ranked by accuracy, using all cores.
searcher = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
searcher.fit(X_train, y_train)

# score() evaluates the refit best pipeline with the search's scoring metric.
training_accuracy = searcher.score(X_train, y_train)
testing_accuracy = searcher.score(X_test, y_test)

for label, value in (
    ("Best Parameters:", searcher.best_params_),
    ("Training Accuracy:", training_accuracy),
    ("Testing Accuracy:", testing_accuracy),
):
    print(label, value)

Best Parameters: {'classifier__criterion': 'entropy', 'classifier__max_depth': 3, 'classifier__min_samples_split': 2}
Training Accuracy: 0.5310736168543057
Testing Accuracy: 0.5291561028612821


# Improved Baseline Model = Random Forest Classifier on Num counters, Num counters_banned, Side

In [10]:
# Same three inputs as the decision-tree baseline, now fed to a random forest.
baseline_inputs = df[['num_counters_picked', 'num_counters_banned', 'side']]
outcome = df['result']

# Seeded split so this cell sees the same kind of reproducible partition.
X_train, X_test, y_train, y_test = train_test_split(baseline_inputs, outcome, random_state=1)

# One-hot encode 'side'; the remaining selected columns pass through untouched.
preproc = ColumnTransformer(
    remainder='passthrough',
    transformers=[('side', OneHotEncoder(), ['side'])],
)

forest = RandomForestClassifier(random_state=1, n_jobs=-1)
pipeline = Pipeline([('preprocessor', preproc), ('classifier', forest)])

# Search space for the forest; keys are routed to the 'classifier' pipeline step.
hyperparameters = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [5, 10, 15, 20, None],
    'classifier__min_samples_split': [2, 5, 10, 20],
    'classifier__n_estimators': [50, 100, 200],  # number of trees
}

# 5-fold grid search scored on accuracy.
searcher = GridSearchCV(pipeline, hyperparameters, scoring='accuracy', n_jobs=-1, cv=5)
searcher.fit(X_train, y_train)

# Accuracy of the refit best pipeline on the training and held-out partitions.
training_accuracy = searcher.score(X_train, y_train)
testing_accuracy = searcher.score(X_test, y_test)

print("Best Parameters:", searcher.best_params_)
print("Training Accuracy:", training_accuracy)
print("Testing Accuracy:", testing_accuracy)

Best Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 5, 'classifier__min_samples_split': 20, 'classifier__n_estimators': 200}
Training Accuracy: 0.5314659986115723
Testing Accuracy: 0.5291108294096342


# Final Model - Random Forest on Num Counter Picks, Num Counters Banned, Side, Champ Mean WR, Team Mean WR (PGA and Higher_PGA are not included in this fit)

In [15]:
# Feature set for this fit: counter counts, side and the two mean win-rate
# columns. PGA and higher_PGA are not selected here.
final_features = [
    'num_counters_picked',
    'num_counters_banned',
    'side',
    'mean_team_wr',
    'mean_champ_wr',
]

# Seeded split so the partition is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df[final_features],
    df['result'],
    random_state=1,
)

# Encode 'side' as one-hot columns; numeric inputs are passed through as-is.
side_encoding = ('side', OneHotEncoder(), ['side'])
preproc = ColumnTransformer(transformers=[side_encoding], remainder='passthrough')

pipeline = Pipeline(steps=[
    ('preprocessor', preproc),
    ('classifier', RandomForestClassifier(random_state=1, n_jobs=-1)),
])

# 6 * 5 * 4 * 2 = 240 candidate settings for the forest.
hyperparameters = dict(
    classifier__n_estimators=[100, 120, 140, 160, 180, 200],  # number of trees
    classifier__max_depth=[5, 10, 15, 20, None],
    classifier__min_samples_split=[2, 5, 10, 20],
    classifier__criterion=['gini', 'entropy'],
)

# 5-fold grid search on accuracy; verbose=2 logs progress for each fit.
searcher = GridSearchCV(
    estimator=pipeline,
    param_grid=hyperparameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
searcher.fit(X_train, y_train)

# The search refits the winning configuration on the full training split;
# keep a handle to that fitted pipeline.
best_model = searcher.best_estimator_

# Pipeline.score delegates to the classifier's score, i.e. mean accuracy.
training_accuracy = best_model.score(X_train, y_train)
testing_accuracy = best_model.score(X_test, y_test)

for label, value in (
    ("Best Parameters:", searcher.best_params_),
    ("Training Accuracy:", training_accuracy),
    ("Testing Accuracy:", testing_accuracy),
):
    print(label, value)


Fitting 5 folds for each of 240 candidates, totalling 1200 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 5, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Training Accuracy: 0.5996800579517673
Testing Accuracy: 0.5995110467222021


# Given that team wr has a large feature importance, let's try removing it in case it's overwhelming the model

In [43]:
# Ablation: random forest on every feature except the team win rate.
# Column names follow the frame loaded from complete_df.csv
# ('num_counters_picked', 'mean_champ_wr'). The earlier names 'num_counters'
# and 'mean_wr' do not appear in that frame, so selecting them raises a
# KeyError when the notebook is run top to bottom on a fresh kernel.
ablation_features = [
    'num_counters_picked',
    'num_counters_banned',
    'side',
    'PGA',
    'higher_PGA',
    'mean_champ_wr',
]

# Seeded split so the partition is reproducible across runs.
X_train, X_test, y_train, y_test = (
    train_test_split(df[ablation_features], df['result'], random_state=1)
)

# One-hot encode 'side'; all other selected columns pass through unchanged.
preproc = ColumnTransformer(
    transformers=[
        ('side', OneHotEncoder(), ['side'])
    ],
    remainder='passthrough',
)

pipeline = Pipeline([
    ('preprocessor', preproc),
    ('classifier', RandomForestClassifier(random_state=1, n_jobs=-1))
])

# 3 * 5 * 4 * 2 = 120 candidate settings, routed to the 'classifier' step.
hyperparameters = {
    'classifier__n_estimators': [50, 100, 200],  # Number of trees
    'classifier__max_depth': [5, 10, 15, 20, None],
    'classifier__min_samples_split': [2, 5, 10, 20],
    'classifier__criterion': ['gini', 'entropy']
}

# 5-fold grid search on accuracy; verbose=2 logs progress for each fit.
searcher = GridSearchCV(pipeline, hyperparameters, cv=5, n_jobs=-1, scoring='accuracy', verbose=2)

# Fit Model
searcher.fit(X_train, y_train)

# Get Best Model (refit on the whole training split by the search)
best_model = searcher.best_estimator_

# Accuracy Scores
training_accuracy = best_model.score(X_train, y_train)
testing_accuracy = best_model.score(X_test, y_test)

# Display Results
print("Best Parameters:", searcher.best_params_)
print("Training Accuracy:", training_accuracy)
print("Testing Accuracy:", testing_accuracy)


Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best Parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 5, 'classifier__min_samples_split': 20, 'classifier__n_estimators': 200}
Training Accuracy: 0.5477114756110659
Testing Accuracy: 0.5376294591484465


# Since the accuracy decreased, we know that was not the case. Let's try logistic regression.

In [47]:
# Imports this cell needs beyond the notebook's top import cell, grouped at
# the start of the cell instead of being scattered through it.
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Column names follow the frame loaded from complete_df.csv. The earlier names
# ('num_counters', 'mean_wr', 'team_wr') do not appear in that frame, so
# selecting them raises a KeyError on a fresh top-to-bottom run.
categorical_features = ['side']
numeric_features = [
    'num_counters_picked',
    'num_counters_banned',
    'PGA',
    'higher_PGA',
    'mean_champ_wr',
    'mean_team_wr',
]

# Define features and target
X = df[[
    'num_counters_picked',
    'num_counters_banned',
    'side',
    'PGA',
    'higher_PGA',
    'mean_champ_wr',
    'mean_team_wr',
]]
y = df['result']

# Split into training and testing data (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Preprocessing: numeric columns are mean-imputed then standardised, because
# the regularised linear model is sensitive to feature scale; 'side' is
# mode-imputed then one-hot encoded. Columns not listed here are dropped
# (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),  # Fill NaN with mean
            ('scaler', StandardScaler())
        ]), numeric_features),

        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill NaN with most common category
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

# Create pipeline. random_state is fixed to match the seeding used by the
# other model cells; the liblinear solver shuffles the data internally.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=1))
])

# Hyperparameter tuning
hyperparameters = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],  # Inverse regularization strength (smaller = stronger)
    'classifier__solver': ['lbfgs', 'liblinear']
}

# 5-fold grid search scored on accuracy, using all cores.
searcher = GridSearchCV(pipeline, hyperparameters, cv=5, n_jobs=-1, scoring='accuracy')

# Train model
searcher.fit(X_train, y_train)

# Evaluate model (score() uses the refit best pipeline and the search's metric)
training_accuracy = searcher.score(X_train, y_train)
testing_accuracy = searcher.score(X_test, y_test)

# Print results
print("Best Parameters:", searcher.best_params_)
print("Training Accuracy:", training_accuracy)
print("Testing Accuracy:", testing_accuracy)



Best Parameters: {'classifier__C': 0.01, 'classifier__solver': 'liblinear'}
Training Accuracy: 0.5999416936461418
Testing Accuracy: 0.6028998849252014
