In [2]:
import pandas as pd
import os
import six
import pydot
import graphviz
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

**Test on binary encoded data with and without traits, label encoded data with and without traits**

In [None]:
match_data = pd.read_csv('../data/test_dataset_encoded.csv', delimiter='\t') #load in binary encoded data
match_data

Unnamed: 0,Champion 1_0,Champion 1_1,Champion 1_2,Champion 1_3,Champion 1_4,Champion 1_5,Champion 1_6,Level 1,Item 1 1_0,Item 1 1_1,...,Trait 7_3,Trait 7_4,Tier 7,Trait 8_0,Trait 8_1,Trait 8_2,Trait 8_3,Trait 8_4,Tier 8,Placement
0,0,0,0,0,0,0,1,2,0,0,...,0,1,3,0,0,0,0,1,0,2
1,0,0,0,0,0,1,0,2,0,0,...,1,0,0,0,0,0,0,1,0,3
2,0,0,0,0,0,1,1,2,0,0,...,1,0,0,0,0,0,0,1,0,7
3,0,0,0,0,1,0,0,2,0,0,...,1,0,0,0,0,0,0,1,0,5
4,0,0,0,0,0,1,0,2,0,0,...,1,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270891,0,0,0,0,0,0,1,2,0,0,...,0,1,3,0,0,0,0,1,0,3
270892,0,0,0,0,1,0,0,3,1,0,...,1,0,0,0,0,0,0,1,0,6
270893,0,0,0,0,1,0,1,2,0,0,...,1,0,0,0,0,0,0,1,0,7
270894,0,0,0,0,0,0,1,2,0,0,...,1,0,0,0,0,0,0,1,0,5


In [None]:
match_data.iloc[:, :344] #344 is index where traits begin

Unnamed: 0,Champion 1_0,Champion 1_1,Champion 1_2,Champion 1_3,Champion 1_4,Champion 1_5,Champion 1_6,Level 1,Item 1 1_0,Item 1 1_1,...,Item 11 2_4,Item 11 2_5,Item 11 2_6,Item 11 3_0,Item 11 3_1,Item 11 3_2,Item 11 3_3,Item 11 3_4,Item 11 3_5,Item 11 3_6
0,0,0,0,0,0,0,1,2,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,1,0,2,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,1,1,2,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0,0,0,0,1,0,0,2,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,1,0,2,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270891,0,0,0,0,0,0,1,2,0,0,...,0,0,1,0,0,0,0,0,0,1
270892,0,0,0,0,1,0,0,3,1,0,...,0,0,1,0,0,0,0,0,0,1
270893,0,0,0,0,1,0,1,2,0,0,...,0,0,1,0,0,0,0,0,0,1
270894,0,0,0,0,0,0,1,2,0,0,...,0,0,1,0,0,0,0,0,0,1


In [None]:
X_train, X_test, y_train, y_test = train_test_split(match_data.iloc[:, :-1], match_data['Placement'], test_size=0.2, random_state=23)
model = RandomForestClassifier(n_estimators=20, random_state=23)
cv_scores = cross_val_score(model, X_train, y_train, cv=5)

#print cross-validation scores for each fold
print(f'Cross-validation scores for each fold: {cv_scores}')

#print the average cross-validation score
print(f'Average cross-validation score: {cv_scores.mean()}')

#train the model on the full training set
model.fit(X_train, y_train)

#make predictions on the test set
y_pred = model.predict(X_test)

#evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test set accuracy: {test_accuracy}')

Cross-validation scores for each fold: [0.2630814  0.26188773 0.26251067 0.2608495  0.2608495 ]
Average cross-validation score: 0.26183576087767185
Test set accuracy: 0.26690660760428203


In [None]:
X_train_traitless, X_test_traitless, y_train_traitless, y_test_traitless = train_test_split(match_data.iloc[:, :344], match_data['Placement'], test_size=0.2, random_state=23)
model_traitless = RandomForestClassifier(n_estimators=20, random_state=23)
cv_scores_traitless = cross_val_score(model_traitless, X_train_traitless, y_train_traitless, cv=5)

#print cross-validation scores for each fold
print(f'Cross-validation scores for each fold: {cv_scores_traitless}')

#print the average cross-validation score
print(f'Average cross-validation score: {cv_scores_traitless.mean()}')

#train the model on the full training set
model_traitless.fit(X_train_traitless, y_train_traitless)

#make predictions on the test set
y_pred_traitless = model_traitless.predict(X_test_traitless)

#evaluate the model on the test set
test_accuracy_traitless = accuracy_score(y_test_traitless, y_pred_traitless)
print(f'Test set accuracy: {test_accuracy_traitless}')

Cross-validation scores for each fold: [0.26271226 0.26389498 0.26449484 0.26221074 0.2632259 ]
Average cross-validation score: 0.2633077419087029
Test set accuracy: 0.2639719453672942


In [None]:
label_data = pd.read_csv('../data/test_dataset_label_encoded.csv', delimiter='\t') #read in label encoded data
label_data

Unnamed: 0,Champion 1,Level 1,Item 1 1,Item 1 2,Item 1 3,Champion 2,Level 2,Item 2 1,Item 2 2,Item 2 3,...,Tier 4,Trait 5,Tier 5,Trait 6,Tier 6,Trait 7,Tier 7,Trait 8,Tier 8,Placement
0,6,2,0,0,0,51,1,0,0,0,...,1,10,1,8,1,7,3,0,0,2
1,62,2,0,0,0,50,3,142,142,37,...,1,13,1,0,0,0,0,0,0,3
2,36,2,0,0,0,58,2,0,0,0,...,3,0,0,0,0,0,0,0,0,7
3,48,2,0,0,0,25,2,0,0,0,...,0,0,0,0,0,0,0,0,0,5
4,62,2,157,0,0,58,2,0,0,0,...,1,15,3,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270891,6,2,149,0,0,51,2,140,0,0,...,1,10,1,8,1,7,3,0,0,3
270892,48,3,15,159,0,43,1,0,0,0,...,1,0,0,0,0,0,0,0,0,6
270893,25,2,0,0,0,31,2,0,0,0,...,3,0,0,0,0,0,0,0,0,7
270894,6,2,0,0,0,51,1,130,0,0,...,1,13,3,0,0,0,0,0,0,5


In [None]:
label_data.iloc[:, :55] #55 is column where traits start

Unnamed: 0,Champion 1,Level 1,Item 1 1,Item 1 2,Item 1 3,Champion 2,Level 2,Item 2 1,Item 2 2,Item 2 3,...,Champion 10,Level 10,Item 10 1,Item 10 2,Item 10 3,Champion 11,Level 11,Item 11 1,Item 11 2,Item 11 3
0,6,2,0,0,0,51,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,62,2,0,0,0,50,3,142,142,37,...,0,0,0,0,0,0,0,0,0,0
2,36,2,0,0,0,58,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,48,2,0,0,0,25,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,62,2,157,0,0,58,2,0,0,0,...,34,1,0,0,0,44,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270891,6,2,149,0,0,51,2,140,0,0,...,0,0,0,0,0,0,0,0,0,0
270892,48,3,15,159,0,43,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
270893,25,2,0,0,0,31,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
270894,6,2,0,0,0,51,1,130,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
X_train_label, X_test_label, y_train_label, y_test_label = train_test_split(label_data.iloc[:, :55], label_data['Placement'], test_size=0.2, random_state=23)
label_model = RandomForestClassifier(n_estimators=20, random_state=23)
label_cv_scores = cross_val_score(label_model, X_train_label, y_train_label, cv=5)

#print cross-validation scores for each fold
print(f'Cross-validation scores for each fold: {label_cv_scores}')

#print the average cross-validation score
print(f'Average cross-validation score: {label_cv_scores.mean()}')

#train the model on the full training set
label_model.fit(X_train_label, y_train_label)

#make predictions on the test set
y_pred_label = label_model.predict(X_test_label)

#evaluate the model on the test set
test_accuracy_label = accuracy_score(y_test_label, y_pred_label)
print(f'Test set accuracy: {test_accuracy_label}')

Cross-validation scores for each fold: [0.27625508 0.27603073 0.27699975 0.27563851 0.27014743]
Average cross-validation score: 0.27501429870993854
Test set accuracy: 0.2763935031376892


In [None]:
X_train_label_traitless, X_test_label_traitless, y_train_label_traitless, y_test_label_traitless = train_test_split(label_data.iloc[:, :-1], label_data['Placement'], test_size=0.2, random_state=23)
label_model_traitless = RandomForestClassifier(n_estimators=20, random_state=23)
label_cv_scores_traitless = cross_val_score(label_model_traitless, X_train_label_traitless, y_train_label_traitless, cv=5)

#print cross-validation scores for each fold
print(f'Cross-validation scores for each fold: {label_cv_scores_traitless}')

#print the average cross-validation score
print(f'Average cross-validation score: {label_cv_scores_traitless.mean()}')

#train the model on the full training set
label_model_traitless.fit(X_train_label_traitless, y_train_label_traitless)

#make predictions on the test set
y_pred_label_traitless = label_model_traitless.predict(X_test_label_traitless)

#evaluate the model on the test set
test_accuracy_label_traitless = accuracy_score(y_test_label_traitless, y_pred_label_traitless)
print(f'Test set accuracy: {test_accuracy_label_traitless}')

Cross-validation scores for each fold: [0.27399409 0.28002215 0.274831   0.27446185 0.27420806]
Average cross-validation score: 0.27550343080038864
Test set accuracy: 0.2750645994832041


**Label with traits has best test set accuracy but label without traits has best cross-validation average**