In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Image
from statsmodels.stats.outliers_influence import variance_inflation_factor

import plotly.express as px

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics

from matplotlib import pyplot

sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Location of the ranked-games CSV. Set the LOL_DATA_PATH environment variable
# to run the notebook on another machine; the fallback is the original
# hard-coded path, so existing setups keep working unchanged.
LOL_DATA_PATH = os.environ.get(
    "LOL_DATA_PATH",
    "C:/Users/jose.ramos/Documents/python/TCC/input/high_diamond_ranked_10min.csv",
)
lol_data = pd.read_csv(LOL_DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
lol_data.head()

In [None]:
# Print the frame summary (column names, dtypes and non-null counts).
lol_data.info()

In [None]:
# (number of rows, number of columns) of the raw frame.
lol_data.shape

In [None]:
# Descriptive statistics per column (pandas defaults).
lol_data.describe()

In [None]:
# Pairwise correlation between the columns of the raw frame, drawn as a
# heatmap; text_auto writes each coefficient inside its cell.
correlation_matrix = lol_data.corr()
px.imshow(correlation_matrix, text_auto=True, aspect="auto")

In [None]:
# Columns excluded from the modelling frame: the match identifier and three
# red-side columns (presumably mirrors of blue-side columns - TODO confirm).
columns_to_drop = ['gameId', 'redGoldDiff', 'redExperienceDiff', 'redFirstBlood']
lol_data_final = lol_data.drop(columns=columns_to_drop)

### Criação das bases


In [None]:
# Separate the target column (blueWins) from the remaining feature columns,
# convert both to arrays, and hold out 30% of the rows as a test split.
feature_frame = lol_data_final.drop(columns=['blueWins'])
X_lol, Y_lol = feature_frame.values, lol_data_final['blueWins'].values
X_train_lol, X_test_lol, Y_train_lol, Y_test_lol = train_test_split(
    X_lol,
    Y_lol,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Cross-validation splitter reused by every model below: 10 folds repeated
# 3 times, seeded so each scoring call iterates over the same splits.
cv = RepeatedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=42,
)

Para a criação do modelo de machine learning, foram utilizados os seguintes algoritmos para predizer o resultado das partidas:

- Regressão Logística
- Árvore de decisão
- Floresta aleatória

Para avaliar os modelos, foram utilizados o F1 Score e a ROC AUC, estimados por meio de Repeated K-Fold Cross Validation.

In [None]:
# Logistic regression trained on the training split. fit() returns the
# estimator itself, so construction and training are chained.
lol_log_model = LogisticRegression(
    max_iter=10000,
    random_state=42,
).fit(X_train_lol, Y_train_lol)
lol_log_model  # show the fitted estimator as the cell output

In [None]:
# Cross-validated F1 and ROC AUC of the logistic regression on the training
# split. cross_validate computes both metrics from a single fit per fold,
# instead of refitting every fold once per metric as two cross_val_score
# calls would. The splitter and the estimator are both seeded, so the scores
# should match the two-call version.
lol_log_cv_results = cross_validate(
    lol_log_model,
    X_train_lol,
    Y_train_lol,
    scoring=("f1", "roc_auc"),
    cv=cv,
    n_jobs=-1,
)
lol_log_f1scores = lol_log_cv_results["test_f1"]
lol_log_ROCscores = lol_log_cv_results["test_roc_auc"]

In [None]:
# Mean (standard deviation) of the cross-validated logistic regression scores.
for metric_label, metric_scores in (('F1', lol_log_f1scores), ('ROC', lol_log_ROCscores)):
    print(f'{metric_label}: {np.mean(metric_scores):.3f} ({np.std(metric_scores):.3f})')

In [None]:
# Entropy-based decision tree trained on the training split. fit() returns
# the estimator itself, so construction and training are chained.
lol_dt_model = DecisionTreeClassifier(
    criterion='entropy',
    random_state=42,
).fit(X_train_lol, Y_train_lol)
lol_dt_model  # show the fitted estimator as the cell output

In [None]:
# Cross-validated F1 and ROC AUC of the decision tree on the training split.
# cross_validate computes both metrics from a single fit per fold, instead of
# refitting every fold once per metric as two cross_val_score calls would.
# The splitter and the estimator are both seeded, so the scores should match
# the two-call version.
lol_dt_cv_results = cross_validate(
    lol_dt_model,
    X_train_lol,
    Y_train_lol,
    scoring=("f1", "roc_auc"),
    cv=cv,
    n_jobs=-1,
)
lol_dt_f1scores = lol_dt_cv_results["test_f1"]
lol_dt_ROCscores = lol_dt_cv_results["test_roc_auc"]

In [None]:
# Mean (standard deviation) of the cross-validated decision tree scores.
for metric_label, metric_scores in (('F1', lol_dt_f1scores), ('ROC', lol_dt_ROCscores)):
    print(f'{metric_label}: {np.mean(metric_scores):.3f} ({np.std(metric_scores):.3f})')

In [None]:
# Entropy-based random forest trained on the training split. fit() returns
# the estimator itself, so construction and training are chained.
lol_rf_model = RandomForestClassifier(
    criterion='entropy',
    random_state=42,
).fit(X_train_lol, Y_train_lol)
lol_rf_model  # show the fitted estimator as the cell output

In [None]:
# Cross-validated F1 and ROC AUC of the random forest on the training split.
# cross_validate computes both metrics from a single fit per fold, instead of
# refitting every fold once per metric as two cross_val_score calls would.
# The splitter and the estimator are both seeded, so the scores should match
# the two-call version.
lol_rf_cv_results = cross_validate(
    lol_rf_model,
    X_train_lol,
    Y_train_lol,
    scoring=("f1", "roc_auc"),
    cv=cv,
    n_jobs=-1,
)
lol_rf_f1scores = lol_rf_cv_results["test_f1"]
lol_rf_ROCscores = lol_rf_cv_results["test_roc_auc"]

In [None]:
# Mean (standard deviation) of the cross-validated random forest scores.
for metric_label, metric_scores in (('F1', lol_rf_f1scores), ('ROC', lol_rf_ROCscores)):
    print(f'{metric_label}: {np.mean(metric_scores):.3f} ({np.std(metric_scores):.3f})')



### Resultados comparados

In [None]:
# Side-by-side summary of the cross-validated scores of the three models:
# mean (standard deviation) of F1 and ROC AUC over all folds and repeats.
# A list of tuples keeps the print order explicit.
model_cv_scores = [
    ('Logistic Regression', lol_log_f1scores, lol_log_ROCscores),
    ('Decision Tree', lol_dt_f1scores, lol_dt_ROCscores),
    ('Random Forest', lol_rf_f1scores, lol_rf_ROCscores),
]
for model_name, f1_scores, roc_scores in model_cv_scores:
    print(model_name)
    print('F1: %.3f (%.3f)' % (np.mean(f1_scores), np.std(f1_scores)))
    print('ROC: %.3f (%.3f)' % (np.mean(roc_scores), np.std(roc_scores)))

In [None]:
# Class predictions of the fitted logistic regression on the held-out test split.
y_lol_predict = lol_log_model.predict(X_test_lol)

In [None]:
# Held-out test metrics for the logistic regression.
# Accuracy and F1 are computed from the hard class predictions. ROC AUC needs
# a continuous score to rank the samples, so it uses the predicted probability
# of the positive class; passing the 0/1 predictions would collapse the curve
# to a single threshold and would not be comparable with the cross-validated
# "roc_auc" scores reported earlier.
y_lol_proba = lol_log_model.predict_proba(X_test_lol)[:, 1]
print("Accuracy Score", metrics.accuracy_score(Y_test_lol, y_lol_predict))
print("F1 Score", metrics.f1_score(Y_test_lol, y_lol_predict))
print("ROC-AUC Score", metrics.roc_auc_score(Y_test_lol, y_lol_proba))

### Importância

In [None]:
# Per-feature importance reported by each fitted model, indexed by feature
# name and ordered by the logistic-regression column for plotting.
# NOTE(review): the logistic regression is fitted on unscaled features, so the
# magnitude of each |coefficient| depends on that feature's units; the column
# is not directly comparable with the tree-based importances.
feature_names = lol_data_final.drop(columns=['blueWins']).columns
support_feature_importance = pd.DataFrame(
    {
        'Random Forest': lol_rf_model.feature_importances_,
        'Decision Tree': lol_dt_model.feature_importances_,
        'Logistic Regression': np.abs(lol_log_model.coef_[0]),
    },
    index=feature_names,
).sort_values(by='Logistic Regression', ascending=True)

ax = support_feature_importance.plot(
    kind='barh', figsize=(12, 10), width=.85, colormap='Paired', fontsize=15
)
ax.set_title('Feature importance by model', fontsize=15)
ax.set_xlabel('Importance / |coefficient|', fontsize=15)
ax.set_ylabel('Feature', fontsize=15);