# Pré-processamento e Testes Dataset Copa

O dataset aqui utilizado foi retirado do [FIFA World Men's Ranking](https://www.fifa.com/fifa-world-ranking/ranking-table/men/index.html). Pretende-se utilizar a posição no ranking de determinada seleção nos 5 meses anteriores à Copa do Mundo e a sua classificação nos anos de 2002, 2006, 2010 e 2014 para prever a classificação na Copa do Mundo de 2018.

In [34]:
#imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

In [330]:
# Load the training dataset and keep only the predictor columns, i.e.
# everything except the country, year and classification columns.
df_total = pd.read_csv("dataset_rank_fifa.csv")
non_feature_cols = ['country', 'year', 'classification']
df_pred = df_total.drop(columns=non_feature_cols)

## Pré-processamento
### Separando o dataset em anos

Vamos separar o dataset por ano para então aplicar o `StandardScaler` a cada ano individualmente, já que alguns anos possuem uma média de valores no ranking diferente da dos outros. Feito isso, juntamos os datasets separados em `new_df`.

In [331]:
# Slice the predictor frame into one frame per World Cup year, using the
# 'year' column of the full dataset as the row selector.
year_col = df_total['year']
df_2002 = df_pred[year_col == 2002]
df_2006 = df_pred[year_col == 2006]
df_2010 = df_pred[year_col == 2010]
df_2014 = df_pred[year_col == 2014]

In [332]:
def _standardize(frame):
    """Return `frame` transformed by a freshly fitted StandardScaler.

    The result is a new DataFrame carrying the same index and column labels
    as the input.
    """
    scaled = StandardScaler().fit_transform(frame)
    return pd.DataFrame(scaled, index=frame.index, columns=frame.columns)


# Each year gets its own scaler, so values are standardized within the year.
df_2002 = _standardize(df_2002)
df_2006 = _standardize(df_2006)
df_2010 = _standardize(df_2010)
df_2014 = _standardize(df_2014)

In [333]:
# Stack the per-year standardized frames back into one feature matrix X and
# take the integer-cast classification column as the target y.
yearly_frames = [df_2002, df_2006, df_2010, df_2014]
X = pd.concat(yearly_frames)
y = df_total['classification'].astype(int)

In [334]:
# Build the labelled frame as a NEW object instead of aliasing X: with a plain
# `new_df = X`, attaching the label would also add a 'classification' column
# to X itself and leak the target into the feature matrix.
# The label Series is aligned to the rows by index.
new_df = X.assign(classification=df_total['classification'])

In [335]:
# Number of rows per class before any undersampling.
new_df['classification'].value_counts()

0    112
4      4
3      4
2      4
1      4
Name: classification, dtype: int64

In [336]:
# Undersample the majority class (classification == 0) down to 20 rows.
# The number of rows to drop is derived from the current class size (92 for
# the 112 rows in the recorded run), so running this cell a second time is a
# no-op instead of an error. A fixed random_state makes the selection
# reproducible across kernel restarts.
majority_rows = new_df.loc[new_df['classification'] == 0]
n_drop = max(len(majority_rows) - 20, 0)
new_df0 = majority_rows.sample(n_drop, random_state=42)
new_df = new_df.drop(new_df0.index)

In [None]:
# Drop up to 24 randomly chosen rows labelled 6.
# NOTE(review): the recorded class counts in this notebook only list classes
# 0-4, so this selection is presumably empty and the cell looks like a
# leftover from an earlier labelling scheme - confirm and consider deleting.
# The sample size is clamped to the rows available so the cell does not
# raise when fewer than 24 such rows exist.
rows_class_6 = new_df.loc[new_df['classification'] == 6]
new_df0 = rows_class_6.sample(min(24, len(rows_class_6)), random_state=42)
new_df = new_df.drop(new_df0.index)

In [None]:
# Drop up to 12 randomly chosen rows labelled 5.
# NOTE(review): the recorded class counts in this notebook only list classes
# 0-4, so this selection is presumably empty and the cell looks like a
# leftover from an earlier labelling scheme - confirm and consider deleting.
# The sample size is clamped to the rows available so the cell does not
# raise when fewer than 12 such rows exist.
rows_class_5 = new_df.loc[new_df['classification'] == 5]
new_df0 = rows_class_5.sample(min(12, len(rows_class_5)), random_state=42)
new_df = new_df.drop(new_df0.index)

In [337]:
# Renumber the remaining rows from 0 (discarding the old index), then show
# the per-class row counts after undersampling.
new_df = new_df.reset_index(drop=True, inplace=False)
new_df['classification'].value_counts()

0    20
4     4
3     4
2     4
1     4
Name: classification, dtype: int64

### Divisão dos dados em treino e teste

Utilização de `StratifiedShuffleSplit` para a divisão do dataset.

In [338]:
# Every column except the label is a feature. Use an equality comparison:
# `x not in 'classification'` would be a substring test against the string
# and would also exclude any column whose name is a fragment of it
# (e.g. 'class').
feature_cols = [col for col in new_df.columns if col != 'classification']

# A single stratified 75/25 split; random_state pins the shuffle so the
# train/test partition is the same on every run.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

# split() yields (train indices, test indices) pairs; take the only one.
train_idx, test_idx = next(sss.split(new_df[feature_cols], new_df['classification']))

In [339]:
# Materialize the train and test partitions from the split indices.
features = new_df[feature_cols]
labels = new_df['classification']

X_train, y_train = features.loc[train_idx], labels.loc[train_idx]
X_test, y_test = features.loc[test_idx], labels.loc[test_idx]

## Testes com modelos de ML
### Regressão Logística

In [274]:
# Fit a logistic regression with default settings and score it on the test set.
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
print("F-score: ", f1_score(y_test, y_pred, average='micro'))
print("Acurácia: ", accuracy_score(y_test, y_pred))

F-score:  0.5555555555555556
Acurácia:  0.5555555555555556


In [275]:
# Display the current test-set predictions (y_pred as last assigned).
y_pred

array([0, 1, 0, 1, 0, 0, 1, 0, 0])

### Máquinas de Vetores de Suporte

In [276]:
# Fit a support vector classifier with default settings and score it on the
# test set.
svc = SVC()
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
print("F-score: ", f1_score(y_test, y_pred, average='micro'))
print("Acurácia: ", accuracy_score(y_test, y_pred))

F-score:  0.6666666666666666
Acurácia:  0.6666666666666666


In [277]:
# Display the current test-set predictions (y_pred as last assigned).
y_pred

array([0, 0, 0, 1, 0, 0, 0, 0, 0])

### Redes Neurais

In [278]:
# Small multi-layer perceptron: two hidden layers (3 and 2 units), identity
# activation, L-BFGS solver. random_state fixes the weight initialization so
# the reported scores are reproducible.
mlp = MLPClassifier(hidden_layer_sizes=(3,2), activation='identity', solver='lbfgs',
                    random_state=42)
mlp.fit(X_train, y_train)

y_pred = mlp.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
print("F-score: ", f1_score(y_test, y_pred, average='micro'))
print("Acurácia: ", accuracy_score(y_test, y_pred))

F-score:  0.6666666666666666
Acurácia:  0.6666666666666666


In [279]:
# Display the current test-set predictions (y_pred as last assigned).
y_pred

array([0, 3, 2, 1, 0, 0, 0, 0, 3])

In [280]:
# Grid search over activation and solver for the (3, 2) MLP.
params = {'hidden_layer_sizes': [(3,2)],
          'activation':['identity', 'relu'],
          'solver': ['adam', 'lbfgs'],
          'max_iter': [2000]}

# random_state fixes the weight initialization of every candidate model.
mlp = MLPClassifier(random_state=42)

# Use 3 folds: with 4 rows per minority class and a stratified 25% hold-out,
# the training set has only 3 rows of each of those classes, so the default
# 5-fold stratified CV cannot place every class in every fold.
gs = GridSearchCV(mlp, params, n_jobs=-1, scoring='f1_micro', cv=3)

gs.fit(X_train, y_train)

# Estimator refit on the whole training set with the best parameter set.
best_model = gs.best_estimator_




In [281]:
# Mean cross-validated test score of each parameter combination in the grid.
gs.cv_results_['mean_test_score']

array([0.55555556, 0.51851852, 0.51851852, 0.55555556])

### Bagging

In [298]:
# Bagging ensemble of 500 default base estimators; random_state fixes the
# bootstrap resampling so the reported scores are reproducible.
bc = BaggingClassifier(n_estimators=500, random_state=42)

bc.fit(X_train, y_train)

y_pred = bc.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
print("F-score: ", f1_score(y_test, y_pred, average='micro'))
print("Acurácia: ", accuracy_score(y_test, y_pred))

F-score:  0.4444444444444444
Acurácia:  0.4444444444444444


In [299]:
# Display the current test-set predictions (y_pred as last assigned).
y_pred

array([3, 3, 1, 0, 0, 2, 4, 0, 4])

In [300]:
# True test labels as a plain array, for side-by-side comparison with y_pred.
y_test.to_numpy()

array([0, 3, 1, 0, 0, 0, 2, 4, 0])

In [289]:
# Display the held-out feature rows used for evaluation.
X_test

Unnamed: 0,jan,feb,mar,apr,may
7,0.714255,0.801098,0.804446,0.835152,0.902181
25,1.338899,1.275136,1.260623,1.023761,1.024657
33,-0.448348,-0.917788,-0.894197,-0.926965,-0.93535
24,1.638263,1.522995,1.695854,1.726191,1.707498
32,-0.363453,-0.375859,-0.211482,-0.122978,-0.134487
20,-0.051424,-0.001074,-0.061901,-0.019859,-0.028148
26,0.985918,1.1239,1.111279,1.324198,1.307066
9,-0.903456,-0.880481,-0.874786,-0.842064,-0.831514
28,0.691022,0.716402,0.590709,0.147838,0.135277


In [304]:
# NOTE(review): this import would normally live in the imports cell at the top
# of the notebook; it is kept here so the cell runs on its own.
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier and score it on the test set.
nb = GaussianNB()

nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order.
print("F-score: ", f1_score(y_test, y_pred, average='micro'))
print("Acurácia: ", accuracy_score(y_test, y_pred))

F-score:  0.6666666666666666
Acurácia:  0.6666666666666666


In [305]:
# Display the current test-set predictions (y_pred as last assigned).
y_pred

array([0, 3, 1, 0, 0, 2, 2, 2, 2])

In [306]:
# True test labels as a plain array, for side-by-side comparison with y_pred.
y_test.to_numpy()

array([0, 3, 1, 0, 0, 0, 2, 4, 0])

In [315]:
# Read the 2018 ranking table and preview its first five rows.
df2018 = pd.read_csv("dataset_rank_fifa_2018.csv")
df2018.head(5)

Unnamed: 0,country,jan,feb,mar,apr,may
0,Germany,1602,1602,1609,1533,1558
1,Brazil,1483,1484,1489,1384,1431
2,Portugal,1358,1358,1360,1306,1274
3,Argentina,1348,1348,1359,1254,1241
4,Belgium,1325,1325,1337,1346,1298


In [316]:
# Standardize the 2018 monthly ranking columns with their own scaler (the
# training data was likewise scaled one year at a time), keeping the original
# index and column labels.
ranking_2018 = df2018.drop(columns='country')
scaled_2018 = StandardScaler().fit_transform(ranking_2018)
df2018_predict = pd.DataFrame(scaled_2018,
                              index=ranking_2018.index,
                              columns=ranking_2018.columns)

In [329]:
# Display the standardized 2018 feature frame that is fed to the final model.
df2018_predict

Unnamed: 0,jan,feb,mar,apr,may
0,2.245093,2.189513,2.174015,2.13828,2.221156
1,1.807879,1.77097,1.753806,1.596615,1.760278
2,1.34862,1.324051,1.302082,1.313059,1.190532
3,1.31188,1.288581,1.29858,1.124022,1.070777
4,1.227376,1.207001,1.221542,1.458473,1.277627
5,0.882013,0.873585,0.839852,0.789571,0.653446
6,0.801184,0.80974,0.839852,0.629616,0.860297
7,0.731377,0.728159,0.731299,0.851372,0.91836
8,0.705658,0.703331,0.689278,0.804112,0.914731
9,0.503584,0.508247,0.489679,0.585992,0.649818


In [340]:
# Rebuild the full (undersampled) training set: the label column becomes y
# and every other column becomes X.
y = new_df['classification']
X = new_df.drop(columns='classification')

In [341]:
# Final model: a 20-estimator bagging ensemble trained on all undersampled
# rows. random_state fixes the bootstrap resampling so the 2018 predictions
# are the same on every run.
bc = BaggingClassifier(n_estimators=20, random_state=42)

bc.fit(X, y)

# Predict a class for every row of the standardized 2018 frame.
y_pred = bc.predict(df2018_predict)

y_pred

array([0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 4, 0, 0, 0, 0])

In [342]:
# Attach the predicted class to the 2018 table as a 'predict' column.
df2018 = df2018.assign(predict=y_pred)

In [343]:
# Display the 2018 table together with the predicted class of each team.
df2018

Unnamed: 0,country,jan,feb,mar,apr,may,predict
0,Germany,1602,1602,1609,1533,1558,0
1,Brazil,1483,1484,1489,1384,1431,1
2,Portugal,1358,1358,1360,1306,1274,2
3,Argentina,1348,1348,1359,1254,1241,2
4,Belgium,1325,1325,1337,1346,1298,2
5,Spain,1231,1231,1228,1162,1126,0
6,Poland,1209,1213,1228,1118,1183,0
7,Switzerland,1190,1190,1197,1179,1199,0
8,France,1183,1183,1185,1166,1198,0
9,Peru,1128,1128,1128,1106,1125,0
