# Pré-processamento copa dataset

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score as acc

In [2]:
df_total = pd.read_csv("dataset_rank_fifa_classificacoes.csv")
df_pred = df_total.drop(['country','year','classification'], axis=1)

In [3]:
df_2002 = df_pred.loc[df_total['year'] == 2002]
df_2006 = df_pred.loc[df_total['year'] == 2006]
df_2010 = df_pred.loc[df_total['year'] == 2010]
df_2014 = df_pred.loc[df_total['year'] == 2014]

In [4]:
df_2002 = pd.DataFrame(StandardScaler().fit_transform(df_2002), index=df_2002.index, columns=df_2002.columns)
df_2006 = pd.DataFrame(StandardScaler().fit_transform(df_2006), index=df_2006.index, columns=df_2006.columns)
df_2010 = pd.DataFrame(StandardScaler().fit_transform(df_2010), index=df_2010.index, columns=df_2010.columns)
df_2014 = pd.DataFrame(StandardScaler().fit_transform(df_2014), index=df_2014.index, columns=df_2014.columns)

In [5]:
X = pd.concat([df_2002,df_2006,df_2010,df_2014])

In [64]:
y = df_total['classification'].astype(int)

In [65]:
new_df = X
new_df['classification'] = df_total['classification']

In [66]:
new_df.classification.value_counts()

0    64
6    32
5    16
4     4
3     4
2     4
1     4
Name: classification, dtype: int64

In [67]:
new_df.head()

Unnamed: 0,jan,feb,mar,apr,may,classification
0,0.041544,-0.093359,-0.03517,-0.17855,-0.209645,6
1,-2.13676,-1.435043,-1.410712,-1.35813,-1.302627,5
2,-0.086592,-0.236472,-0.231676,-0.17855,-0.303868,0
3,2.283916,2.39323,2.376494,2.586091,2.522809,0
4,0.970527,0.926321,0.965224,0.835152,0.845647,5


In [83]:
feature_cols = [x for x in new_df.columns if x not in 'classification']

sss = StratifiedShuffleSplit(n_splits=3)

train_idx, test_idx = next(sss.split(new_df[feature_cols], new_df['classification']))

In [84]:
X_train = new_df.loc[train_idx, feature_cols]
y_train = new_df.loc[train_idx, 'classification']

X_test = new_df.loc[test_idx, feature_cols]
y_test = new_df.loc[test_idx, 'classification']

In [85]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train,y_train)

y_pred = lr.predict(X_test)

In [17]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train,y_train)

y_pred = svc.predict(X_test)

In [82]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score


mlp = MLPClassifier(hidden_layer_sizes=(3,2), activation='identity', solver='lbfgs')

print(cross_val_score(mlp,X,y,cv=sss))

[1. 1. 1.]


In [86]:
y_pred

array([0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0])

In [89]:
from sklearn.metrics import f1_score

f1_score(y_pred, y_test, average='micro')

0.46153846153846156

In [96]:
from sklearn.model_selection import GridSearchCV

params = {'hidden_layer_sizes': [(3,2)], 'activation':['identity', 'relu'], 'solver': ['adam', 'lbfgs'], 'max_iter': [1000]}

mlp = MLPClassifier()
gs = GridSearchCV(mlp, params, n_jobs=-1, scoring='f1_micro')

gs.fit(X_train, y_train)

best_model = gs.best_estimator_




In [97]:
gs.predict(X_test)

array([0, 0, 0, 0, 6, 0, 0, 0, 0, 6, 0, 0, 6])

In [98]:
best_model

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(3, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)