# Pré-processamento copa dataset

In [191]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score as acc

In [192]:
df_total = pd.read_csv("dataset_rank_fifa.csv")
df_pred = df_total.drop(['country','year','classification'], axis=1)

In [193]:
df_2002 = df_pred.loc[df_total['year'] == 2002]
df_2006 = df_pred.loc[df_total['year'] == 2006]
df_2010 = df_pred.loc[df_total['year'] == 2010]
df_2014 = df_pred.loc[df_total['year'] == 2014]

In [194]:
df_2002 = pd.DataFrame(StandardScaler().fit_transform(df_2002), index=df_2002.index, columns=df_2002.columns)
df_2006 = pd.DataFrame(StandardScaler().fit_transform(df_2006), index=df_2006.index, columns=df_2006.columns)
df_2010 = pd.DataFrame(StandardScaler().fit_transform(df_2010), index=df_2010.index, columns=df_2010.columns)
df_2014 = pd.DataFrame(StandardScaler().fit_transform(df_2014), index=df_2014.index, columns=df_2014.columns)

In [195]:
X = pd.concat([df_2002,df_2006,df_2010,df_2014])

In [196]:
y = df_total['classification']

In [197]:
new_df = X
new_df['classification'] = df_total['classification']
new_df.classification = new_df.classification.astype(int)

In [198]:
new_df.min()

jan              -2.136760
feb              -2.100925
mar              -2.219902
apr              -2.197709
may              -2.240329
classification    0.000000
dtype: float64

In [199]:
new_df.head()

Unnamed: 0,jan,feb,mar,apr,may,classification
0,0.041544,-0.093359,-0.03517,-0.17855,-0.209645,0
1,-2.13676,-1.435043,-1.410712,-1.35813,-1.302627,0
2,-0.086592,-0.236472,-0.231676,-0.17855,-0.303868,0
3,2.283916,2.39323,2.376494,2.586091,2.522809,0
4,0.970527,0.926321,0.965224,0.835152,0.845647,0


In [200]:
new_df.classification.value_counts()

0    111
4      4
3      4
2      4
1      4
Name: classification, dtype: int64

In [201]:
new_df0 = new_df.loc[new_df['classification'] == 0]

In [202]:
new_df0 = new_df0.sample(95)

In [203]:
new_df = new_df.drop(new_df0.index)
new_df

Unnamed: 0,jan,feb,mar,apr,may,classification
0,0.041544,-0.093359,-0.03517,-0.17855,-0.209645,0
8,1.979594,2.071226,2.037075,2.16218,2.183608,1
9,-0.102609,-0.254361,-0.320997,-0.362859,-0.266179,3
12,-1.127693,-1.30982,-1.339255,-1.247544,-1.227249,4
14,1.146713,1.123102,1.054545,1.037892,1.090626,0
16,0.602137,0.514871,0.643669,0.503395,0.506446,2
17,0.041544,-0.093359,-0.017306,0.00576,0.110712,0
29,-0.054558,-0.200694,-0.160219,-0.233843,-0.285023,0
30,-0.070575,-0.236472,-0.303133,-0.362859,-0.454624,0
36,0.999414,0.981629,0.926564,0.949574,0.934895,2


In [204]:
new_df = new_df.reset_index(drop=True)

In [205]:
feature_cols = [x for x in new_df.columns if x not in 'classification']

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2)

train_idx, test_idx = next(sss.split(new_df[feature_cols], new_df['classification']))

In [206]:
X_train = new_df.loc[train_idx, feature_cols]
y_train = new_df.loc[train_idx, 'classification']

X_test = new_df.loc[test_idx, feature_cols]
y_test = new_df.loc[test_idx, 'classification']

In [207]:
y_train

2     3
12    1
0     0
8     0
20    0
35    0
19    2
7     0
23    0
22    0
11    4
15    0
16    0
17    0
24    0
18    1
32    0
9     2
28    2
21    3
25    4
27    1
3     4
33    0
10    0
4     0
29    3
26    0
Name: classification, dtype: int64

In [208]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train,y_train)

y_pred = lr.predict(X_test)

In [212]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train,y_train)

y_pred = svc.predict(X_test)

In [213]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1])

In [214]:
y_test

13    3
5     2
31    0
34    0
14    0
30    4
6     0
1     1
Name: classification, dtype: int64

In [215]:
from sklearn.metrics import f1_score

f1_score(y_pred, y_test, average='micro')

0.625