In [None]:
import numpy as np
import pandas as pd

# models, metrics and preprocessing used in the evaluation
from sklearn import metrics as mt, preprocessing as pp
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR

In [2]:
# Load the raw match history; the first CSV row supplies the column names.
csv_path = 'dataset/sc2-matches-history.csv'
data = pd.read_csv(csv_path, header=0)

In [3]:
# Preview the first five raw rows.
data.head(n=5)

Unnamed: 0,match_date,player_1,player_1_match_status,score,player_2,player_2_match_status,player_1_race,player_2_race,addon,tournament_type
0,09/19/2016,MC,[loser],0–2,Stats,[winner],P,P,LotV,online
1,09/19/2016,MC,[winner],2–1,NaTuRal,[loser],P,T,LotV,online
2,09/19/2016,MC,[loser],1–2,Dark,[winner],P,Z,LotV,online
3,09/13/2016,MC,[loser],0–2,INnoVation,[winner],P,T,LotV,online
4,08/27/2016,MC,[loser],0–1,TRUE,[winner],P,Z,LotV,online


# O que fazer no dataset?
    - match_date           : transformar em algo mais útil, e.g. dia da semana
    - player_1             : transformar em id
    - player_1_match_status: transformar em binário
    - score                : transformar em algo mais útil (?)
    - player_2             : transformar em id
    - player_2_match_status: apagar coluna; redundante com a coluna player_1_match_status
    - player_1_race        : usar one-hot encoding para separar em colunas
    - player_2_race        : usar one-hot encoding para separar em colunas
    - addon                : usar one-hot encoding para separar em colunas
    - tournament_type      : transformar em binário

In [4]:
# Count the missing values in every column.
data.isna().sum()

match_date               0
player_1                 0
player_1_match_status    0
score                    0
player_2                 1
player_2_match_status    0
player_1_race            0
player_2_race            0
addon                    0
tournament_type          0
dtype: int64

## drop null rows

In [5]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis='index')

In [6]:
# Confirm that no missing values remain after the drop.
data.isna().sum()

match_date               0
player_1                 0
player_1_match_status    0
score                    0
player_2                 0
player_2_match_status    0
player_1_race            0
player_2_race            0
addon                    0
tournament_type          0
dtype: int64

## Preprocessing data

### Acrescentando a coluna day_of_week a partir da coluna match_date e apagando as colunas match_date e score (?)

In [7]:
# Turn match_date into the name of the weekday on which the match was played.
# Series.dt.weekday_name was removed in pandas 1.0; Series.dt.day_name()
# (available since pandas 0.23) yields the same English day names.
data['match_date'] = pd.to_datetime(data['match_date'])
data['day_of_week'] = data['match_date'].dt.day_name()

In [8]:
# Remove the raw date (now summarised by day_of_week) and the score column
# in a single call.
data = data.drop(columns=['match_date', 'score'])

In [9]:
# Preview the frame with day_of_week added and match_date/score removed.
data.head(n=5)

Unnamed: 0,player_1,player_1_match_status,player_2,player_2_match_status,player_1_race,player_2_race,addon,tournament_type,day_of_week
0,MC,[loser],Stats,[winner],P,P,LotV,online,Monday
1,MC,[winner],NaTuRal,[loser],P,T,LotV,online,Monday
2,MC,[loser],Dark,[winner],P,Z,LotV,online,Monday
3,MC,[loser],INnoVation,[winner],P,T,LotV,online,Tuesday
4,MC,[loser],TRUE,[winner],P,Z,LotV,online,Saturday


### Transforming columns "player_1_match_status" and "tournament_type" into binary and dropping column "player_2_match_status"

In [10]:
# Number the sorted unique status labels from 0 and replace each label by
# its number (with the labels previewed above: '[loser]' -> 0, '[winner]' -> 1).
status_labels = np.unique(data['player_1_match_status'])
transform_p1_match_status = dict(zip(status_labels, range(len(status_labels))))
data['player_1_match_status'] = data['player_1_match_status'].map(transform_p1_match_status)

In [11]:
# Same integer encoding for the tournament type: sorted unique labels are
# numbered from 0 and substituted into the column.
tournament_labels = np.unique(data['tournament_type'])
transform_tournament_type = dict(zip(tournament_labels, range(len(tournament_labels))))
data['tournament_type'] = data['tournament_type'].map(transform_tournament_type)

In [12]:
# player_2_match_status mirrors player_1_match_status, so it is removed.
data = data.drop(columns='player_2_match_status')

In [13]:
# Preview the frame after the integer encoding of status and tournament type.
data.head(n=5)

Unnamed: 0,player_1,player_1_match_status,player_2,player_1_race,player_2_race,addon,tournament_type,day_of_week
0,MC,0,Stats,P,P,LotV,1,Monday
1,MC,1,NaTuRal,P,T,LotV,1,Monday
2,MC,0,Dark,P,Z,LotV,1,Monday
3,MC,0,INnoVation,P,T,LotV,1,Tuesday
4,MC,0,TRUE,P,Z,LotV,1,Saturday


## Transforming "player_1" and "player_2" columns into unique numbers

In [14]:
# Build one id table over both player columns so that a given player gets the
# same integer whether listed as player_1 or as player_2.
all_players = np.unique(data[['player_1', 'player_2']].values)
transform_players = dict(zip(all_players, range(len(all_players))))

for player_col in ('player_1', 'player_2'):
    data[player_col] = data[player_col].map(transform_players)

In [15]:
# A name missing from the id table would map to NaN; check that none did.
data.isna().sum()

player_1                 0
player_1_match_status    0
player_2                 0
player_1_race            0
player_2_race            0
addon                    0
tournament_type          0
day_of_week              0
dtype: int64

In [16]:
# Preview the frame with players replaced by integer ids.
data.head(n=5)

Unnamed: 0,player_1,player_1_match_status,player_2,player_1_race,player_2_race,addon,tournament_type,day_of_week
0,4450,0,7328,P,P,LotV,1,Monday
1,4450,1,5125,P,T,LotV,1,Monday
2,4450,0,1575,P,Z,LotV,1,Monday
3,4450,0,3270,P,T,LotV,1,Tuesday
4,4450,0,7548,P,Z,LotV,1,Saturday


### ATÉ AQUI
    - match_date           : não existe mais
    - score                : não existe mais
    - player_2_match_status: não existe mais
    ------------------------------------------------------------------
    - player_1             : OK
    - player_1_match_status: OK
    - player_2             : OK
    - player_1_race        : usar one-hot encoding para separar em colunas
    - player_2_race        : usar one-hot encoding para separar em colunas
    - addon                : usar one-hot encoding para separar em colunas
    - tournament_type      : OK
    - day_of_week          : OK

## One-Hot Encoding with Pandas

In [17]:
# One-hot encode the remaining text columns (races, addon, day of week).
# drop_first=True omits the first level of each encoded column, which guards
# against perfectly collinear indicator columns.
columns_to_encode = [
    'player_1',
    'player_1_match_status',
    'player_2',
    'player_1_race',
    'player_2_race',
    'addon',
    'tournament_type',
    'day_of_week',
]
data = pd.get_dummies(data[columns_to_encode], drop_first=True)

In [18]:
# Preview the one-hot encoded frame.
data.head(n=5)

Unnamed: 0,player_1,player_1_match_status,player_2,tournament_type,player_1_race_R,player_1_race_T,player_1_race_Z,player_2_race_R,player_2_race_T,player_2_race_Z,addon_LotV,addon_WoL,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday
0,4450,0,7328,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0
1,4450,1,5125,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0
2,4450,0,1575,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0
3,4450,0,3270,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0
4,4450,0,7548,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0


## Rearranging columns

In [19]:
# Reorder the columns so that every feature comes first and the target
# (player_1_match_status) is the last column.
feature_columns = [
    'player_1', 'player_2', 'tournament_type',
    'player_1_race_R', 'player_1_race_T', 'player_1_race_Z',
    'player_2_race_R', 'player_2_race_T', 'player_2_race_Z',
    'addon_LotV', 'addon_WoL',
    'day_of_week_Monday', 'day_of_week_Saturday', 'day_of_week_Sunday',
    'day_of_week_Thursday', 'day_of_week_Tuesday', 'day_of_week_Wednesday',
]
target_column = 'player_1_match_status'

data = data[feature_columns + [target_column]]

In [20]:
# Preview the reordered frame; the target is now the right-most column.
data.head(n=5)

Unnamed: 0,player_1,player_2,tournament_type,player_1_race_R,player_1_race_T,player_1_race_Z,player_2_race_R,player_2_race_T,player_2_race_Z,addon_LotV,addon_WoL,day_of_week_Monday,day_of_week_Saturday,day_of_week_Sunday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,player_1_match_status
0,4450,7328,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
1,4450,5125,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,1
2,4450,1575,1,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0
3,4450,3270,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0
4,4450,7548,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0


### ATÉ AQUI
    - match_date           : não existe mais
    - score                : não existe mais
    - player_2_match_status: não existe mais
    ------------------------------------------------------------------
    - player_1             : OK
    - player_1_match_status: OK
    - player_2             : OK
    - player_1_race        : OK
    - player_2_race        : OK
    - addon                : OK
    - tournament_type      : OK
    - day_of_week          : OK

# 2ª PARTE - TESTES

## Setting X and y

In [21]:
# The first 17 columns are the features; column 17 (the last one after the
# reordering step) is the target, player_1_match_status.
n_features = 17
X = data.iloc[:, :n_features].values
y = data.iloc[:, n_features].values

X, y

(array([[4450, 7328,    1, ...,    0,    0,    0],
        [4450, 5125,    1, ...,    0,    0,    0],
        [4450, 1575,    1, ...,    0,    0,    0],
        ...,
        [6569, 3578,    1, ...,    1,    0,    0],
        [6569, 5285,    1, ...,    1,    0,    0],
        [6569, 5824,    1, ...,    0,    0,    1]]),
 array([0, 1, 0, ..., 0, 1, 0]))

# Falta 
    - processar os dados e calcular a accuracy

In [None]:
# Per-fold accuracy of each classifier, filled in by the loop below.
vt_acc_lr  = []
vt_acc_knn = []
vt_acc_nb  = []
# Holds the support-vector classifier scores; the "svr" names are kept
# because the reporting cell reads vt_acc_svr.
vt_acc_svr = []

# Stratified 5-fold splitter with a fixed seed so the folds are reproducible.
# (Despite its name, clf is the fold generator, not a classifier.)
clf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in clf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit the scaler on the training fold only, then apply it to both folds,
    # so no statistics of the test fold leak into training.
    std = pp.StandardScaler().fit(X_train)
    X_train_std = std.transform(X_train)
    X_test_std = std.transform(X_test)

    model_lr = LogisticRegression().fit(X_train_std, y_train)
    y_pred_lr = model_lr.predict(X_test_std)
    vt_acc_lr.append(mt.accuracy_score(y_test, y_pred_lr))

    # The target is a class label, so use the k-NN classifier, and predict
    # with the k-NN model itself (the original reused model_lr here, which
    # made the k-NN scores a copy of the logistic-regression scores).
    model_knn = KNeighborsClassifier().fit(X_train_std, y_train)
    y_pred_knn = model_knn.predict(X_test_std)
    vt_acc_knn.append(mt.accuracy_score(y_test, y_pred_knn))

    model_nb = GaussianNB().fit(X_train_std, y_train)
    y_pred_nb = model_nb.predict(X_test_std)
    vt_acc_nb.append(mt.accuracy_score(y_test, y_pred_nb))

    # SVR is a regressor whose continuous predictions accuracy_score rejects
    # for binary labels; SVC is the classification counterpart.
    model_svr = SVC().fit(X_train_std, y_train)
    y_pred_svr = model_svr.predict(X_test_std)
    vt_acc_svr.append(mt.accuracy_score(y_test, y_pred_svr))

In [None]:
# Short labels and the matching per-fold accuracy lists, in the same order.
class_names = ['LR', 'kNN', 'NB', 'SVR']
class_1 = [vt_acc_lr, vt_acc_knn, vt_acc_nb, vt_acc_svr]

# Report the mean accuracy over the folds for every model.
report_lines = [
    ('Accuracy Logistic Regression = {}', vt_acc_lr),
    ('Accuracy kNN = {}', vt_acc_knn),
    ('Accuracy Naive Bayes = {}', vt_acc_nb),
    ('Accuracy SVR = {}', vt_acc_svr),
]
for template, fold_scores in report_lines:
    print(template.format(np.mean(fold_scores)))