# **4. Encoding & Normalization**

In [3]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the extracted per-game feature table and preview its first rows.
games_data = pd.read_csv(filepath_or_buffer='./data/games_extracted_data.csv')
games_data.head(n=3)

Unnamed: 0,Event,W_ES,B_ES,ECO,WS_OPC,BS_OPC,PG-MovesCount,White-Mistakes,Black-Mistakes,W-WP,B-WP,AN,W,B,Termination
0,Blitz,0.510072,0.489928,C30,1-N,0-N,13,8,12,0.451226,0.548774,1. e4 { [%eval 0.27] } 1... e5 { [%eval 0.27] ...,1,0,Time forfeit
1,Bullet,0.491366,0.508634,B00,0-WC2,1-BC3,10,2,2,0.495683,0.504317,1. e4 { [%eval 0.27] } 1... b6 { [%eval 0.51] ...,1,0,Time forfeit
2,Classical,0.280081,0.719919,C00,0-WC1,1-BC2,18,11,7,0.79924,0.20076,1. e4 { [%eval 0.22] } 1... e6 { [%eval 0.41] ...,0,1,Time forfeit


In [2]:
from sklearn.preprocessing import LabelEncoder

def _one_hot(series, prefix, expected_columns):
    """One-hot encode ``series`` into exactly ``expected_columns`` (0/1 integers)."""
    # dtype=int keeps the 0/1 representation on every pandas version
    # (pandas >= 2.0 would otherwise return booleans).
    dummies = pd.get_dummies(series, prefix=prefix, dtype=int)
    # reindex adds an all-zero column for every expected category that does not
    # occur in the data and drops categories that are not part of the schema.
    return dummies.reindex(columns=expected_columns, fill_value=0)


def encode(df):
    """Encode the extracted games table into a purely numeric feature frame.

    Steps performed on a copy of ``df`` (the input frame is not modified):

    * ``Event``, ``WS_OPC`` and ``BS_OPC`` are one-hot encoded with the
      prefixes ``E``, ``WO`` and ``BO``. The set of output columns is fixed,
      so a category missing from ``df`` yields an all-zero column instead of
      a ``KeyError``.
    * ``White-Mistakes`` and ``Black-Mistakes`` are divided by the largest
      ``PG-MovesCount`` found in ``df``.
    * ``Termination`` is label-encoded with ``LabelEncoder``; the resulting
      label -> code mapping is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Event``, ``WS_OPC``, ``BS_OPC``,
        ``PG-MovesCount``, ``White-Mistakes``, ``Black-Mistakes``, ``W_ES``,
        ``B_ES``, ``W-WP``, ``B-WP``, ``W``, ``B`` and ``Termination``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the 29 columns listed in ``COLS`` below,
        in that order.

    Raises
    ------
    KeyError
        If one of the required input columns is missing from ``df``.
    """
    EVENT_COLS = ['E_Blitz', 'E_Bullet', 'E_Classical', 'E_Correspondence']
    WS_OPC_COLS = ['WO_0-N', 'WO_0-WC1', 'WO_0-WC2', 'WO_0-WC3',
                   'WO_1-N', 'WO_1-WC1', 'WO_1-WC2', 'WO_1-WC3']
    BS_OPC_COLS = ['BO_0-BC1', 'BO_0-BC2', 'BO_0-BC3', 'BO_0-N',
                   'BO_1-BC1', 'BO_1-BC2', 'BO_1-BC3', 'BO_1-N']
    PASSTHROUGH_COLS = ['W_ES', 'B_ES', 'White-Mistakes', 'Black-Mistakes', 'W-WP', 'B-WP',
                        'W', 'B', 'Termination']
    COLS = EVENT_COLS + WS_OPC_COLS + BS_OPC_COLS + PASSTHROUGH_COLS

    df_cpy = df.copy()

    # One-hot encode the three categorical columns against the fixed schema.
    event_dummies = _one_hot(df_cpy['Event'], 'E', EVENT_COLS)
    ws_opc_dummies = _one_hot(df_cpy['WS_OPC'], 'WO', WS_OPC_COLS)
    bs_opc_dummies = _one_hot(df_cpy['BS_OPC'], 'BO', BS_OPC_COLS)

    # Scale both mistake counts by the largest move count in the frame.
    # NOTE(review): this is not a min-max scaling of the mistake columns; the
    # result stays within [0, 1] only if no mistake count exceeds the largest
    # PG-MovesCount - confirm this is the intended normalisation.
    max_moves = df_cpy['PG-MovesCount'].max()
    df_cpy['White-Mistakes'] = df_cpy['White-Mistakes'] / max_moves
    df_cpy['Black-Mistakes'] = df_cpy['Black-Mistakes'] / max_moves

    # Encode the 'Termination' column using LabelEncoder
    le = LabelEncoder()
    df_cpy['Termination'] = le.fit_transform(df_cpy['Termination'])

    mapping = dict(zip(le.classes_, range(len(le.classes_))))
    print(f"LabelEncoder mappings for Termination: \n{mapping}")

    # Merge all the one-hot encoded columns with the original dataframe and
    # keep only the model features/targets, in a stable order.
    df_cpy = pd.concat([df_cpy, event_dummies, ws_opc_dummies, bs_opc_dummies], axis=1)

    return df_cpy[COLS]

In [4]:
# Encode the raw games table and preview the first rows of the result.
encoded_games = encode(df=games_data)
encoded_games.head(n=3)

LabelEncoder mappings for Termination: 
{'Checkmate': 0, 'Resignation': 1, 'Time forfeit': 2}


Unnamed: 0,E_Blitz,E_Bullet,E_Classical,E_Correspondence,WO_0-N,WO_0-WC1,WO_0-WC2,WO_0-WC3,WO_1-N,WO_1-WC1,...,BO_1-N,W_ES,B_ES,White-Mistakes,Black-Mistakes,W-WP,B-WP,W,B,Termination
0,1,0,0,0,0,0,0,0,1,0,...,0,0.510072,0.489928,0.186047,0.27907,0.451226,0.548774,1,0,2
1,0,1,0,0,0,0,1,0,0,0,...,0,0.491366,0.508634,0.046512,0.046512,0.495683,0.504317,1,0,2
2,0,0,1,0,0,1,0,0,0,0,...,0,0.280081,0.719919,0.255814,0.162791,0.79924,0.20076,0,1,2


## Divide dataset by approaches - White & Black

### (1) White

In [5]:
# Column subset for the White datasets, grouped by origin.
W_COLS = [
    # one-hot encoded Event
    'E_Blitz', 'E_Bullet', 'E_Classical', 'E_Correspondence',
    # one-hot encoded WS_OPC
    'WO_0-N', 'WO_0-WC1', 'WO_0-WC2', 'WO_0-WC3',
    'WO_1-N', 'WO_1-WC1', 'WO_1-WC2', 'WO_1-WC3',
    # numeric features
    'W_ES', 'B_ES', 'White-Mistakes', 'W-WP',
    # target columns (one of them is dropped per task later on)
    'W', 'Termination',
]

w_games = encoded_games.loc[:, W_COLS]
w_games.head(n=3)

Unnamed: 0,E_Blitz,E_Bullet,E_Classical,E_Correspondence,WO_0-N,WO_0-WC1,WO_0-WC2,WO_0-WC3,WO_1-N,WO_1-WC1,WO_1-WC2,WO_1-WC3,W_ES,B_ES,White-Mistakes,W-WP,W,Termination
0,1,0,0,0,0,0,0,0,1,0,0,0,0.510072,0.489928,0.186047,0.451226,1,2
1,0,1,0,0,0,0,1,0,0,0,0,0,0.491366,0.508634,0.046512,0.495683,1,2
2,0,0,1,0,0,1,0,0,0,0,0,0,0.280081,0.719919,0.255814,0.79924,0,2


In [6]:
# Build one frame per task by removing the other task's target column.
w_wl_classification = w_games.drop(columns='Termination')
w_termination_classification = w_games.drop(columns='W')

In [7]:
# Preview of the White win/loss classification dataset (target column: 'W')
w_wl_classification.head(n=3)

Unnamed: 0,E_Blitz,E_Bullet,E_Classical,E_Correspondence,WO_0-N,WO_0-WC1,WO_0-WC2,WO_0-WC3,WO_1-N,WO_1-WC1,WO_1-WC2,WO_1-WC3,W_ES,B_ES,White-Mistakes,W-WP,W
0,1,0,0,0,0,0,0,0,1,0,0,0,0.510072,0.489928,0.186047,0.451226,1
1,0,1,0,0,0,0,1,0,0,0,0,0,0.491366,0.508634,0.046512,0.495683,1
2,0,0,1,0,0,1,0,0,0,0,0,0,0.280081,0.719919,0.255814,0.79924,0


In [8]:
# Preview of the White termination classification dataset (target column: 'Termination')
w_termination_classification.head(n=3)

Unnamed: 0,E_Blitz,E_Bullet,E_Classical,E_Correspondence,WO_0-N,WO_0-WC1,WO_0-WC2,WO_0-WC3,WO_1-N,WO_1-WC1,WO_1-WC2,WO_1-WC3,W_ES,B_ES,White-Mistakes,W-WP,Termination
0,1,0,0,0,0,0,0,0,1,0,0,0,0.510072,0.489928,0.186047,0.451226,2
1,0,1,0,0,0,0,1,0,0,0,0,0,0.491366,0.508634,0.046512,0.495683,2
2,0,0,1,0,0,1,0,0,0,0,0,0,0.280081,0.719919,0.255814,0.79924,2


### (2) Black

In [9]:
# Column subset for the Black datasets, grouped by origin.
B_COLS = [
    # one-hot encoded Event
    'E_Blitz', 'E_Bullet', 'E_Classical', 'E_Correspondence',
    # one-hot encoded BS_OPC
    'BO_0-BC1', 'BO_0-BC2', 'BO_0-BC3', 'BO_0-N',
    'BO_1-BC1', 'BO_1-BC2', 'BO_1-BC3', 'BO_1-N',
    # numeric features
    'W_ES', 'B_ES', 'Black-Mistakes', 'B-WP',
    # target columns (one of them is dropped per task later on)
    'B', 'Termination',
]

b_games = encoded_games.loc[:, B_COLS]
b_games.head(n=3)

Unnamed: 0,E_Blitz,E_Bullet,E_Classical,E_Correspondence,BO_0-BC1,BO_0-BC2,BO_0-BC3,BO_0-N,BO_1-BC1,BO_1-BC2,BO_1-BC3,BO_1-N,W_ES,B_ES,Black-Mistakes,B-WP,B,Termination
0,1,0,0,0,0,0,0,1,0,0,0,0,0.510072,0.489928,0.27907,0.548774,0,2
1,0,1,0,0,0,0,0,0,0,0,1,0,0.491366,0.508634,0.046512,0.504317,0,2
2,0,0,1,0,0,0,0,0,0,1,0,0,0.280081,0.719919,0.162791,0.20076,1,2


In [10]:
# Build one frame per task by removing the other task's target column.
b_wl_classification = b_games.drop(columns='Termination')
b_termination_classification = b_games.drop(columns='B')

In [11]:
# Preview of the Black win/loss classification dataset (target column: 'B')
b_wl_classification.head(n=3)

Unnamed: 0,E_Blitz,E_Bullet,E_Classical,E_Correspondence,BO_0-BC1,BO_0-BC2,BO_0-BC3,BO_0-N,BO_1-BC1,BO_1-BC2,BO_1-BC3,BO_1-N,W_ES,B_ES,Black-Mistakes,B-WP,B
0,1,0,0,0,0,0,0,1,0,0,0,0,0.510072,0.489928,0.27907,0.548774,0
1,0,1,0,0,0,0,0,0,0,0,1,0,0.491366,0.508634,0.046512,0.504317,0
2,0,0,1,0,0,0,0,0,0,1,0,0,0.280081,0.719919,0.162791,0.20076,1


In [12]:
# Preview of the Black termination classification dataset (target column: 'Termination')
b_termination_classification.head(n=3)

Unnamed: 0,E_Blitz,E_Bullet,E_Classical,E_Correspondence,BO_0-BC1,BO_0-BC2,BO_0-BC3,BO_0-N,BO_1-BC1,BO_1-BC2,BO_1-BC3,BO_1-N,W_ES,B_ES,Black-Mistakes,B-WP,Termination
0,1,0,0,0,0,0,0,1,0,0,0,0,0.510072,0.489928,0.27907,0.548774,2
1,0,1,0,0,0,0,0,0,0,0,1,0,0.491366,0.508634,0.046512,0.504317,2
2,0,0,1,0,0,0,0,0,0,1,0,0,0.280081,0.719919,0.162791,0.20076,2


## Export

In [13]:
# Write the four encoded datasets to CSV (without the index column).
# The output directory is created first so the export also works when
# './data/encoded' does not exist yet; exist_ok=True keeps re-runs harmless.
ENCODED_DIR = './data/encoded'
os.makedirs(ENCODED_DIR, exist_ok=True)

w_wl_classification.to_csv(f'{ENCODED_DIR}/w_wl_classification.csv', index=False)
w_termination_classification.to_csv(f'{ENCODED_DIR}/w_termination_classification.csv', index=False)

b_wl_classification.to_csv(f'{ENCODED_DIR}/b_wl_classification.csv', index=False)
b_termination_classification.to_csv(f'{ENCODED_DIR}/b_termination_classification.csv', index=False)

## Train-Test-Split & Export

In [15]:
import os
from sklearn.model_selection import train_test_split

def _split_and_export(dataset, target, out_dir, prefix, test_size=0.25, random_state=42):
    """Split ``dataset`` into train/test parts and write the four parts to CSV.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the feature columns plus the ``target`` column.
    target : str
        Name of the label column; every other column is used as a feature.
    out_dir : str
        Directory that receives the CSV files. It is created (including
        missing parents) when absent and reused when it already exists.
    prefix : str
        File-name prefix; the files are named ``<prefix>X_train.csv``,
        ``<prefix>X_test.csv``, ``<prefix>Y_train.csv`` and ``<prefix>Y_test.csv``.
    test_size : float, default 0.25
        Passed through to ``train_test_split``.
    random_state : int, default 42
        Passed through to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X, Y, X_train, X_test, Y_train, Y_test)``.
    """
    X = dataset.drop(target, axis=1)
    Y = dataset[target]
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # os.mkdir raised FileExistsError on every re-run of the cell and
    # FileNotFoundError when the parent directory was missing;
    # makedirs(exist_ok=True) handles both cases.
    os.makedirs(out_dir, exist_ok=True)

    parts = {'X_train': X_train, 'X_test': X_test, 'Y_train': Y_train, 'Y_test': Y_test}
    for suffix, part in parts.items():
        part.to_csv(f'{out_dir}/{prefix}{suffix}.csv', index=False)

    return X, Y, X_train, X_test, Y_train, Y_test


### White Win/Loss Classification
wwcX, wwcY, wwcX_train, wwcX_test, wwcY_train, wwcY_test = _split_and_export(
    w_wl_classification, 'W', './data/train-test-split/white-win-loss', 'wwc'
)

### White Termination Classification
wtcX, wtcY, wtcX_train, wtcX_test, wtcY_train, wtcY_test = _split_and_export(
    w_termination_classification, 'Termination', './data/train-test-split/white-termination', 'wtc'
)

### Black Win/Loss Classification
bwcX, bwcY, bwcX_train, bwcX_test, bwcY_train, bwcY_test = _split_and_export(
    b_wl_classification, 'B', './data/train-test-split/black-win-loss', 'bwc'
)

### Black Termination Classification
btcX, btcY, btcX_train, btcX_test, btcY_train, btcY_test = _split_and_export(
    b_termination_classification, 'Termination', './data/train-test-split/black-termination', 'btc'
)