# **3. Feature Extraction**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned chess games CSV and preview its first three rows.
cleaned_games_csv = './data/cleaned_chess_games.csv'
data = pd.read_csv(filepath_or_buffer=cleaned_games_csv)
data.head(n=3)

Unnamed: 0,Event,WhiteElo,W_ES,BlackElo,B_ES,ECO,PG-MovesCount,White-Mistakes,Black-Mistakes,W-WP,B-WP,AN,W,B,Termination
0,Blitz,1761,0.510072,1754,0.489928,C30,13,8,12,0.451226,0.548774,1. e4 { [%eval 0.27] } 1... e5 { [%eval 0.27] ...,1,0,Time forfeit
1,Bullet,1964,0.491366,1970,0.508634,B00,10,2,2,0.495683,0.504317,1. e4 { [%eval 0.27] } 1... b6 { [%eval 0.51] ...,1,0,Time forfeit
2,Classical,1597,0.280081,1761,0.719919,C00,18,11,7,0.79924,0.20076,1. e4 { [%eval 0.22] } 1... e6 { [%eval 0.41] ...,0,1,Time forfeit


In [2]:
import math

def organize_features(df):
    """Derive the opening/strength features and return the modelling columns.

    Each game's ECO code is bucketed into an opening class per colour
    (``WC1``-``WC3`` for White, ``BC1``-``BC3`` for Black, ``"N"`` when the
    code appears in none of the lists). The class is then prefixed with a
    flag comparing the two ratings: ``"1-"`` when that side's Elo is greater
    than or equal to the opponent's, ``"0-"`` otherwise. On equal ratings
    both sides therefore receive ``"1-"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Games table containing ``WhiteElo``, ``BlackElo``, ``ECO`` and every
        pass-through column listed under Returns. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with, in order: ``Event``, ``W_ES``, ``B_ES``, ``ECO``,
        ``WS_OPC``, ``BS_OPC``, ``PG-MovesCount``, ``White-Mistakes``,
        ``Black-Mistakes``, ``W-WP``, ``B-WP``, ``AN``, ``W``, ``B``,
        ``Termination``.

    Raises
    ------
    KeyError
        If any required input column is missing; the message lists them all.
    """
    # ECO codes grouped into opening classes, one grouping per colour.
    white_classes = {
        "WC1": ['A00', 'B01', 'C00', 'A40'],
        "WC2": ['C41', 'D00', 'B00', 'C20'],
        "WC3": ['B20'],
    }
    black_classes = {
        "BC1": ['A00'],
        "BC2": ['C00', 'B01', 'A40', 'D00'],
        "BC3": ['C20', 'B00', 'B20', 'C41'],
    }

    output_columns = ['Event', 'W_ES', 'B_ES',
                      'ECO', 'WS_OPC', 'BS_OPC',
                      'PG-MovesCount', 'White-Mistakes', 'Black-Mistakes', 'W-WP', 'B-WP',
                      'AN', 'W', 'B', 'Termination']
    derived_columns = ('WS_OPC', 'BS_OPC')

    # Fail early, naming every absent column, instead of a bare KeyError
    # surfacing from whichever lookup happens to run first.
    required = ['WhiteElo', 'BlackElo'] + [
        col for col in output_columns if col not in derived_columns
    ]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"organize_features: missing required column(s): {missing}")

    def eco_to_class(classes):
        # Invert {label: [eco, ...]} into {eco: label} so it can feed Series.map.
        return {eco: label for label, codes in classes.items() for eco in codes}

    # Codes absent from the lookup come back as NaN; those fall in the "N" bucket.
    w_opc = df['ECO'].map(eco_to_class(white_classes)).fillna("N")
    b_opc = df['ECO'].map(eco_to_class(black_classes)).fillna("N")

    white_elo = df['WhiteElo']
    black_elo = df['BlackElo']

    # Vectorized replacement for the per-row comparison: both tests use >=,
    # so a rating tie flags White and Black alike with "1-".
    ws_opc = np.where(white_elo >= black_elo, "1-" + w_opc, "0-" + w_opc)
    bs_opc = np.where(black_elo >= white_elo, "1-" + b_opc, "0-" + b_opc)

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    return df.assign(WS_OPC=ws_opc, BS_OPC=bs_opc)[output_columns]


In [3]:
# Run the feature extraction on the loaded games and preview three rows.
games_data = organize_features(df=data)
games_data.head(n=3)

Unnamed: 0,Event,W_ES,B_ES,ECO,WS_OPC,BS_OPC,PG-MovesCount,White-Mistakes,Black-Mistakes,W-WP,B-WP,AN,W,B,Termination
0,Blitz,0.510072,0.489928,C30,1-N,0-N,13,8,12,0.451226,0.548774,1. e4 { [%eval 0.27] } 1... e5 { [%eval 0.27] ...,1,0,Time forfeit
1,Bullet,0.491366,0.508634,B00,0-WC2,1-BC3,10,2,2,0.495683,0.504317,1. e4 { [%eval 0.27] } 1... b6 { [%eval 0.51] ...,1,0,Time forfeit
2,Classical,0.280081,0.719919,C00,0-WC1,1-BC2,18,11,7,0.79924,0.20076,1. e4 { [%eval 0.22] } 1... e6 { [%eval 0.41] ...,0,1,Time forfeit


In [4]:
# Write the extracted features to disk; index=False leaves the row index out of the CSV.
extracted_games_csv = './data/games_extracted_data.csv'
games_data.to_csv(path_or_buf=extracted_games_csv, index=False)