In [44]:
import pandas as pd
import numpy as np

In [45]:
# Load the chess game records from "chess_data.csv" (a relative path, so it is
# resolved against the current working directory), then show the column names
# as this cell's output.
raw_data = pd.read_csv("chess_data.csv")
raw_data.columns

Index(['date', 'white', 'black', 'result', 'opening', 'white_elo',
       'black_elo'],
      dtype='object')

In [46]:
# Relabel the two player columns as home_team / away_team and discard the
# "opening" column, expressed as a single method chain.
raw_data = (
    raw_data
    .rename(columns={"white": "home_team", "black": "away_team"})
    .drop(columns=["opening"])
)

In [47]:
# Display the current column names to check the result of the rename/drop step.
raw_data.columns

Index(['date', 'home_team', 'away_team', 'result', 'white_elo', 'black_elo'], dtype='object')

In [48]:
def _parse_score(token):
    """Convert one side of a result string to a float score.

    The half-point spellings "1/2" and "½" map to 0.5; everything else is
    handed to ``float``. Surrounding whitespace is ignored.

    Raises:
        ValueError: if the token is not a recognised score.
    """
    token = token.strip()
    if token in ("1/2", "½"):
        return 0.5
    return float(token)


def _result_sign(result):
    """Return 1, -1 or 0 for a "<home>-<away>" result string.

    1 means the first (home) score is larger, -1 means the second (away)
    score is larger, and 0 means neither is larger (a draw).

    Raises:
        ValueError: if the string does not consist of exactly two scores
            separated by "-", or if a score cannot be parsed.
        AttributeError: if ``result`` is not a string (e.g. a missing value).
    """
    parts = result.strip().split("-")
    if len(parts) != 2:
        raise ValueError(f"Unrecognised result string: {result!r}")
    home, away = (_parse_score(part) for part in parts)
    # bool - bool yields a plain int: 1, -1 or 0.
    return (home > away) - (away > home)


def result_to_HAD(result):
    """Map a result string such as "1-0", "0-1" or "1/2-1/2" to "H", "A" or "D".

    "H" is returned when the first score is larger, "A" when the second is
    larger and "D" otherwise. Leading/trailing whitespace is tolerated.

    Raises:
        ValueError: if the string cannot be parsed as two scores.
    """
    return {1: "H", -1: "A", 0: "D"}[_result_sign(result)]


def results_to_code(result):
    """Map a result string to 1 (first score larger), -1 (second larger) or 0.

    Accepts the same inputs as ``result_to_HAD`` and shares its parser, so
    the two encodings cannot disagree.

    Raises:
        ValueError: if the string cannot be parsed as two scores.
    """
    return _result_sign(result)
# Derive both outcome encodings from the raw result strings
result_strings = raw_data["result"]
raw_data["result_HAD"] = result_strings.apply(result_to_HAD)
raw_data["result_code"] = result_strings.apply(results_to_code)

In [49]:
# Spot-check the conversion: display the raw result strings together with the
# derived H/A/D labels (the cell output is a tuple of the two Series).
raw_data['result'], raw_data['result_HAD']

(0           1-0
 1           0-1
 2       1/2-1/2
 3           0-1
 4       1/2-1/2
          ...   
 4619    1/2-1/2
 4620        0-1
 4621    1/2-1/2
 4622        1-0
 4623    1/2-1/2
 Name: result, Length: 4624, dtype: object,
 0       H
 1       A
 2       D
 3       A
 4       D
        ..
 4619    D
 4620    A
 4621    D
 4622    H
 4623    D
 Name: result_HAD, Length: 4624, dtype: object)

In [50]:
# Discard both Elo rating columns.
raw_data = raw_data.drop(labels=["white_elo", "black_elo"], axis="columns")

In [51]:
# Display the column names that remain at this point.
raw_data.columns

Index(['date', 'home_team', 'away_team', 'result', 'result_HAD',
       'result_code'],
      dtype='object')

In [52]:
# Convert the "date" column to pandas datetimes, overwriting the column.
raw_data["date"] = pd.to_datetime(raw_data["date"])

In [53]:
# Preview the first rows of the table (head() defaults to five).
raw_data.head()

Unnamed: 0,date,home_team,away_team,result,result_HAD,result_code
0,2020-02-21,Duda,Vidit,1-0,H,1
1,2020-02-21,Navara,Harikrishna,0-1,A,-1
2,2020-02-20,Vitiugov,Duda,1/2-1/2,D,0
3,2020-02-20,Vidit,Navara,0-1,A,-1
4,2020-02-20,Vitiugov,Duda,1/2-1/2,D,0


In [54]:
# Order the games chronologically and renumber the index from 0.
# kind="stable" keeps games that share a date in the order they had before
# sorting; the default "quicksort" does not guarantee the relative order of
# ties, which would make the row order of same-day games unpredictable.
raw_data = raw_data.sort_values("date", kind="stable").reset_index(drop=True)

In [55]:
# Add a "season" column holding each game's calendar year as a string, drop the
# raw score string, and let the H/A/D label take over the name "result".
raw_data = (
    raw_data
    .assign(season=raw_data["date"].dt.year.astype(str))
    .drop(columns=["result"])
    .rename(columns={"result_HAD": "result"})
)
raw_data.columns


Index(['date', 'home_team', 'away_team', 'result', 'result_code', 'season'], dtype='object')

In [56]:
# Display the dtype of every column.
raw_data.dtypes

date           datetime64[ns]
home_team              object
away_team              object
result                 object
result_code             int64
season                 object
dtype: object

In [59]:
# Keep only the listed columns, in this order, as the table to be saved.
final_data = raw_data.loc[
    :,
    ["date", "home_team", "away_team", "result", "season", "result_code"],
]

In [60]:
from pathlib import Path

# NOTE: despite its name, repo_root is simply the process's current working
# directory (Path.cwd()); it is the repository root only if the notebook is
# run from there.
repo_root = Path.cwd()

# Create the "cleaned_data" folder under that directory; exist_ok=True makes
# this a no-op when the folder is already present.
output_folder = repo_root / "cleaned_data"
output_folder.mkdir(exist_ok=True)

# Full path of the CSV file to write
output_file = output_folder / "cleaned_data_chess.csv"


# Write the table without the index column, then report the destination path.
final_data.to_csv(output_file, index=False)
print(f"Saved processed data to {output_file}")


Saved processed data to /Users/wissalhaouami/projects/skill_rating_ssm/skill_rating_ssm/data_prep/chess datasets/cleaned_data/cleaned_data_chess.csv
