In [2]:
from chess import pgn
from thefuzz import process
import pandas as pd
from tqdm import tqdm
import re

In [12]:
# Extract one row per game from the PGN database into `data`:
#   [GameIdx, White, WhiteElo, Black, BlackElo, Result, Date, NMoves, Misc]
fields = ['White', 'WhiteElo', 'Black', 'BlackElo', 'Result', 'Date']
# Rating headers are converted to int here because python-chess returns header
# values as strings, and the later per-player aggregation does arithmetic on them.
elo_fields = ('WhiteElo', 'BlackElo')
# Expected number of games in the file; only used as the progress-bar total.
expected_games = 5611947

data = []
with tqdm(total=expected_games) as pbar:
    with open('../data/caissabase.pgn') as database:
        # gidx counts every game read (including skipped ones), so GameIdx is the
        # game's position in the file.
        gidx = 0
        game = pgn.read_game(database)
        while game is not None:
            head = game.headers
            try:
                row = [gidx] + [int(head[x]) if x in elo_fields else head[x] for x in fields]
            except (KeyError, ValueError):
                # Skip games that lack a required header (KeyError) or carry a
                # non-numeric rating (ValueError). Deliberately NOT a bare
                # `except`: that would also swallow KeyboardInterrupt and make
                # this multi-hour loop impossible to stop cleanly.
                row = None
            if row is not None:
                # NOTE(review): ply() // 2 + 1 is one more than the number of
                # completed full moves when the final ply is even; kept as-is
                # because the short-draw threshold downstream was chosen against it.
                row.append(game.end().ply() // 2 + 1)
                # Misc is the Event and Site headers concatenated (missing -> '');
                # it is searched later to filter out Chess960 and simul games.
                row.append(head.get('Event', '') + head.get('Site', ''))
                data.append(row)
            game = pgn.read_game(database)
            pbar.update(1)
            gidx += 1

  6%|███▏                                                  | 332685/5611947 [20:45<5:38:46, 259.72it/s]ERROR:chess.pgn:error during pgn parsing
Traceback (most recent call last):
  File "/home/tudor/.local/share/virtualenvs/chessdraws-_6Ch22Bv/lib/python3.10/site-packages/chess/__init__.py", line 2959, in parse_san
    return next(move for move in self.generate_castling_moves() if self.is_queenside_castling(move))
StopIteration

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/tudor/.local/share/virtualenvs/chessdraws-_6Ch22Bv/lib/python3.10/site-packages/chess/pgn.py", line 1667, in read_game
    move = visitor.parse_san(board_stack[-1], token)
  File "/home/tudor/.local/share/virtualenvs/chessdraws-_6Ch22Bv/lib/python3.10/site-packages/chess/pgn.py", line 1045, in parse_san
    return board.parse_san(san)
  File "/home/tudor/.local/share/virtualenvs/chessdraws-_6Ch22Bv/lib/python3.10/site-packages/chess/__init__.py"

 78%|████████████████████████████████████████           | 4404701/5611947 [4:18:38<1:15:33, 266.30it/s]ERROR:chess.pgn:error during pgn parsing
Traceback (most recent call last):
  File "/home/tudor/.local/share/virtualenvs/chessdraws-_6Ch22Bv/lib/python3.10/site-packages/chess/__init__.py", line 2957, in parse_san
    return next(move for move in self.generate_castling_moves() if self.is_kingside_castling(move))
StopIteration

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/tudor/.local/share/virtualenvs/chessdraws-_6Ch22Bv/lib/python3.10/site-packages/chess/pgn.py", line 1667, in read_game
    move = visitor.parse_san(board_stack[-1], token)
  File "/home/tudor/.local/share/virtualenvs/chessdraws-_6Ch22Bv/lib/python3.10/site-packages/chess/pgn.py", line 1045, in parse_san
    return board.parse_san(san)
  File "/home/tudor/.local/share/virtualenvs/chessdraws-_6Ch22Bv/lib/python3.10/site-packages/chess/__init__.py",

In [13]:
# Column order mirrors how each row was assembled: the game index, the PGN
# header fields, then the two derived values.
game_columns = ['GameIdx', *fields, 'NMoves', 'Misc']
df = pd.DataFrame(data=data, columns=game_columns)

In [14]:
# Build `name_dict`: every player-name spelling -> one canonical spelling.
#
# Names are sorted so that variant spellings tend to sit close together; each
# not-yet-assigned name becomes a canonical name and absorbs sufficiently
# similar, still-unassigned names from a window around it.
#
# Fixes relative to the previous version:
#   * a spelling already merged into an earlier canonical name is no longer
#     reset to map to itself when the loop reaches it;
#   * an already-assigned spelling is never re-pointed to a later name, so
#     groups stay consistent (no A -> B -> C chains);
#   * fuzzy-match scores are checked instead of blindly merging the top matches;
#   * the window is symmetric and can reach the last name in the list.

# Number of sorted neighbours inspected on each side of the current name.
NAME_WINDOW = 50
# Minimum thefuzz score (0-100) for two spellings to be treated as one player.
# NOTE(review): 90 is a conservative starting point -- tune against known aliases.
NAME_MIN_SCORE = 90

all_names = sorted(set(pd.concat([df['White'], df['Black']])))
names = set(all_names)  # spellings that have not been assigned a canonical name yet
name_dict = dict()

for i, name in enumerate(tqdm(all_names)):
    if name not in names:
        # Already merged into an earlier canonical spelling; keep that mapping.
        continue
    name_dict[name] = name
    names.remove(name)

    s = max(0, i - NAME_WINDOW)
    e = min(len(all_names), i + NAME_WINDOW + 1)  # slice end is exclusive
    # process.extract yields (candidate, score) pairs for its best matches
    # (library default limit applies).
    for n, score in process.extract(name, all_names[s:e]):
        if score >= NAME_MIN_SCORE and n in names:
            name_dict[n] = name
            names.remove(n)

100%|█████████████████████| 137774/137774 [03:49<00:00, 600.10it/s]


In [15]:
# Replace every spelling with its canonical player name (raises KeyError on an
# unknown name rather than silently producing NaN).
for color in ('White', 'Black'):
    df[color] = df[color].map(name_dict.__getitem__)

# GameIdx is unique for every row, so it must be left out of the comparison --
# otherwise no two rows are ever equal and drop_duplicates() removes nothing.
dedup_columns = [c for c in df.columns if c != 'GameIdx']
df = df.drop_duplicates(subset=dedup_columns)

# Drop Chess960 and simultaneous-exhibition games (matched in Event + Site).
df = df[~df['Misc'].str.contains('960|simul|Simul')]
# Drop games whose Result contains '*'. Matched literally (regex=False) instead
# of via the invalid '\*' escape sequence in a non-raw string.
df = df[~df['Result'].str.contains('*', regex=False)]

In [16]:
# Persist the cleaned game table; the row index is written as the first CSV
# column (pandas default), which is why it is read back with index_col=0.
games_csv_path = '../data/games.csv'
df.to_csv(games_csv_path)

In [17]:
# Running per-player totals, keyed by player name. Each value is a 7-slot list:
#   [0] games played
#   [1] sum of the player's own Elo
#   [2] sum of |WhiteElo - BlackElo|
#   [3] draws
#   [4] draws with NMoves <= 30
#   [5] draws with NMoves <= 30 in which the player was White
#   [6] sum of NMoves over drawn games
features = dict()

def parse_game(row):
    """Fold one game into the running totals of both of its players.

    `row` must support item access for 'White', 'Black', 'WhiteElo',
    'BlackElo', 'Result' and 'NMoves' (a DataFrame row or a plain dict).
    Updates the module-level `features` dict in place and returns None.
    """
    for side in ('White', 'Black'):
        # Fetch the player's totals, creating a zeroed entry on first sight.
        totals = features.setdefault(row[side], [0] * 7)

        totals[0] += 1
        totals[1] += row[side + 'Elo']
        totals[2] += abs(row['WhiteElo'] - row['BlackElo'])

        # Everything below only concerns drawn games.
        if row['Result'] != '1/2-1/2':
            continue

        totals[3] += 1
        if row['NMoves'] <= 30:
            totals[4] += 1
            if side == 'White':
                totals[5] += 1
        totals[6] += row['NMoves']
        
# DataFrame.progress_apply only exists once tqdm's pandas integration has been
# registered; without this call a fresh kernel raises AttributeError here.
tqdm.pandas()
# parse_game is called for its side effect on `features`; the returned Series
# (all None) is discarded.
_ = df.progress_apply(parse_game, axis=1)

In [None]:
# Turn the per-player running totals into one row of per-player averages.
feature_columns = ['N', 'eloav', 'elodiff', 'd', 'd_short', 'd_short_W', 'd_length']
feature_df = pd.DataFrame.from_dict(features, orient='index', columns=feature_columns)

# Average draw length uses the raw draw count, so it has to be computed before
# 'd' itself is normalised below.
feature_df['d_length'] = feature_df['d_length'].div(feature_df['d'])

# Every column between 'N' and 'd_length' becomes a per-game average/rate.
per_game_columns = feature_columns[1:-1]
feature_df[per_game_columns] = feature_df[per_game_columns].div(feature_df['N'], axis=0)

# Discard players with fewer than 10 games or without a single draw.
too_few_games = feature_df['N'] < 10
never_drew = feature_df['d'] == 0
feature_df = feature_df[~too_few_games & ~never_drew]

feature_df.to_csv('../data/features.csv')
feature_df

In [25]:
# Reload the persisted tables so this cell can run without the earlier
# (multi-hour) cells.
feature_df = pd.read_csv('../data/features.csv', index_col=0)
df = pd.read_csv('../data/games.csv', index_col=0)

# Player name -> that player's row of feature_df as a NumPy array.
names_to_features = {player: vector for player, vector in zip(feature_df.index, feature_df.values)}
names = names_to_features.keys()

# Keep only games in which both players have a feature vector.
both_known = df['White'].isin(names) & df['Black'].isin(names)
df = df[both_known]
df.to_csv('../data/games_clean.csv', index=False)


def _feature_rows(players):
    """Stack the feature vectors of `players`, one row per game, in game order."""
    return pd.DataFrame([names_to_features[player] for player in players])


# Row i of each table belongs to row i of games_clean.csv; columns are
# positional (0..6), in feature_df's column order.
w_f = _feature_rows(df['White'])
b_f = _feature_rows(df['Black'])
w_f.to_csv('../data/white_features.csv', index=False)
b_f.to_csv('../data/black_features.csv', index=False)