# First notebook for experimentation. 
Portions of this notebook have been cleaned up, and the functions have been moved to utils.py for readability. 

In [1]:
import pandas as pd 
import torch 
from torch.utils.data import Dataset
import numpy as np
import torch.nn as nn
import importlib
import utils
importlib.reload(utils)
from utils import *
import models
importlib.reload(models)
from models import *
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Pick the torch device once for the notebook: CUDA when torch reports it
# as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:

def process_raw_csv(filepath, elo_min = 2100, elo_max = 2199):
    """Reduce a per-move chess CSV to one aggregated row per game.

    The input is expected to be a monthly CSV from the Maia dataset
    (http://csslab.cs.toronto.edu/datasets/#monthly_chess_csv); the original
    author noted 151,072,060 rows in the file they worked with.

    A row (one move) is kept only when all of the following hold:
      * 'time_control' is neither 'Bullet' nor 'HyperBullet';
      * 'clock' is greater than 45 (this drops individual late moves, not
        whole games);
      * both 'white_elo' and 'black_elo' fall between elo_min and elo_max.

    The surviving rows are grouped by 'game_id' and collapsed per game.

    NOTE(review): `dd` is not imported in the visible part of this notebook;
    presumably it is dask.dataframe made available through
    `from utils import *` -- confirm.

    Args:
        filepath: Path to the CSV file, passed straight to `dd.read_csv`.
        elo_min: Lower bound applied to both players' Elo.
        elo_max: Upper bound applied to both players' Elo.

    Returns:
        The computed result of the per-game aggregation, with the fields
        'moves' (moves joined by ' '), 'white_elo', 'black_elo',
        'white_active' (each taken from the game's first remaining row) and
        'board' (boards joined by '*').
    """
    # Explicit dtypes for the columns the reader is told about up front.
    column_dtypes = {
        'clock': 'float32',
        'cp': 'object',
        'opp_clock': 'float32',
        'opp_clock_percent': 'float32',
    }
    raw = dd.read_csv(filepath, blocksize='64e6', dtype=column_dtypes, low_memory=False)

    # Row-level filters: skip the fastest time controls and moves played
    # with 45 or less on the clock. No limit on game length (num_ply) is
    # applied here.
    not_fast_game = ~raw['time_control'].isin(['Bullet', 'HyperBullet'])
    enough_clock = raw['clock'] > 45
    kept = raw[not_fast_game & enough_clock]

    # Only these columns are needed from here on.
    kept = kept[['game_id','white_elo','black_elo','move','white_active','board']]

    # Both players must be inside the requested Elo band.
    white_in_band = kept['white_elo'].between(elo_min, elo_max)
    black_in_band = kept['black_elo'].between(elo_min, elo_max)
    kept = kept[white_in_band & black_in_band]

    def collapse_game(rows):
        """Fold all remaining rows of one game into a single Series."""
        # pandas is imported locally, as in the original helper.
        import pandas as pd
        return pd.Series({
            'moves': ' '.join(rows['move']),                # all moves, space separated
            'white_elo': rows['white_elo'].iloc[0],         # value from the first row
            'black_elo': rows['black_elo'].iloc[0],         # value from the first row
            'white_active': rows['white_active'].iloc[0],   # value from the first row
            'board': '*'.join(rows['board']),               # all boards, '*' separated
        })

    # Output schema handed to the grouped apply via `meta`.
    result_schema = {
        'moves': 'str',
        'white_elo': 'int',
        'black_elo': 'int',
        'white_active': 'str',
        'board': 'str',
    }
    per_game = kept.groupby('game_id', sort=True).apply(collapse_game, meta=result_schema)

    # Original author's note: this yielded 99,300 games when games longer
    # than 80 half-moves were not filtered out.
    return per_game.compute()

In [9]:
# Rebuild the per-game table from the raw December 2019 Lichess CSV and save
# it to disk. Skip this cell if the processed CSV already exists.
# NOTE(review): the original comment named the existing file as
# haha_longer.csv, while this cell writes 2100-dec.csv; the Elo band used
# here starts at 2000 -- confirm which file name and band are intended.
grouped_df = process_raw_csv(
    filepath='../data/lichess_db_standard_rated_2019-12.csv',
    elo_min=2000,
    elo_max=2199,
)
grouped_df.to_csv(path_or_buf='2100-dec.csv')