In [19]:
import time
import os
import chess
import numpy as np
import pandas as pd
import re
import sklearn as sk
import matplotlib.pyplot as plt
import pickle

# Functions for loading and extracting features

In [20]:
def data_loader(path):
    
    a = time.monotonic()
    
    # Opens the file
    with open(path) as f:
        data = f.read()
    
    # Converts the data into string
    data = str(data)
    
    # Divides the file into games
    raw_games = data.split('[Event')
    del raw_games[0]
    del data
    
    # Gets rid of blitz, correspondence, variants, games without elo scores, etc
    cleaned_matches = initial_cleanup(raw_games)
    del raw_games
    
    # Returns clean text move strings, pairs of elo, match result (who won)
    clean, elo, results = feature_extraction(cleaned_matches)
    del cleaned_matches
    
    b = time.monotonic()
    print(f'Runtime: {b-a}')
    
    return clean, elo, results

In [21]:
def initial_cleanup(game_list):
    
    wo_bullet = []
    game_list_len = len(game_list)
    #a = time.monotonic()
    
    print(f'The initial dataset has length {game_list_len}')
    
    # Gets rid of games with Bullet time control, empty games, and short games
    for game in game_list:
        if game.count('Bullet') > 0:
            del game
            continue
        if game.count('Variant') > 1:
            del game
            continue
        elif game.count('Result') == 0:
            del game
            continue
        elif game.count('eval') > 0:
            del game
            continue
        elif game.count('6.') == 0:
            del game
            continue
        elif game.count('[WhiteElo "?"]') > 0:
            del game
            continue
        elif game.count('[BlackElo "?"]') > 0:
            del game
            continue
        elif game.count('Correspondence') > 0:
            del game
            continue
        else:
            wo_bullet.append(game)
    
    #b = time.monotonic()
    
    print(f'The length after cleanup is of {len(wo_bullet)}')
    print(f'We have pruned {game_list_len - len(wo_bullet)} games!')
    #print(f'Runtime: {b-a}')
    
    return wo_bullet

def feature_extraction(game_list):
    
    elo = []
    results = []
    termination_type = []
    game_features = []
    game_moves = []
    
    for game in game_list:
        
        features = game[:game.find("\n\n1.")].strip()
        game_features.append(features)
        final_game = game[game.find("\n\n1."):]
        
        if final_game.find("1-0") > 0:
            final_game = final_game[final_game.find("1."):final_game.find("1-0")]
        
        elif final_game.find("0-1") > 0:
            final_game = final_game[final_game.find("1."):final_game.find("0-1")]
        
        elif final_game.find("1/2") > 0:
            final_game = final_game[final_game.find("1."):final_game.find("1/2")]
        
        game_moves.append(final_game.strip())
    
    # Gets the game result
    for game in game_features:
        index = game.find('Result')+8
        victory = game[index:index+2]
        if victory == "1-":
            results.append("White Wins")
        elif victory == "0-":
            results.append("Black Wins")
        elif victory == "1/":
            results.append("Draw")
        else:
            indexed_list = enumerate(game_features)
            
            
    
    # Gets the ELO for each player
    for game in game_features:
        
        index1 = game.find("[WhiteElo")
        index2 = game.find("[WhiteRatingDiff")
        
        game = game[index1:index2]
        
        score = re.findall('\d+', game)[:2]
        White_Elo = float(score[0])
        Black_Elo = float(score[1])
        elo_list = [White_Elo, Black_Elo]
        elo.append(elo_list)
        
    # Gets the Termination type
    for game in game_features:
        
        index = game.find("[Termination")
        game = game[index:index+30]
        
        
        
    
    # Transforms lists into np.arrays
    
    game_moves = np.array(game_moves)
    elo = np.array(elo)
    results = np.array(results)
    
    print(f"Clean list length: {len(game_moves)}")
    print(f"Elo list length: {len(elo)}")
    print(f"Results list length: {len(results)}")
    print(f"The games at indexes {faulty} are faulty.")
    return game_moves, elo, results

In [30]:
def feature_extraction(game_list):
    
    faulty = []
    elo = []
    results = []
    termination_type = []
    game_features = []
    game_moves = []
    
    for game in game_list:
        
        features = game[:game.find("\n\n1.")].strip()
        game_features.append(features)
        final_game = game[game.find("\n\n1."):]
        
        if final_game.find("1-0") > 0:
            final_game = final_game[final_game.find("1."):final_game.find("1-0")]
        
        elif final_game.find("0-1") > 0:
            final_game = final_game[final_game.find("1."):final_game.find("0-1")]
        
        elif final_game.find("1/2") > 0:
            final_game = final_game[final_game.find("1."):final_game.find("1/2")]
        
        game_moves.append(final_game.strip())
    
    # Gets the game result
    for i, game in enumerate(game_features):
        
        index = game.find('Result')+8
        victory = game[index:index+2]
        if victory == "1-":
            results.append("White Wins")
        elif victory == "0-":
            results.append("Black Wins")
        elif victory == "1/":
            results.append("Draw")
        else:
            faulty.append(i)
                  
    
    # Gets the ELO for each player
    for game in game_features:
        
        index1 = game.find("[WhiteElo")
        index2 = game.find("[WhiteRatingDiff")
        
        game = game[index1:index2]
        
        score = re.findall('\d+', game)[:2]
        White_Elo = float(score[0])
        Black_Elo = float(score[1])
        elo_list = [White_Elo, Black_Elo]
        elo.append(elo_list)
        
    # Gets the Termination type
    for game in game_features:
        
        index = game.find("[Termination")
        game = game[index:index+30]
        
    
    # Transforms lists into np.arrays
    
    game_moves = np.array(game_moves)
    elo = np.array(elo)
    results = np.array(results)
    
    print(f"Clean list length: {len(game_moves)}")
    print(f"Elo list length: {len(elo)}")
    print(f"Results list length: {len(results)}")
    print(f"The games at indexes {faulty} are faulty.")
    return game_moves, elo, results

# Utility functions

In [23]:
def get_elo_mean(elo_list):
    elo_mean = []
    for elo_pair in elo_list:
        mean = np.mean(elo_pair)
        elo_mean.append(mean)
        
    elo_mean = np.array(elo_mean)
    
    return elo_mean

In [24]:
def backlash_remover(game_list):
    
    clean_games = []
    
    for game in game_list:
        game = game.replace("\n", " ")
        clean_games.append(game)
    
    return clean_games             

In [25]:
def get_tokens(game_list):
    
    split_games = []
    
    for game in game_list:
        split_game = game.split()
        split_game = set(split_game)
        for string in split_game:
            split_games.append(string)
        
    split_games = set(split_games)
    split_games = list(split_games)
    split_games = sorted(split_games)
    
    return split_games

In [26]:
# write list to binary file
def write_list(a_list, filename):
    # store list in binary file so 'wb' mode
    filename = filename
    with open(filename, 'wb') as fp:
        pickle.dump(a_list, fp)
        print('Done writing list into a binary file')

In [27]:
# Read list to memory
def read_list(path):
    # for reading also binary mode is important
    with open(path, 'rb') as fp:
        n_list = pickle.load(fp)
        return n_list

# Loading data and cleaning it up

### Dataset 1

In [11]:
path = "/Users/victorlacerda/Jupyter/Guess the ELO/Input/2014_06.pgn"

In [12]:
clean1, elo1, results1 = data_loader(path)

The initial dataset has length 961868
The length after cleanup is of 533397
We have pruned 428471 games!
Clean list length: 533397
Elo list length: 533397
Results list length: 533397
The games at indexes [] are faulty.
Runtime: 9.947896291996585


### Dataset 2

In [13]:
path = "/Users/victorlacerda/Jupyter/Guess the ELO/Input/2014_12.pgn"

In [14]:
clean2, elo2, results2 = data_loader(path)

The initial dataset has length 1350176
The length after cleanup is of 756942
We have pruned 593234 games!
Clean list length: 756942
Elo list length: 756942
Results list length: 756942
The games at indexes [] are faulty.
Runtime: 14.416008083993802


### Dataset 3

path = "/Users/victorlacerda/Jupyter/Guess the ELO/Input/2015_11.pgn"

clean3, elo3, results3 = data_loader(path)

### Dataset 4

In [31]:
path = "/Users/victorlacerda/Jupyter/Guess the ELO/Input/elite_2019_06.pgn"

In [32]:
clean4, elo4, results4 = data_loader(path)

clean4 = backlash_remover(clean4)

The initial dataset has length 76809
The length after cleanup is of 76793
We have pruned 16 games!
Clean list length: 76793
Elo list length: 76793
Results list length: 76788
The games at indexes [51030, 51040, 51044, 71922, 71923] are faulty.
Runtime: 1.3607132920005824


# Concatenating the output into single files

In [16]:
final_clean = np.concatenate((clean2, clean4), axis=0)

In [17]:
final_elo = np.concatenate((elo2, elo4), axis=0)

In [18]:
final_results = np.concatenate((results2, results4), axis=0)

# Saving the cleaned up files

In [19]:
write_list(final_clean,'clean24')

Done writing list into a binary file


In [20]:
write_list(final_elo,'elo24')

Done writing list into a binary file


In [21]:
write_list(final_results, 'results24')

Done writing list into a binary file


# Getting chess tokens

a = time.monotonic()
tokens1 = get_tokens(clean1)
tokens2 = get_tokens(clean2)
tokens3 = get_tokens(clean3)
tokens4 = get_tokens(clean4)
b = time.monotonic()
print(b-a)

token_list = np.concatenate((tokens1, tokens2, tokens3, tokens4),axis=0)