In [1]:
from chessdotcom import get_player_profile, get_player_stats, get_player_game_archives
import requests
import pprint
import pandas as pd
from parsita import *
from parsita.util import constant
import json
import chess.pgn
import io
import ftplib
import os
import logging

# https://pypi.org/project/pgn2data/
from converter.pgn_data import PGNData

In [2]:
"""
chess_fact table
player_id
game_id
move_id

dimention tables

Player_table
Player id - primary key
player username
class
player rating
rating updated time


Game_table
game id - primary key
player id - foreign key
urls
time_control
Date
EndDate
StartTime
EndTime
Timezone
UTCDate
UTCTime
initial setup
time class
rules
white_rating
white_username
black_rating
black_username
player_rating
event
Site
Round
Result
CurrentPosition
ECO
ECOUrl
Termination
pgn

Moves_table
game_id
move_id
move_number
white_move
black_move
white_time
black_time
"""

'\nchess_fact table\nplayer_id\ngame_id\nmove_id\n\ndimention tables\n\nPlayer_table\nPlayer id - primary key\nplayer username\nclass\nplayer rating\nrating updated time\n\n\nGame_table\ngame id - primary key\nplayer id - foreign key\nurls\ntime_control\nDate\nEndDate\nStartTime\nEndTime\nTimezone\nUTCDate\nUTCTime\ninitial setup\ntime class\nrules\nwhite_rating\nwhite_username\nblack_rating\nblack_username\nplayer_rating\nevent\nSite\nRound\nResult\nCurrentPosition\nECO\nECOUrl\nTermination\npgn\n\nMoves_table\ngame_id\nmove_id\nmove_number\nwhite_move\nblack_move\nwhite_time\nblack_time\n'

In [3]:
# Fetch data from chess.com API

# make json data easier to read
pp = pprint.PrettyPrinter(width=41, compact=True)

def get_user_archives(username, months):
    """
    get archive monthly files of specific chess.com player
    input:
    username - username of the chess.com player
    months - target months that we want to get the archives
    
    output:
    target_month - files of archives according to months parameter
    """
    archives = get_player_game_archives(username).json['archives']
    target_month = []
    for archive in archives:
        if archive[-7:] in months:
            target_month.append(archive)
    return target_month

#files = get_user_archives("AGcuber19",["2023/02"])


In [4]:
def get_archive_games(filename):
    """
    return games in one archive file
    
    input:
    filename - filename that contains game urls
    
    output: 
    """
    games = requests.get(filename).json()['games']
    return games

#games = get_archive_games(files[-1])

In [5]:
def game_df(username,files):
    """
    import data from archive files and turn relevant data parameters into data frames
    
    input:
    username - username of the player
    files - file archived of the player
    
    output:
    a dataframe contains wanted columns
                        'username',
                         'urls', 
                         'time_control',
                        'end_time',
                        'uuid',
                        'initial_setup',
                        'time_class',
                        'rules',
                        'white_rating',
                        'white_username',
                        'black_rating',
                        'black_username',
                        'Event',
                        'Site',
                        'Date',
                        'Round',
                        'Result',
                        'CurrentPosition',
                        'ECO',
                        'ECOUrl',
                        'EndDate',
                        'EndTime',
                        'StartTime',
                        'Termination',
                        'Timezone',
                        'UTCDate',
                        'UTCTime',                        
                        
    """
    print("Player " + username + " is processing...")
    usernames = []
    urls = []
    time_control = [] 
    end_time = [] 
    uuid = []
    initial_setup = [] 
    time_class = [] 
    rules = [] 
    white_rating = [] 
    white_username = [] 
    black_rating = [] 
    black_username = []
    pgn = []
    event = []
    Site = []
    Date = []
    Round = []
    Result = []
    CurrentPosition = []
    ECO = []
    ECOUrl = []
    EndDate = []
    EndTime = []
    StartTime = []
    Termination = []
    Timezone = []
    UTCDate = []
    UTCTime = []
    
    for file in files:
        print(file + " " + " is processing...")
        games = get_archive_games(file)
        for game in games:
            try:
                usernames.append(username)
                urls.append(game.get('url',None)) # game.get('url', None)
                time_control.append(game.get('time_control',None))
                end_time.append(game.get('end_time',None))
                uuid.append(game.get('uuid',None))
                initial_setup.append(game.get('initial_setup',None))
                time_class.append(game.get('time_class',None))
                rules.append(game.get('rules',None))
                white_rating.append(game.get('white',None)['rating'])
                white_username.append(game.get('white',None)['username'])
                black_rating.append(game.get('black',None)['rating'])
                black_username.append(game.get('black',None)['username'])
                pgn.append(game.get('pgn',None))
                
                pgn_written = io.StringIO(game['pgn'])
                game_data = chess.pgn.read_game(pgn_written)
                event.append(game_data.headers['Event'])
                Site.append(game_data.headers['Site'])
                Date.append(game_data.headers['Date'])
                Round.append(game_data.headers['Round'])
                Result.append(game_data.headers['Result'])
                CurrentPosition.append(game_data.headers['CurrentPosition'])
                ECO.append(game_data.headers['ECO'])
                ECOUrl.append(game_data.headers['ECOUrl'])
                EndDate.append(game_data.headers['EndDate'])
                EndTime.append(game_data.headers['EndTime'])
                StartTime.append(game_data.headers['StartTime'])
                Termination.append(game_data.headers['Termination'])
                Timezone.append(game_data.headers['Timezone'])
                UTCDate.append(game_data.headers['UTCDate'])
                UTCTime.append(game_data.headers['UTCTime'])
            except Exception as e:
                print(e)
                print(type(e))
                print(str(e))
                print("add " + str(game.get(str(e),None)) + " into row")
                #e.append(None)
                #print(game['url'])
                print(game['uuid'])
    
    print("data fetch work is done.")
    
    df = pd.DataFrame(list(zip(usernames,
                               urls, 
                           time_control,
                          end_time,
                           uuid,
                           initial_setup,
                           time_class,
                           rules,
                           white_rating,
                           white_username,
                           black_rating,
                           black_username,
                           pgn,
                           event,
                          Site,
                          Date,
                          Round,
                          Result,
                          CurrentPosition,
                          ECO,
                          ECOUrl,
                          EndDate,
                          EndTime,
                          StartTime,
                          Termination,
                          Timezone,
                          UTCDate,
                          UTCTime)),
               columns =['username',
                         'urls', 
                         'time_control',
                        'end_time',
                        'uuid',
                        'initial_setup',
                        'time_class',
                        'rules',
                        'white_rating',
                        'white_username',
                        'black_rating',
                        'black_username',
                         'pgn',
                         'event',
                         'Site',
                          'Date',
                          'Round',
                          'Result',
                          'CurrentPosition',
                          'ECO',
                          'ECOUrl',
                          'EndDate',
                          'EndTime',
                          'StartTime',
                          'Termination',
                          'Timezone',
                          'UTCDate',
                          'UTCTime'
                        ])
    print("dataframe importing is done.")
    return df
    
#data = game_df('AGcuber19',files)


In [6]:
# students from Tianmin's classes - BO, BP, AN
tianmin_players = {
    "Teacher" : ['tianminlyu'],
    "BO" : ['AGcuber19',
            'TLPAWN',
            'xiaoanwu',
            'EmmaXLi',
            'akfunchess66',
            'Marsboom', 
            'Claraqiu',
            'Ravenclawfairy', 
            'Zora_zhu',
            'BurleyWalrus'],
    "BP" : ['taionemm',
            'augustinewz',
            'oscarzhang818',
            'yaohengli',
            'Wallacewang1214',
            'SophiaZ2022',
            'AliceCLi',
            'yumitang',
            'james2945',
            'Oinkoinkw'],
    "AN" : ['Cathye1',
            'lunathekitsune',
            'ArthurRocket',
            'vivianwwww20',
            'ChloeWang16',
            'Tyzalex',
            'ZhichengW',
            'Haochen1123',
            'jaydenlan0118',
            'ImRacoonie']
}

In [7]:
df_players = []
error_players = []

for classes in tianmin_players.keys():
    for player in tianmin_players[classes]:
        try:
            files = get_user_archives(player,["2023/02","2023/01","2022/12"]) # target months
            df = game_df(player, files)
            df['class'] = classes
            df_players.append(df)
        except:
            print(Exception)
            print("This player account " + player + " does not exist")
            error_players.append(player)

Player tianminlyu is processing...
https://api.chess.com/pub/player/tianminlyu/games/2022/12  is processing...
'ECO'
<class 'KeyError'>
'ECO'
add None into row
46a9a36e-7e1c-11ed-94ac-78ac4409ff3c
https://api.chess.com/pub/player/tianminlyu/games/2023/01  is processing...
'ECO'
<class 'KeyError'>
'ECO'
add None into row
8cffaa50-939c-11ed-90fb-78ac4409ff3c
https://api.chess.com/pub/player/tianminlyu/games/2023/02  is processing...
data fetch work is done.
dataframe importing is done.
Player AGcuber19 is processing...
https://api.chess.com/pub/player/agcuber19/games/2022/12  is processing...
'ECO'
<class 'KeyError'>
'ECO'
add None into row
3fc79932-72a1-11ed-a69d-78ac4409ff3c
'ECO'
<class 'KeyError'>
'ECO'
add None into row
5115878f-74c8-11ed-a69d-78ac4409ff3c
'ECO'
<class 'KeyError'>
'ECO'
add None into row
7d0bb414-7591-11ed-950c-78ac4409ff3c
https://api.chess.com/pub/player/agcuber19/games/2023/01  is processing...
'ECO'
<class 'KeyError'>
'ECO'
add None into row
c18773db-941f-11ed-9

data fetch work is done.
dataframe importing is done.
Player ArthurRocket is processing...
https://api.chess.com/pub/player/arthurrocket/games/2022/12  is processing...
https://api.chess.com/pub/player/arthurrocket/games/2023/02  is processing...
data fetch work is done.
dataframe importing is done.
Player vivianwwww20 is processing...
https://api.chess.com/pub/player/vivianwwww20/games/2023/01  is processing...
https://api.chess.com/pub/player/vivianwwww20/games/2023/02  is processing...
data fetch work is done.
dataframe importing is done.
Player ChloeWang16 is processing...
https://api.chess.com/pub/player/chloewang16/games/2023/01  is processing...
https://api.chess.com/pub/player/chloewang16/games/2023/02  is processing...
data fetch work is done.
dataframe importing is done.
Player Tyzalex is processing...
https://api.chess.com/pub/player/tyzalex/games/2023/01  is processing...
https://api.chess.com/pub/player/tyzalex/games/2023/02  is processing...
data fetch work is done.
dataf

In [8]:
print("% of error players is ...")
print(len(error_players) * 100/ len(tianmin_players))

% of error players is ...
25.0


In [9]:
players_df = pd.concat(df_players)


In [10]:
def player_rating(row):  
    if row['username'].lower() == row['white_username'].lower():
        return row['white_rating']
    else:
        return row['black_rating']
    
players_df['player_rating'] = players_df.apply(lambda row: player_rating(row), axis=1)


In [11]:
# generate moves table from pgn column

def moves_split(pgn):
    move_number = []
    white_move = []
    black_move = []
    white_time = []
    black_time = []
    for note in range(len(pgn)):
        if note % 8 == 0:
            move_number.append(pgn[note].split(".")[0])
        elif note % 8 == 1:
            white_move.append(pgn[note])
        elif note % 8 == 2:
            pass
        elif note % 8== 3:
            white_time.append(pgn[note].split("]}")[0])
        elif note %8 == 4:
            pass
        elif note % 8 == 5:
            black_move.append(pgn[note])
        elif note % 8 == 6:
            pass
        elif note % 8 == 7:
            black_time.append(pgn[note].split("]}")[0])
        
    if len(black_move) < len(white_move):
        black_move.append("NaN")
        black_time.append("NaN")

    return move_number, white_move, black_move, white_time, black_time



In [12]:
def create_moves_df(game):
    moves_df = []
    for i in range(game.shape[0]):
        try:
            pgn = game['pgn'].iloc[i].split("\n")[-2].split(" ")[:-1]
        except:
            #pgn = "."
            move_number = [None]
            white_move = [None]
            black_move = [None]
            white_time = [None]
            black_time = [None]
            #print(i)
            #print(game['pgn'].iloc[i])
            #print("---")
            #print(game['uuid'].iloc[i])
            #print([game['uuid'].iloc[i]] * 1)
        if len(pgn) != 0:
            moves = moves_split(pgn)
            move_number = moves[0]
            white_move = moves[1]
            black_move = moves[2]
            white_time = moves[3]
            black_time = moves[4]
        else:
            move_number = [None]
            white_move = [None]
            black_move = [None]
            white_time = [None]
            black_time = [None]
        #uuid = [game['uuid'].iloc[i]] * len(move_number)
        uuid = [game['uuid'].iloc[i]] * len(move_number)
        #if len(uuid) == 1:
        #    print(game['uuid'].iloc[i])
        df = pd.DataFrame(list(zip(uuid,
                              move_number,
                              white_move,
                              black_move,
                              white_time,
                              black_time)),
               columns =[     'uuid',
                              'move_number',
                              'white_move',
                              'black_move',
                              'white_time',
                              'black_time'
                        ])
        moves_df.append(df)
    moves_df = pd.concat(moves_df)
    return moves_df


In [13]:
players_df['timestamp'] = pd.to_datetime(players_df['end_time'],unit='s')
players_df.date = pd.to_datetime(players_df.timestamp)


  players_df.date = pd.to_datetime(players_df.timestamp)


In [14]:
games = players_df[['timestamp', 'username', 'urls', 'time_control', 'end_time', 'uuid',
       'initial_setup', 'time_class', 'rules', 'pgn', 'event',
       'Site', 'Date', 'Round', 'Result', 'CurrentPosition', 'ECO', 'ECOUrl',
       'EndDate', 'EndTime', 'StartTime', 'Termination', 'Timezone', 'UTCDate',
       'UTCTime']]



In [15]:
games['username'].unique()

array(['tianminlyu', 'AGcuber19', 'TLPAWN', 'xiaoanwu', 'EmmaXLi',
       'akfunchess66', 'Marsboom', 'Claraqiu', 'Ravenclawfairy',
       'Zora_zhu', 'BurleyWalrus', 'taionemm', 'augustinewz',
       'oscarzhang818', 'yaohengli', 'Wallacewang1214', 'SophiaZ2022',
       'yumitang', 'james2945', 'Oinkoinkw', 'Cathye1', 'lunathekitsune',
       'ArthurRocket', 'vivianwwww20', 'ChloeWang16', 'Tyzalex',
       'ZhichengW', 'Haochen1123', 'jaydenlan0118', 'ImRacoonie'],
      dtype=object)

In [16]:
moves = create_moves_df(players_df)

In [17]:
moves.head()


Unnamed: 0,uuid,move_number,white_move,black_move,white_time,black_time
0,0c860a8f-7785-11ed-a76d-78ac4409ff3c,1,d4,Nf6,0:10:00,0:09:57.2
1,0c860a8f-7785-11ed-a76d-78ac4409ff3c,2,c4,g6,0:09:59,0:09:53.9
2,0c860a8f-7785-11ed-a76d-78ac4409ff3c,3,f3,Bg7,0:09:58.1,0:09:51.7
3,0c860a8f-7785-11ed-a76d-78ac4409ff3c,4,e4,O-O,0:09:57.4,0:09:50.1
4,0c860a8f-7785-11ed-a76d-78ac4409ff3c,5,Nc3,d6,0:09:56.5,0:09:48.4


In [18]:
moves.shape

(35614, 6)

In [19]:
players = players_df[['timestamp', 'class','username', 'uuid', 'player_rating','white_rating',
       'white_username', 'black_rating', 'black_username']]

In [20]:
players.to_csv("player_tianmin_class.csv")
moves.to_csv("move_tianmin_class.csv")
games.to_csv("games_tianmin_class.csv")


In [21]:
# https://docs.google.com/spreadsheets/d/1QJjp2wG_k4XPW6Z8hbvArKkCQH3fI5O7RmPH2Ongto4/edit#gid=0

"""
chess_fact table
player_id
game_id
move_id
"""



'\nchess_fact table\nplayer_id\ngame_id\nmove_id\n'

In [22]:
"""
Player_table
Player id - primary key
player username
class
player rating
rating updated time
"""



'\nPlayer_table\nPlayer id - primary key\nplayer username\nclass\nplayer rating\nrating updated time\n'

In [23]:
"""
Game_table
game id - primary key
player id - foreign key?
urls
time_control
Date
EndDate
StartTime
EndTime
Timezone
UTCDate
UTCTime
initial setup
time class
rules
white_rating
white_username
black_rating
black_username
player_rating
event
Site
Round
Result
CurrentPosition
ECO
ECOUrl
Termination
pgn
"""

'\nGame_table\ngame id - primary key\nplayer id - foreign key?\nurls\ntime_control\nDate\nEndDate\nStartTime\nEndTime\nTimezone\nUTCDate\nUTCTime\ninitial setup\ntime class\nrules\nwhite_rating\nwhite_username\nblack_rating\nblack_username\nplayer_rating\nevent\nSite\nRound\nResult\nCurrentPosition\nECO\nECOUrl\nTermination\npgn\n'

In [24]:
"""
Moves_table
game_id
move_id
move_number
white_move
black_move
white_time
black_time
"""

'\nMoves_table\ngame_id\nmove_id\nmove_number\nwhite_move\nblack_move\nwhite_time\nblack_time\n'