In [1]:
from chessdotcom import get_player_profile, get_player_stats, get_player_game_archives
import requests
import pprint
import pandas as pd
from parsita import *
from parsita.util import constant
import json
import chess.pgn
import io
import ftplib
import os
import logging

# https://pypi.org/project/pgn2data/
from converter.pgn_data import PGNData

In [3]:
# Retrieve data through the chess.com public API

# compact, narrow pretty-printer so JSON payloads are easier to read
pp = pprint.PrettyPrinter(compact=True, width=41)

def get_user_archives(username, months):
    """
    Select the monthly archive URLs of a chess.com player.

    input:
    username - username of the chess.com player
    months - collection of "YYYY/MM" strings naming the wanted months

    output:
    list of the archive URLs whose last seven characters are found in
    `months`, in the order the API listed them
    """
    all_archives = get_player_game_archives(username).json['archives']
    # An archive URL ends with its "YYYY/MM" part, i.e. its last seven characters.
    return [url for url in all_archives if url[-7:] in months]

# Archive URLs of the example player for February 2023
files = get_user_archives(username="tianminlyu", months=["2023/02"])
files

['https://api.chess.com/pub/player/tianminlyu/games/2023/02']

In [5]:
def get_archive_games(filename, timeout=30):
    """
    Return the games stored in one monthly archive.

    input:
    filename - URL of the archive file that contains the games
    timeout - seconds to wait for the server before giving up (default 30)

    output:
    the value stored under the 'games' key of the JSON response
    (one entry per game)

    raises:
    requests.HTTPError - the server answered with a 4xx/5xx status
    requests.Timeout - the server did not answer within `timeout` seconds
    KeyError - the JSON response has no 'games' key
    """
    # Without a timeout, requests may block forever on a stalled connection.
    response = requests.get(filename, timeout=timeout)
    # Fail with the real HTTP error instead of a confusing KeyError on 'games'.
    response.raise_for_status()
    return response.json()['games']

# Download the games of the most recent archive selected above
games = get_archive_games(filename=files[-1])
games

[{'url': 'https://www.chess.com/game/live/68981142153',
  'pgn': '[Event "Live Chess"]\n[Site "Chess.com"]\n[Date "2023.02.01"]\n[Round "-"]\n[White "tianminlyu"]\n[Black "kruupy"]\n[Result "1-0"]\n[CurrentPosition "8/5ppk/p1P4p/8/8/2qQK1P1/5P1P/8 b - -"]\n[Timezone "UTC"]\n[ECO "A07"]\n[ECOUrl "https://www.chess.com/openings/Kings-Indian-Attack"]\n[UTCDate "2023.02.01"]\n[UTCTime "09:00:26"]\n[WhiteElo "1957"]\n[BlackElo "1786"]\n[TimeControl "900+10"]\n[Termination "tianminlyu won by resignation"]\n[StartTime "09:00:26"]\n[EndDate "2023.02.01"]\n[EndTime "09:23:36"]\n[Link "https://www.chess.com/game/live/68981142153"]\n\n1. Nf3 {[%clk 0:15:10]} 1... d5 {[%clk 0:15:03]} 2. g3 {[%clk 0:15:18.8]} 2... e6 {[%clk 0:14:57.4]} 3. Bg2 {[%clk 0:15:27.7]} 3... Nf6 {[%clk 0:15:02]} 4. O-O {[%clk 0:15:36.7]} 4... Be7 {[%clk 0:14:47.5]} 5. d3 {[%clk 0:15:45.9]} 5... O-O {[%clk 0:14:43.7]} 6. c4 {[%clk 0:15:55.1]} 6... c6 {[%clk 0:13:43.3]} 7. a3 {[%clk 0:16:02.7]} 7... Nbd7 {[%clk 0:13:38.3]} 8.

In [None]:
def game_df(username,files):
    """
    Import games from archive files and turn the relevant fields into a data frame.

    Every game yields exactly one row. A value that is missing from the API
    payload or from the PGN headers is stored as None, so the columns of a
    row always belong to the same game.

    input:
    username - username of the player (repeated on every row)
    files - archive URLs of the player, each one is passed to get_archive_games

    output:
    a dataframe with one row per game and these columns, in this order:
        taken from the API payload:
            'username', 'urls', 'time_control', 'end_time', 'uuid',
            'initial_setup', 'time_class', 'rules',
            'white_rating', 'white_username',
            'black_rating', 'black_username', 'pgn'
        taken from the PGN headers:
            'event', 'Site', 'Date', 'Round', 'Result', 'CurrentPosition',
            'ECO', 'ECOUrl', 'EndDate', 'EndTime', 'StartTime',
            'Termination', 'Timezone', 'UTCDate', 'UTCTime'
    """
    # Keys copied unchanged from the game dictionary; column name == key.
    api_fields = ('time_control', 'end_time', 'uuid', 'initial_setup',
                  'time_class', 'rules')
    # (column name, PGN header tag); only 'event' differs from its tag.
    header_fields = (('event', 'Event'),
                     ('Site', 'Site'),
                     ('Date', 'Date'),
                     ('Round', 'Round'),
                     ('Result', 'Result'),
                     ('CurrentPosition', 'CurrentPosition'),
                     ('ECO', 'ECO'),
                     ('ECOUrl', 'ECOUrl'),
                     ('EndDate', 'EndDate'),
                     ('EndTime', 'EndTime'),
                     ('StartTime', 'StartTime'),
                     ('Termination', 'Termination'),
                     ('Timezone', 'Timezone'),
                     ('UTCDate', 'UTCDate'),
                     ('UTCTime', 'UTCTime'))
    columns = (['username', 'urls']
               + list(api_fields)
               + ['white_rating', 'white_username',
                  'black_rating', 'black_username',
                  'pgn']
               + [column for column, _ in header_fields])

    print("Player " + username + " is processing...")
    records = []

    for file in files:
        print(file + " " + " is processing...")
        for game in get_archive_games(file):
            # One dict per game: a missing value can never shift the
            # remaining columns onto another game's row.
            record = {'username': username, 'urls': game.get('url')}
            for field in api_fields:
                record[field] = game.get(field)

            for colour in ('white', 'black'):
                side = game.get(colour) or {}
                record[colour + '_rating'] = side.get('rating')
                record[colour + '_username'] = side.get('username')

            pgn_text = game.get('pgn')
            record['pgn'] = pgn_text

            headers = {}
            if pgn_text:
                try:
                    parsed = chess.pgn.read_game(io.StringIO(pgn_text))
                except Exception as e:
                    # Best effort: keep the row, leave its header columns empty.
                    print("could not parse pgn of game "
                          + str(game.get('uuid')) + ": " + str(e))
                    parsed = None
                # read_game yields None when the text holds no game.
                if parsed is not None:
                    headers = parsed.headers
            for column, tag in header_fields:
                record[column] = headers.get(tag)

            records.append(record)

    print("data fetch work is done.")

    # `columns` fixes the column order and keeps the schema for empty input.
    df = pd.DataFrame(records, columns=columns)
    print("dataframe importing is done.")
    return df
    
#data = game_df('AGcuber19',files)
