In [1]:
import os
import re
import ssl
import requests
import json
import urllib.request
from pathlib import Path
import pandas as pd

# Silence urllib3's warnings; the archive request in this notebook is made with verify=False.
requests.packages.urllib3.disable_warnings()
# SECURITY: swaps the default HTTPS context factory for the unverified one, so HTTPS
# downloads done through urllib in this notebook do not validate server certificates.
# NOTE(review): presumably a workaround for a local certificate problem - confirm it is still needed.
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
# Notebook-wide configuration
user = 'tuggypetu'  # chess.com username whose games are exported

# Column names of the exported table: one per PGN header tag, in order, followed by the move text
pgnMeta = [
    "Event", "Site", "Date", "Round", "White", "Black", "Result",
    "Tournament",
    "CurrentPosition", "Timezone", "ECO", "ECOURL",
    "UTCDate", "UTCTime",
    "WhiteELO", "BlackELO",
    "TimeControl", "Termination",
    "StartTime", "EndDate", "EndTime",
    "Link",
    "Moves",
]

moveStartLine = 22  # chess.com PGNs typically carry the move text on the 22nd line of a game

# Locations on disk
PGNDirectory = f"data/pgn/{user}"  # scratch folder receiving one PGN per monthly archive
PGNFile = f"data/pgn/{user}_games.pgn"  # all monthly PGNs concatenated into a single file
tgtFilePath = f"data/{user}_games.csv"  # final CSV export

In [3]:
def getPGN(user):
    """Download every monthly PGN archive of `user` and merge them into `PGNFile`.

    The list of archives is fetched from the chess.com public API, each archive
    is saved as ``<year><month>.pgn`` inside `PGNDirectory`, and the files are then
    appended (in sorted filename order) to `PGNFile`. Each file in `PGNDirectory`
    is deleted once it has been appended.

    Args:
        user: chess.com username.

    Raises:
        requests.HTTPError: if the archive listing request returns an error status.
        requests.Timeout: if the archive listing request does not answer in time.
    """
    # SECURITY: verify=False disables TLS certificate validation for this request.
    # Kept as in the original notebook; remove it once certificates validate locally.
    archive_response = requests.get(
        "https://api.chess.com/pub/player/" + user + "/games/archives",
        verify=False,
        timeout=30,  # do not hang the notebook forever on a stalled connection
    )
    # Fail loudly on 4xx/5xx instead of hitting a KeyError/JSON error further down.
    archive_response.raise_for_status()

    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(PGNDirectory, exist_ok=True)

    for url in json.loads(archive_response.content)["archives"]:
        # Archive URLs end in .../games/<year>/<month>; those are path parts 7 and 8.
        url_parts = url.split("/")
        filepath = PGNDirectory + "/" + url_parts[7] + url_parts[8] + '.pgn'
        urllib.request.urlretrieve(url + '/pgn', filepath)

    with open(PGNFile, 'w') as outfile:
        for fe in sorted(os.listdir(PGNDirectory)):
            monthly_path = f"{PGNDirectory}/{fe}"
            with open(monthly_path) as infile:
                content = infile.read()
            outfile.write(content)
            # Make sure the next file's first tag starts on its own line.
            if content and not content.endswith("\n"):
                outfile.write("\n")
            os.remove(monthly_path)


def importPGNData(filepath):
    """Read the file at `filepath` and return its content as a list of lines.

    Each element keeps its trailing newline, exactly as stored in the file.
    """
    with open(filepath) as pgn_handle:
        lines = list(pgn_handle)
    return lines

In [4]:
def getEdgePoints(data):
    """Return the start and end line indices of every game in the PGN lines.

    A game starts on each line beginning with ``[Event`` and ends on the line
    just before the next game starts; the last game ends on the last line of
    `data`. Lines that precede the first ``[Event`` line belong to no game.

    Args:
        data: list of PGN lines.

    Returns:
        A tuple ``(starts, ends)`` of two equally long lists, where
        ``data[starts[i]: ends[i] + 1]`` is the i-th game.
    """
    starts = [n for n, line in enumerate(data) if line.startswith("[Event")]
    # Every game but the last one ends right before the following game starts.
    ends = [next_start - 1 for next_start in starts[1:]]
    # Only close a final game if one was actually opened, so both lists stay aligned.
    if starts:
        ends.append(len(data) - 1)
    return starts, ends


def grpGames(data, starts, ends):
    """Slice `data` into one list of lines per game, skipping exact duplicates.

    The i-th game spans ``data[starts[i]]`` through ``data[ends[i]]`` inclusive.
    A game whose lines are identical to an already collected game is dropped;
    the order of first occurrence is kept.
    """
    unique_games = []
    for idx, last_line in enumerate(ends):
        first_line = starts[idx]
        chunk = data[first_line:last_line + 1]
        if chunk in unique_games:
            continue
        unique_games.append(chunk)
    return unique_games

In [5]:
def mergeMoves(game):
    """Reduce the header rows of one game to their bare values.

    If the game has exactly 22 rows, a placeholder ``[Tournament "-"]`` row is
    inserted at index 7 first. Newlines are then removed from every row, and
    each of the first `moveStartLine` rows is replaced by the text that follows
    the tag name, with surrounding ``]``, ``[`` and ``"`` characters stripped.
    Rows after that are left as they are apart from the newline removal.

    `game` is modified in place.

    Args:
        game: list of the lines of one game.

    Returns:
        A new list with the processed rows, empty rows removed.

    Raises:
        IndexError: if one of the rows up to index ``moveStartLine - 4`` has no
            whitespace outside quotes and therefore cannot be split into
            tag name and value.
    """
    if len(game) == 22:
        game.insert(7, '[Tournament "-"]')
    for n, eachrow in enumerate(game):
        row = eachrow.replace('\n', '')
        game[n] = row
        if n > moveStartLine - 1:
            continue  # past the header rows: keep the row untouched
        try:
            game[n] = stripwhitespace(row).split('~')[1].strip(']["')
        except IndexError:
            # The row has no tag/value separator. This is tolerated on the last
            # three header positions (the row is kept as is) and an error on
            # every earlier one.
            if n <= moveStartLine - 4:
                raise
    return list(filter(None, game))


def stripwhitespace(text):
    """Replace every run of whitespace outside double quotes with a single ``~``.

    The text is split on ``"``; the even-numbered pieces lie outside quotes and
    get their whitespace runs replaced, the odd-numbered pieces (inside quotes)
    are returned unchanged.

    Args:
        text: the string to process.

    Returns:
        The processed string, with the quote characters put back in place.
    """
    lst = text.split('"')
    for i, item in enumerate(lst):
        if not i % 2:
            # Raw string: "\s" in a normal literal is an invalid escape sequence.
            lst[i] = re.sub(r"\s+", "~", item)
    return '"'.join(lst)

def arrange_game_list(games_list):
    """Clean every game and drop the ones with an unexpected number of rows.

    Each game first loses its blank lines and surrounding whitespace, then goes
    through `mergeMoves`. Games that end up with exactly 25 rows or with fewer
    than 22 rows are discarded.
    """
    cleaned_games = []
    for raw_game in games_list:
        non_blank = [line.strip() for line in raw_game if line.strip() != '']
        cleaned_games.append(mergeMoves(non_blank))
    # NOTE(review): the 22 / 25 thresholds are taken over as found - confirm what they stand for.
    return [rows for rows in cleaned_games if len(rows) >= 22 and len(rows) != 25]

In [6]:
# Download all archives and merge them into PGNFile.
getPGN(user)

# Remove the CSV of a previous run so a failed export cannot be mistaken for a fresh one.
# A missing file simply means there is nothing to clean up.
tgtFilePathObj = Path(tgtFilePath)
try:
    tgtFilePathObj.unlink()
except FileNotFoundError:
    pass

# Parse the merged PGN into one row per game and write the table to disk.
data = importPGNData(PGNFile)
starts, ends = getEdgePoints(data)
games_n = grpGames(data, starts, ends)
games = arrange_game_list(games_n)
df = pd.DataFrame(games, columns=pgnMeta)
df.to_csv(tgtFilePath, index=False)
print("Export Complete!")

Export Complete!


In [7]:
# Bare expression as the last line of the cell: the notebook renders the exported DataFrame.
df

Unnamed: 0,Event,Site,Date,Round,White,Black,Result,Tournament,CurrentPosition,Timezone,...,UTCTime,WhiteELO,BlackELO,TimeControl,Termination,StartTime,EndDate,EndTime,Link,Moves
0,Live Chess,Chess.com,2022.10.31,-,PremV80,tuggypetu,1-0,-,r2qk2r/pp1npp1p/3p3Q/1B3p2/8/2P5/PP1N1PPP/R1B1...,UTC,...,04:55:27,1433,1524,180+2,PremV80 won by resignation,04:55:27,2022.10.31,04:57:06,https://www.chess.com/game/live/60931139631,1. e4 {[%clk 0:03:02]} 1... c5 {[%clk 0:03:01....
1,Live Chess,Chess.com,2022.10.31,-,KD-T,tuggypetu,0-1,-,8/pR4Np/3k2p1/4p3/3bPpP1/3P1P1K/5r1P/8 w - -,UTC,...,04:35:03,1502,1535,180+2,tuggypetu won on time,04:35:03,2022.10.31,04:43:50,https://www.chess.com/game/live/60929936967,1. b3 {[%clk 0:03:02]} 1... g6 {[%clk 0:03:00....
2,Live Chess,Chess.com,2022.10.31,-,tuggypetu,mmas50,0-1,-,r3rnk1/3b2p1/p1n1p2p/1p1p3P/5N2/P5R1/1qBBQPP1/...,UTC,...,04:28:05,1527,1660,180+2,mmas50 won on time,04:28:05,2022.10.31,04:34:47,https://www.chess.com/game/live/60929363913,1. e4 {[%clk 0:03:02]} 1... e6 {[%clk 0:02:56....
3,Live Chess,Chess.com,2022.10.31,-,gabrielkum,tuggypetu,0-1,-,1r6/2bn1pk1/4p1p1/P2p4/2pPp3/2P1P2P/2Q2PPq/R4R...,UTC,...,04:21:23,1548,1532,180+2,tuggypetu won by checkmate,04:21:23,2022.10.31,04:27:54,https://www.chess.com/game/live/60928793303,1. d4 {[%clk 0:03:02]} 1... Nf6 {[%clk 0:03:00...
4,Live Chess,Chess.com,2022.10.31,-,segismund,tuggypetu,0-1,-,2r2r1k/4Npbp/p2p2P1/7Q/8/1n2B3/1P2N3/qK1R1B2 w...,UTC,...,04:14:46,1573,1523,180+2,tuggypetu won by checkmate,04:14:46,2022.10.31,04:21:08,https://www.chess.com/game/live/60928734547,1. e4 {[%clk 0:03:02]} 1... c5 {[%clk 0:03:01....
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568,Live Chess,Chess.com,2023.01.02,-,AkilZengini,tuggypetu,1-0,-,8/4R3/6p1/8/5rPK/5k1P/8/3b4 b - -,UTC,...,08:43:50,1670,1650,180+2,AkilZengini won on time,08:43:50,2023.01.02,08:52:09,https://www.chess.com/game/live/66388200575,1. e4 {[%clk 0:03:02]} 1... g6 {[%clk 0:03:01....
569,Live Chess,Chess.com,2023.01.02,-,tuggypetu,Jasoncardenasgenobiagon,0-1,-,r6r/p4kn1/1p1q4/2pP2p1/2P5/P1B1P3/4KPPP/1R1R4 ...,UTC,...,08:36:31,1658,1612,180+2,Jasoncardenasgenobiagon won by resignation,08:36:31,2023.01.02,08:43:30,https://www.chess.com/game/live/66387642527,1. b3 {[%clk 0:03:02]} 1... e5 {[%clk 0:02:59....
570,Live Chess,Chess.com,2023.01.02,-,VahanAbraham,tuggypetu,1-0,-,5rk1/1q2nppp/4p3/1b1pP3/1p1P2P1/1PpN1N2/R1P1QP...,UTC,...,08:31:17,1631,1667,180+2,VahanAbraham won by resignation,08:31:17,2023.01.02,08:36:20,https://www.chess.com/game/live/66387111535,1. e4 {[%clk 0:03:02]} 1... b6 {[%clk 0:03:00....
571,Live Chess,Chess.com,2023.01.02,-,tuggypetu,MaciejJakub,0-1,-,r4rk1/4ppb1/2Np2p1/3P1P1p/1p5P/1Pn5/q3QP2/2BK2...,UTC,...,08:24:28,1676,1594,180+2,MaciejJakub won by resignation,08:24:28,2023.01.02,08:31:06,https://www.chess.com/game/live/66387011183,1. b3 {[%clk 0:03:00.2]} 1... Nf6 {[%clk 0:03:...
