In [1]:
import pandas as pd
import numpy as np
import requests
import json
import datetime
from random import sample

In [2]:
# Root of the Chess.com public player endpoints; a username and a resource
# suffix are appended to it by the fetch helpers.
url_base = 'https://api.chess.com/pub/player/'


def get_player_ratings(username, timeout=30):
    """Fetch one player's ``/stats`` document and flatten it into a DataFrame.

    Parameters
    ----------
    username : str
        Player name, appended verbatim to ``url_base``.
    timeout : float, optional
        Seconds ``requests`` waits on the server before raising. Defaults to
        30 so that a single stalled connection cannot hang a long crawl.

    Returns
    -------
    pandas.DataFrame
        The decoded JSON payload flattened by ``pd.json_normalize`` (nested
        keys become dotted column names).

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when ``timeout`` elapses.
    ValueError
        When the response body is not valid JSON.
    """
    url = url_base + username + '/stats'
    response = requests.get(url, timeout=timeout)
    payload = json.loads(response.text)
    return pd.json_normalize(payload)
    
    
def get_player_games(username, game_archive, months=6, timeout=30):
    """Download a player's most recent games that are not already stored.

    Parameters
    ----------
    username : str
        Player name, appended verbatim to ``url_base``.
    game_archive : iterable of str
        UUIDs of games that are already known; matching games are skipped.
        Any iterable is accepted; it is turned into a set once so each
        lookup is constant-time instead of a scan over the whole collection.
    months : int, optional
        Number of monthly archives to read. One extra archive is always
        included (``months + 1`` newest archive URLs are fetched).
        NOTE(review): presumably the extra one covers the current, partial
        month - confirm.
    timeout : float, optional
        Seconds ``requests`` waits on each HTTP call before raising.

    Returns
    -------
    pandas.DataFrame
        One row per new game, flattened by ``pd.json_normalize``. Empty when
        the player has no archives or every game is already known.

    Notes
    -----
    A response without an ``archives`` / ``games`` key is treated as
    "nothing to download" rather than raising. A game without a ``uuid``
    key is kept, since it cannot be matched against ``game_archive``.
    """
    months += 1
    url = url_base + username + '/games/archives'
    listing = json.loads(requests.get(url, timeout=timeout).text)
    # Archive URLs end in .../YYYY/MM, so a reverse lexical sort is newest-first.
    archive = sorted(listing.get('archives') or [], reverse=True)[:months]

    if isinstance(game_archive, (set, frozenset)):
        known_ids = game_archive
    else:
        known_ids = set(game_archive)

    # Collect raw records and normalise once; growing a DataFrame with
    # pd.concat inside the loop copies every previous row on each iteration.
    new_games = []
    for month_url in archive:
        payload = json.loads(requests.get(month_url, timeout=timeout).text)
        for game in payload.get('games') or []:
            if game.get('uuid') not in known_ids:
                new_games.append(game)

    return pd.json_normalize(new_games)

def update_player_list(dataframe, existing):
    """List the usernames that appear in games but are not yet known.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Games table with ``white.username`` and ``black.username`` columns.
    existing : iterable of str
        Usernames that are already tracked and must be left out.

    Returns
    -------
    pandas.DataFrame
        Single ``username`` column holding each unseen name once, in the
        order it is first met (all white names, then all black names).
    """
    # A set makes each membership test constant-time; scanning a list for
    # every username is quadratic on a large games table.
    known = set(existing)
    usernames = pd.concat(
        [dataframe['white.username'], dataframe['black.username']],
        ignore_index=True,
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible (iteration order of a plain set is arbitrary).
    unseen = dict.fromkeys(name for name in usernames if name not in known)
    return pd.DataFrame(list(unseen), columns=['username'])
    

In [3]:
# Timestamp strings for this run, all derived from one clock reading:
#   now   -> 'yymmdd-HHMM' (used to name the games snapshot)
#   month -> 'yymm'
#   day   -> 'yymmdd'
_run_started = datetime.datetime.now()
now = _run_started.strftime('%y%m%d-%H%M')
month = _run_started.strftime('%y%m')
day = _run_started.strftime('%y%m%d')

In [4]:
# Load the accumulated games table (every column kept as a string), write a
# timestamped snapshot of it under games/, and collect the stored game ids.
games_df = pd.read_csv('games.csv', dtype='object')
backup_path = 'games/{}.csv'.format(now)
games_df.to_csv(backup_path, index=False)
game_ids = games_df['uuid'].tolist()

In [5]:
players_df = pd.read_csv('players.csv', dtype='object')

# Ratings are read as strings, so convert once and keep only players whose
# last rapid rating lies strictly between 1050 and 1300.
# NOTE(review): players_df itself is narrowed here, so any later cell that
# writes players_df back to players.csv drops the out-of-band rows - confirm
# that this is intended.
rapid_rating = players_df['chess_rapid.last.rating'].astype(float)
players_df = players_df[(rapid_rating > 1050) & (rapid_rating < 1300)]

existing_players = players_df['username'].drop_duplicates().tolist()

# Disabled: select players whose record is older than the current month.
#to_update = players_df[(players_df['updated'] < month) | (players_df['updated'].isnull())]
#usernames = to_update.username.tolist()

In [6]:
# Usernames seen in the stored games that are not in the tracked player list.
new_players = update_player_list(games_df, existing_players)
new_players = new_players.username.to_list()

# Manual override for spot checks:
#new_players = ['GMChessrob','windleypratt','Wizard456', 'BogdanDeac', 'MagnusCarlsen','GM-NAGY']

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of candidates.
sample_size = min(500, len(new_players))
new_players = sample(new_players, sample_size)

In [7]:
def run_players(player_list, limit=None):
    """Fetch rating stats for a batch of players and stack them in one table.

    Parameters
    ----------
    player_list : list of str
        Usernames to look up with ``get_player_ratings``.
    limit : int, optional
        When given, only the first ``limit`` usernames are processed.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fetched player, with ``username``,
        ``updated_day`` and ``updated_mnth`` columns added (the latter two
        come from the notebook-level ``day`` and ``month`` strings). Empty
        when nothing could be fetched.

    Notes
    -----
    A failure on one player is skipped so the rest of the batch still runs.
    Only ``Exception`` is caught, so a kernel interrupt (``KeyboardInterrupt``)
    stops the loop instead of being swallowed.
    """
    start = datetime.datetime.now()

    if limit is not None:
        player_list = player_list[:limit]

    print('Running {} players.'.format(len(player_list)))

    frames = []
    failed = 0
    for username in player_list:
        try:
            df = get_player_ratings(username)
            df['username'] = username
            df['updated_day'] = day
            df['updated_mnth'] = month
        except Exception:
            failed += 1
            continue
        frames.append(df)

    # Concatenate once; appending to a DataFrame inside the loop re-copies
    # all earlier rows on every iteration.
    players = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if failed:
        print('Skipped {} players after errors.'.format(failed))
    print('Finished in {}.'.format(datetime.datetime.now()-start))

    return players
    
def run_games(player_list, limit=None, months=6):
    """Download recent, not-yet-stored games for a batch of players.

    Parameters
    ----------
    player_list : list of str
        Usernames whose archives are read with ``get_player_games``.
    limit : int, optional
        When given, only the first ``limit`` usernames are processed.
    months : int, optional
        Forwarded to ``get_player_games``.

    Returns
    -------
    pandas.DataFrame
        All new games of all processed players stacked together. Empty when
        nothing was downloaded.

    Notes
    -----
    Games whose id is in the notebook-level ``game_ids`` are skipped by
    ``get_player_games``. A failure on one player is skipped so the rest of
    the batch still runs; only ``Exception`` is caught, so a kernel
    interrupt stops the loop.
    """
    start = datetime.datetime.now()

    if limit is not None:
        player_list = player_list[:limit]

    total = len(player_list)
    print('Getting games from {} players.'.format(total))

    batches = []
    failed = 0
    for i, username in enumerate(player_list, start=1):
        # Report against the real batch size so the counter stays meaningful
        # when limit is None or larger than the list.
        if i % 25 == 0:
            print('{}/{}'.format(i, total))

        try:
            gms = get_player_games(username, game_ids, months = months)
        except Exception:
            failed += 1
            continue
        batches.append(gms)

    # Concatenate once instead of re-copying the growing table per player.
    games = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

    if failed:
        print('Skipped {} players after errors.'.format(failed))
    print('Finished in {}.'.format(datetime.datetime.now()-start))

    return games

In [8]:
# Fetch ratings for up to 500 sampled players, then the last 3 (+1) monthly
# archives of games for the first 100 of them.
players = run_players(new_players, limit=500)
games = run_games(new_players, limit=100, months=3)

Running 500 players.
Finished in 0:04:41.722342.
Getting games from 100 players.
25/100
50/100
75/100
100/100
Finished in 0:34:29.076386.


In [9]:
# Stack the stored players and the freshly fetched ones, newest update first,
# drop rows whose 'status' is 'error' and exact duplicate rows, then persist.
player_csv = (
    pd.concat([players_df, players], ignore_index=True)
    .sort_values(by='updated_day', ascending=False)
    .loc[lambda frame: frame['status'] != 'error']
    .drop_duplicates()
    .reset_index(drop=True)
)
player_csv.to_csv('players.csv', index=False)

In [11]:
# Append the newly downloaded games to the stored ones, keep the first row
# for every uuid, and overwrite games.csv with the result.
all_games = pd.concat([games_df, games], ignore_index=True)
game_csv = all_games.drop_duplicates(subset=['uuid']).reset_index(drop=True)
game_csv.to_csv('games.csv', index=False)

In [12]:
# Rating band (exclusive on both ends) the winner must fall in. Both bounds
# are named here so the two filters below cannot drift apart.
rating_thresh = 1300
rating_floor = 1050

shrink = game_csv[['url', 'rated', 'white.username', 'white.rating', 'white.result','black.username', 'black.rating', 'black.result', 'pgn']]

# A game counts as decided when the losing side was checkmated or resigned.
decisive_results = ['checkmated', 'resigned']

# Games won by white, with white's rating inside the band.
white = shrink[shrink['black.result'].isin(decisive_results)]
white_rating = white['white.rating'].astype(float)
white = white[(white_rating < rating_thresh) & (white_rating > rating_floor)]

# Games won by black, with black's rating inside the band.
black = shrink[shrink['white.result'].isin(decisive_results)]
black_rating = black['black.rating'].astype(float)
black = black[(black_rating < rating_thresh) & (black_rating > rating_floor)]

white.to_csv('white.csv', index=False)
black.to_csv('black.csv', index=False)

In [13]:
# Number of games downloaded in this run.
games.shape[0]

134060

In [14]:
# Number of games stored after merging and de-duplicating by uuid.
game_csv.shape[0]

764465