In [1]:
import csv
import requests
import pandas as pd
import numpy as np
import datetime as dt
import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

from bs4 import BeautifulSoup

from fetcher import Fetcher

In [3]:
# Username under analysis: it is appended to the '/pub/player/' path when
# fetching the profile, and compared against `white_username` to work out
# which side this player had in each game.
PLAYER = 'YVogel'

In [4]:
# get country codes and names
#
# Downloads a CSV of country names and codes and exposes it as a row iterator.
# NOTE: `countries_iter` is a one-shot iterator; the cell that builds the
# country fetchers consumes it, so re-run this cell before re-running that one.

csv_url = 'https://pkgstore.datahub.io/core/country-list/data_csv/data/d7c9d7cfb42cb69f4422dec222dbbaa8/data_csv.csv'
# a timeout keeps a stalled connection from hanging the kernel indefinitely
res = requests.get(csv_url, timeout=30)
# fail loudly on 4xx/5xx instead of parsing an error page as if it were CSV
res.raise_for_status()
decoded_content = res.content.decode('utf-8')

countries_iter = csv.reader(decoded_content.splitlines(), delimiter=',')

In [4]:
# create country Fetcher objects
#
# Maps every country name from the CSV to a child fetcher created from its
# code. This consumes `countries_iter`, so it can only run once per download.

next(countries_iter)  # skipping headers
country_fetcher = Fetcher('pub/country/')
countries = {
    country: country_fetcher.create_child(code)
    for country, code in countries_iter
}

In [5]:
# count how many users each country has

def fetch_country(country):
    """Count the players listed for one country.

    Looks up the country's fetcher in the module-level ``countries`` mapping
    and requests its 'players' resource.

    Returns a ``(country, number_of_players)`` tuple, or None when the
    fetched response is falsy.
    """
    payload = countries[country].fetch_json('players')
    if not payload:
        return None
    return country, len(payload['players'])

# Fan the per-country requests out over a thread pool and keep every
# non-empty (country, count) result, in completion order.
user_count = []
with ThreadPoolExecutor(10) as pool:
    futures = [pool.submit(fetch_country, country) for country in countries]
    finished = (future.result() for future in as_completed(futures))
    user_count.extend(result for result in finished if result)

In [5]:
# Turn the list of (country, users) tuples into a table indexed by country
# and persist it.
#
# The column names are passed to the constructor so that an empty result list
# still produces a well-formed (empty) table instead of a length-mismatch
# error. The isinstance guard makes the cell safe to re-run: `user_count` is
# rebound from list to DataFrame here, so a second run must not convert again.
if not isinstance(user_count, pd.DataFrame):
    user_count = (
        pd.DataFrame(user_count, columns=['country', 'users'])
        .set_index('country')
    )

user_count.to_csv('datasets/countries.csv')

NameError: name 'user_count' is not defined

In [7]:
# build the profile table
#
# One row per key of the profile JSON, indexed by field name.

profile_fetcher = Fetcher(path='/pub/player/'+PLAYER)

profile_items = list(profile_fetcher.fetch_json().items())
profile_data = (
    pd.DataFrame(profile_items, columns=['field', 'value'])
    .set_index('field')
)

profile_data.to_csv('datasets/profile.csv')

In [8]:
# stats tree:
#     the hierarchical structure of this dataset is most naturally represented
#     as a tree, so the fetched JSON is kept as-is instead of being tabulated

# child fetcher for the 'stats' resource under the player's profile path
stats_fetcher = profile_fetcher.create_child('stats')

stats = stats_fetcher.fetch_json()

# (disabled) dump the raw stats JSON to disk
# with open('datasets/stats.json', 'w') as f:
#     json.dump(stats, f)

## Creating the game table
#### A big dimension table on which most of the research is conducted

In [9]:
# game table:
#     A list of all played games

# child fetcher for the 'games' resource under the player's profile path
game_fetcher = profile_fetcher.create_child('games')
# work queue: fetch_gamelist() pops entries off this list until it is empty,
# so the list is consumed (left empty) once the games have been downloaded
archives = game_fetcher.fetch_json('archives')['archives']

# placeholder only: `games` is rebound from `raw_games` in the processing cell
games = pd.DataFrame()

def fetch_gamelist():
    """Drain the shared ``archives`` list and return the fetched games.

    Designed to run in several worker threads at once: each call repeatedly
    pops one entry from the module-level ``archives`` list and fetches it
    through its own child of ``game_fetcher`` until the list is exhausted.

    Returns a DataFrame with the rows of every archive this call fetched
    (presumably one frame per archive from ``fetch_df`` - confirm against
    Fetcher), or an empty DataFrame if it fetched none.
    """
    fetcher = game_fetcher.create_child()
    frames = []
    while True:
        # Pop first and handle exhaustion afterwards: a separate
        # "is it non-empty?" check followed by pop() lets another worker take
        # the last entry in between, which raised IndexError.
        try:
            archive = archives.pop()
        except IndexError:
            break
        frames.append(fetcher.fetch_df(archive))

    if not frames:
        return pd.DataFrame()
    # single concat instead of growing a frame with the removed DataFrame.append
    return pd.concat(frames, sort=False)

# Run five fetch_gamelist workers in parallel; each returns the games of the
# archives it managed to pop. The per-worker frames are concatenated once at
# the end (DataFrame.append was removed in pandas 2.0 and re-copies the whole
# frame on every call).
with ThreadPoolExecutor(10) as pool:
    futures = [pool.submit(fetch_gamelist) for _ in range(5)]
    worker_frames = [future.result() for future in as_completed(futures)]

raw_games = pd.concat(worker_frames, sort=False)

In [11]:
# sanity check: (rows, columns) of everything that was downloaded
raw_games.shape

(7681, 12)

##### The games have been collected; now for some processing

In [12]:
# Work on a copy so the downloaded frame stays available for re-runs.
games = raw_games.copy()

# fromtimestamp gives naive datetimes in the local timezone of this machine
games['end_time'] = games.end_time.apply(dt.datetime.fromtimestamp)

# Keep only rows where both player entries are dicts, ordered by end time.
# (No extra NaN filter on black/white is needed afterwards: a dict is never NaN.)
both_players_are_dicts = (games.black.apply(type) == dict) & (games.white.apply(type) == dict)
games = games.loc[both_players_are_dicts].sort_values('end_time')

# Flatten the per-player dicts into <color>_<category> columns.
for color in ('black', 'white'):
    player_info = games[color]
    for category in ('rating', 'username', 'result'):
        games[f'{color}_{category}'] = player_info.apply(lambda info: info.get(category))
games = games.drop(columns=['black', 'white'])

# Only rows whose rules are 'chess' are analysed; the helper columns go away.
games = games.loc[games.rules == 'chess'].drop(columns=['rules', 'fen'])

In [13]:
def extract_opening(pgn):
    """Return the opening slug from a PGN's ``ECOUrl`` header.

    The slug is the part of the URL that follows
    ``https://www.chess.com/openings/`` and consists of letters, digits,
    hyphens and dots.

    Returns None when the header is absent or when ``pgn`` is not a string
    (e.g. a missing value in the column).
    """
    if not isinstance(pgn, str):
        return None
    # dots in the host name are escaped so they only match a literal '.'
    pat = r'ECOUrl "https://www\.chess\.com/openings/([A-Za-z0-9.-]+)'
    mat = re.search(pat, pgn)
    return mat.group(1) if mat else None

# opening slug per game; None where the PGN carries no ECOUrl header
games['opening'] = games.pgn.apply(extract_opening)

In [14]:
def extract_moves(pgn):
    """Reduce a PGN string to its bare move text.

    Removes, in order: bracketed header tags and ``{[...]}`` annotations,
    the ``N...`` markers that precede black's moves, redundant whitespace,
    and the trailing game result (including ``1/2-1/2``).

    Returns None when ``pgn`` is not a string (e.g. a missing value).
    """
    if not isinstance(pgn, str):
        return None
    # raw strings: '\d', '\[' etc. are invalid escapes in ordinary literals
    out = re.sub(r'\{?\[.+?\]\}?', '', pgn)
    # \d+ rather than \d, otherwise "10..." leaves a stray "1" behind
    out = re.sub(r'\d+\.{3}', '', out)
    # collapse and trim before looking for the result, so that a trailing
    # newline in the PGN cannot hide the result token from the `$` anchor
    out = re.sub(r'\s+', ' ', out).strip()
    out = re.sub(r'(?:\d{1,2}-\d{1,2}|1/2-1/2)$', '', out)
    return out.strip()

# move text per game, as produced by extract_moves from the raw PGN
games['moves'] = games.pgn.apply(extract_moves)

In [15]:
# Re-express the white/black columns from PLAYER's point of view.
# True where PLAYER had the white pieces (exact username match).
played_white = games.white_username == PLAYER

games['color_played'] = np.where(played_white, 'White', 'Black')

# (target column, source when PLAYER was white, source when PLAYER was black)
perspective_columns = (
    ('rival_rating', 'black_rating', 'white_rating'),
    ('my_rating', 'white_rating', 'black_rating'),
    ('my_result', 'white_result', 'black_result'),
    ('rival_result', 'black_result', 'white_result'),
)
for target, as_white, as_black in perspective_columns:
    games[target] = np.where(played_white, games[as_white], games[as_black])

In [16]:
# Outcome -> result codes that mean that outcome for the player carrying the
# code. '50move' (draw by the fifty-move rule) and 'lose' are included so
# that looking such a game up in res_dict does not raise KeyError.
inv_res_dict = {
    'win': ['win'],
    'draw': ['stalemate', 'timevsinsufficient', 'repetition', 'agreed', 'insufficient', '50move'],
    'loss': ['resigned', 'checkmated', 'timeout', 'abandoned', 'lose'],
}
# Inverted lookup: result code -> 'win' / 'draw' / 'loss'.
res_dict = {
    code: outcome
    for outcome, codes in inv_res_dict.items()
    for code in codes
}
        
# 'win' / 'draw' / 'loss' from the player's own result code; a KeyError here
# means my_result contains a code that is missing from res_dict
games['won'] = games.my_result.apply(lambda x: res_dict[x])

In [17]:
# The game id is the first run of digits in the game's URL; it becomes the
# (sorted) index of the table.
games = (
    games
    .assign(game_id=games.url.apply(lambda url: int(re.search(r'\d+', url).group(0))))
    .set_index('game_id')
    .sort_index()
)

In [18]:
# The per-colour columns are redundant once the my_* / rival_* columns exist.
games = games.drop(
    columns=[
        'black_rating', 'black_username', 'black_result',
        'white_rating', 'white_username', 'white_result',
    ]
)

In [19]:
# Make sure end_time is a proper datetime column. The unit is spelled out
# because pandas >= 2.0 refuses to cast to the unit-less np.datetime64.
games['end_time'] = games.end_time.astype('datetime64[ns]')

In [25]:
# persist the processed game table (index included) for the analysis notebooks
games.to_csv('datasets/games.csv')