In [1]:
from datetime import datetime
import requests
import numpy as np
import pandas as pd
import yaml
import re
from bs4 import BeautifulSoup
import wikitextparser as wtp
from ratelimit import rate_limited
from operator import *

def save_obj(obj, name ):
    '''
    Pickle obj to obj/<name>.pkl (relative to the working directory).

    input: obj  - any picklable object
           name - file name without extension; may contain sub-directories
    output: None, the file is written as a side effect
    '''
    import os
    import pickle
    path = 'obj/'+ name + '.pkl'
    # Create the target directory on first use so a fresh checkout (no obj/
    # folder yet) or a new sub-directory in `name` does not raise
    # FileNotFoundError.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        
def load_obj(name ):
    '''
    input: name of a pickled object (no extension), relative to obj/
    output: the object unpickled from obj/<name>.pkl
    raises: FileNotFoundError when that file does not exist
    '''
    import pickle
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or another trusted source) wrote.
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [2]:
@rate_limited(1, 30)
def parseSquad(title):
    '''
    input: title of tournament (the page name used in the Liquipedia API URL)
    output: tuple (squad, json)
        squad - dictionary of participants of that tournament, shaped
                {team: {'pos1': player, ..., 'pos5': player}}; every name is
                lower-cased and stripped of non-word characters
        json  - the decoded API response the squad was extracted from
    raises: requests.HTTPError on a non-2xx answer, requests.Timeout when the
            server does not answer in time
    '''
    from urllib.parse import quote

    headers = {
    'User-Agent': 'Data for Research',
    'From': 'terthasarit@live.com', 
    'Accept-Encoding': 'gzip'
    }
    # Percent-encode the whole title, not only spaces, so characters such as
    # '&' or '+' are not read as query-string syntax. '%' is left alone so
    # titles that callers already encoded by hand (e.g. '%26') keep working.
    titleurl = quote(title, safe='/%')
    r = requests.get(url='http://liquipedia.net/dota2/api.php?action=parse&format=json&page=' + titleurl + '&prop=wikitext%7Ctext', headers=headers, timeout=60)
    # Fail here on an HTTP error instead of later while decoding the body.
    r.raise_for_status()
    json = r.json()
    wiki = json.get('parse').get('wikitext').get('*')
    wiki = re.sub('\n', '', wiki)
    squad = {}
    for t in wtp.parse(wiki).templates:
        if str(t)[:11] != '{{TeamCard|':
            continue
        # Reset per card: a player argument must never be filed under the
        # team of a previous card (or hit an unbound name on the first card).
        teamname = None
        for ar in t.arguments:
            raw = str(ar)
            if raw[:5] == '|team':
                teamname = re.sub(r'\W+', '', ar.value).lower()
                squad[teamname] = {}
                continue
            # Player slots are the arguments written exactly as |p1= .. |p5=
            slot = re.match(r'\|p([1-5])=', raw)
            if slot and teamname is not None:
                squad[teamname]['pos' + slot.group(1)] = re.sub(r'\W+', '', ar.value).lower()
    return squad, json

In [3]:
def getEloDict(ifd):
    '''
    receive information dictionary of tournament and
    return elodict

    input: ifd - tournament info dict; its 'start' and 'end' date strings
                 are sent to the ratings API with '/' replaced by '-'
    output: tuple (startelolist, endelolist) - the 'data' entry of the
            ratings response for the start date and for the end date
    raises: requests.HTTPError on a non-2xx answer, requests.Timeout when the
            server does not answer in time
    '''
    def _ratings_on(date):
        # Fetch the ratings for one date and return the response's 'data' entry.
        urldat = 'http://www.datdota.com/api/ratings?date=' + date
        htmldat = requests.get(urldat, timeout=60)
        htmldat.raise_for_status()
        htmldat.encoding = 'utf-8'
        elo = BeautifulSoup(htmldat.text, "lxml")
        # safe_load instead of load: the payload comes from the network, and
        # yaml.load without an explicit Loader can build arbitrary objects
        # (and is rejected outright by PyYAML >= 6).
        # NOTE(review): the endpoint looks like a JSON API; htmldat.json()
        # may be a simpler way to decode it - confirm before switching.
        elodict = yaml.safe_load(elo.text)
        return elodict['data']

    start = re.sub('/', '-', ifd['start'])
    end = re.sub('/', '-', ifd['end'])
    #rating of startdate from datdota and turn into list of dict
    startelolist = _ratings_on(start)
    #rating of enddate from datdota and turn into list of dict
    endelolist = _ratings_on(end)
    return (startelolist, endelolist)

In [4]:
def compareAndPut(df, squad, ifd, startelodict, endelodict):
    '''
    Add one row per (team, player position) of a tournament to df, with the
    team's ratings at the start and at the end of the tournament.

    input: df           - DataFrame to extend (it is not modified in place)
           squad        - {team: {position: player id}} as built by parseSquad
           ifd          - tournament info dict; 'title', 'prize', 'start'
                          and 'end' are copied into every row
           startelodict - list of rating dicts for the start date
           endelodict   - list of rating dicts for the end date
    output: DataFrame holding the rows of df followed by the new rows, with
            a fresh 0..n-1 index; df itself when there is nothing to add

    Team names are first translated through the notebook-level `comparedict`
    (unknown names are used as they are) and then compared with the
    normalised 'teamName' of every rating entry. A team without a match in
    BOTH lists gets NaN ratings.
    '''
    def _normalise(name):
        # Lower-case and drop every non-word character.
        return re.sub(r'\W+', '', name).lower()

    def _rating_columns(prefix, rating):
        # Rating columns for one side ('start' / 'end'); NaN when unmatched.
        if rating is None:
            return {
                prefix + 'elo': np.nan,
                prefix + 'glicko2mu': np.nan,
                prefix + 'glicko2phi': np.nan,
                prefix + 'glicko2rating': np.nan,
            }
        glicko = rating['glicko2']
        return {
            prefix + 'elo': rating['elo64']['current'],
            prefix + 'glicko2mu': glicko['mu'],
            prefix + 'glicko2phi': glicko['phi'],
            prefix + 'glicko2rating': glicko['rating'],
        }

    def _row(teamName, pos, player_id, srt, ert):
        # One output record; the key order fixes the column order whenever
        # df does not already define the columns.
        row = {
            'team': teamName,
            'position': pos,
            'id': player_id,
            'tour': ifd['title'],
            'prizeusd': ifd['prize'],
            'start': ifd['start'],
            'end': ifd['end'],
        }
        row.update(_rating_columns('start', srt))
        row.update(_rating_columns('end', ert))
        return row

    # Normalise every rating name once instead of once per (start, end) pair.
    start_named = [(_normalise(srt['teamName']), srt) for srt in startelodict]
    end_named = [(_normalise(ert['teamName']), ert) for ert in endelodict]

    rows = []
    for lqn, pdc in squad.items():
        teamName = comparedict.get(lqn, lqn)
        # Every (start, end) combination whose names both equal teamName,
        # start entries outermost.
        pairs = [(srt, ert)
                 for start_name, srt in start_named if start_name == teamName
                 for end_name, ert in end_named if end_name == teamName]
        if pairs:
            # NOTE(review): players with an empty id are kept here but
            # skipped in the unmatched branch below - confirm which of the
            # two is intended.
            for srt, ert in pairs:
                for pos, player_id in pdc.items():
                    rows.append(_row(teamName, pos, player_id, srt, ert))
        else:
            for pos, player_id in pdc.items():
                if player_id == '':
                    continue
                rows.append(_row(teamName, pos, player_id, None, None))

    if not rows:
        return df
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # copying the whole frame once per added row.
    return pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

In [5]:
# Load the lookup tables pickled under obj/dict/.
# alltour is iterated below as {tournament page title: info dict}.
# comparedict is used by compareAndPut to translate a normalised team name
# from the squad into the name compared with the rating entries
# (presumably Liquipedia -> datdota naming; confirm against the pickle).
alltour = load_obj('/dict/touranddate')
comparedict = load_obj('/dict/lqtodat')

In [6]:
# Bounds (YYYY-MM-DD) of the scraping window: a tournament is scraped only
# when it starts after endti2 and ends on or before endti7.
# presumably the end dates of The International 2012 and 2017 - TODO confirm
endti2 = '2012-09-02'
endti7 = '2017-08-12'

In [11]:
%%time
errortitle = []
df = pd.DataFrame(columns=['team', 'position', 'id', 'tour', 'prizeusd', 'start', 'end',
                           'startelo', 'startglicko2mu', 'startglicko2phi', 'startglicko2rating', 'endelo', 'endglicko2mu', 'endglicko2phi', 'endglicko2rating'])
for turl, ifd in alltour.items():
    try:
        try:
            load_obj(re.sub(r'\W+', '', ifd['title']).lower())
            print('Already Exist: ' + re.sub(r'\W+', '', ifd['title']).lower())
        except FileNotFoundError:
            if (pd.to_datetime(ifd['start'], format='%d/%m/%Y') > pd.to_datetime(endti2)) & (pd.to_datetime(ifd['end'], format='%d/%m/%Y') <= pd.to_datetime(endti7)):
                print('Extra Scraping: ' + re.sub(r'\W+', '', ifd['title']).lower())
                ptc, lqjson = parseSquad(turl)
                startelodict, endelodict = getEloDict(ifd)
                save_obj({'wikitext': lqjson, 'startelodict': startelodict, 'endelodict': endelodict}, re.sub(r'\W+', '', ifd['title']).lower() )
                df = compareAndPut(df, ptc, ifd, startelodict, endelodict)
    except:
        print('Error: ' + turl)
        errortitle = errortitle.append(turl)

Already Exist: starladderstarseriesseason1
Extra Scraping: wcgasianchampionship2012
Already Exist: raidcalldota2leagueseason1
Already Exist: starladderstarseriesseason3
Extra Scraping: gosuleagueseason4
Extra Scraping: gosuleagueseason4division1
Extra Scraping: thepremierleagueseason3
Already Exist: g1championsleagueseason4
Extra Scraping: electronicsportsworldcup2012
Extra Scraping: gosuleagueseason5
Extra Scraping: gosuleagueseason5division1
Already Exist: starladderstarseriesseason4
Already Exist: dreamhackwinter2012
Already Exist: thedefenseseason3
Already Exist: worldcybergames2012
Extra Scraping: thoropen2012
Already Exist: raidcalldota2leagueseason2
Already Exist: asusopen2012finals
Already Exist: theasia2012
Already Exist: gleague2012season2
Already Exist: thepremierleagueseason4
Already Exist: starladderstarseriesseason5
Already Exist: armaggeddondota2grandslamasia2013
Already Exist: weplaydota2leagueseason1
Already Exist: dreamhackdota2invitational
Already Exist: g1championsl

Already Exist: dota2professionalleagueseason2secondary
Already Exist: shanghaidota2open2
Already Exist: dota2professionalleagueseason2top
Already Exist: marsdota2league2016autumn
Already Exist: worldcyberarena2016chinesequalifierss3
Already Exist: worldelectronicsportsgames2016europecisfinals
Already Exist: dreamleagueseason6leagueplay
Already Exist: nanyangdota2championshipscruisecup1
Already Exist: dota2aceprovisional
Already Exist: faceitinvitational
Already Exist: worldelectronicsportsgames2016americasfinals
Already Exist: rogmasters2016
Already Exist: northernarenabeatinvitational
Already Exist: worldelectronicsportsgames2016asiapacificfinals
Already Exist: worldcyberarena2016amqualifiers
Already Exist: thesummit6
Already Exist: asusrogdreamleagueseason6
Already Exist: thebostonmajor2016
Already Exist: worldcyberarena2016
Already Exist: chinatop2016
Already Exist: eslonegenting2017
Already Exist: worldelectronicsportsgames2016
Already Exist: dotapitleagueseason5
Already Exist: eli

In [None]:
# Show the tournament title currently held in turl (the value left behind by
# whichever loop or cell assigned it last).
turl

In [None]:
# Build tourleft: a copy of alltour without the entries up to and including
# 'Shanghai Dota 2 Open/1', in alltour's iteration order.
# If that key is absent, every entry is removed and tourleft ends up empty.
tourleft = alltour.copy()
for k, v in alltour.items():
    del tourleft[k]
    if k == 'Shanghai Dota 2 Open/1':
        # Everything after this key stays in tourleft.
        break

In [None]:
# Print every tournament key of alltour, one per line, in iteration order.
for k, v in alltour.items():
    print(k)

In [None]:
# Show the info dict stored for one tournament key.
alltour['Shanghai Dota 2 Open/1']

In [None]:
# Show the tournaments remaining in tourleft.
tourleft

In [None]:
def correction(urllist):
    '''
    Re-scrape the given tournaments and overwrite their cached pickles.

    input: urllist - iterable of tournament titles as passed to parseSquad;
                     a title may be percent-encoded by hand (e.g. '%26'
                     for '&')
    output: None; for every title a pickle with the keys 'wikitext',
            'startelodict' and 'endelodict' is written via save_obj
    raises: KeyError when a title is in alltour neither as given nor in its
            percent-decoded form
    '''
    from urllib.parse import unquote

    for turl in urllist:
        # alltour is keyed by the plain title ('... Radiant & Dire ...')
        # while the URL form may carry '%26', so fall back to the decoded
        # title when the literal one is not a key.
        ifd = alltour[turl] if turl in alltour else alltour[unquote(turl)]
        ptc, lqjson = parseSquad(turl)
        startelodict, endelodict = getEloDict(ifd)
        save_obj({'wikitext': lqjson, 'startelodict': startelodict, 'endelodict': endelodict}, re.sub(r'\W+', '', ifd['title']).lower() )

In [None]:
# Manual one-off re-scrape of a single tournament.
# The title sent to the API has '&' written as '%26', while the alltour key
# uses the literal '&', hence the two different strings.
turl = 'Dota 2 Radiant %26 Dire Cup/2015'
ifd = alltour['Dota 2 Radiant & Dire Cup/2015']
ptc, lqjson = parseSquad(turl)
startelodict, endelodict = getEloDict(ifd)
# Cache the raw responses under the normalised tournament title.
save_obj({'wikitext': lqjson, 'startelodict': startelodict, 'endelodict': endelodict}, re.sub(r'\W+', '', ifd['title']).lower() )  

In [None]:
# Titles to re-scrape with correction(); '&' is written as '%26' here.
urllist = ['Dota 2 Radiant %26 Dire Cup/2015']

In [None]:
# Re-scrape and re-cache every tournament listed in urllist.
correction(urllist)

In [None]:
# Number of tournaments in alltour.
len(alltour)

In [None]:
# Counter reset.
# NOTE(review): no visible cell increments i - confirm it is still needed.
i = 0

In [None]:
# List the tournaments that start after endti2 and end on or before endti7.
# The dates are parsed with the same explicit day/month/year format as the
# scraping loop; without it an ambiguous string such as '03/04/2015' is read
# month-first and the two cells disagree about the window.
for turl, ifd in alltour.items():
    if (pd.to_datetime(ifd['start'], format='%d/%m/%Y') > pd.to_datetime(endti2)) and (pd.to_datetime(ifd['end'], format='%d/%m/%Y') <= pd.to_datetime(endti7)):
        print(turl)

In [None]:
# Show the current value of the counter i.
i