In [1]:
from datetime import datetime
import requests
import numpy as np
import pandas as pd
import yaml
import re
from bs4 import BeautifulSoup
import wikitextparser as wtp
from ratelimit import rate_limited
from operator import *

def save_obj(obj, name):
    """Pickle ``obj`` into the file ``obj/<name>.pkl``.

    The path is built by plain string concatenation, so ``name`` may carry
    sub-directories (``'/dict/foo'`` becomes ``obj//dict/foo.pkl``). The
    target directory is not created here; ``open`` fails if it is missing.
    """
    import pickle
    target_path = ''.join(['obj/', name, '.pkl'])
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_obj(name):
    """Return the object unpickled from ``obj/<name>.pkl``.

    Counterpart of ``save_obj``; the path is the literal concatenation
    ``'obj/' + name + '.pkl'``.
    """
    import pickle
    source_path = ''.join(['obj/', name, '.pkl'])
    with open(source_path, 'rb') as source:
        # NOTE: unpickling can execute arbitrary code; only load files you created.
        payload = pickle.load(source)
    return payload

In [155]:
def parseSquad(tourdict):
    '''
    Extract team rosters and prize-pool results from a tournament's wikitext.

    input: tournament dict; ``tourdict['wikitext']['parse']['wikitext']['*']``
           must hold the raw wikitext string
    output: tuple ``(squad, prize)``
        squad: {team: {'pos1'..'pos5': player id}} read from ``{{TeamCard|``
               templates; an id gets the suffix ' haslink' when the card also
               has a matching ``pNlink`` argument
        prize: {team: {'usdprize': ..., 'place': ...}} read from templates
               whose text starts with ``{{prize pool`` (case-insensitive)

    Uses the module-level tables ``teamEasy`` (prize-pool team name ->
    replacement name) and ``AlternateIdDict`` (player id -> replacement id);
    names missing from those tables are used unchanged.
    '''
    # Sentinel for a slot that carries no place / usdprize argument; this is
    # the same value compareAndPut uses when it cannot resolve a placement.
    unknown = '0.00'

    response = tourdict['wikitext']
    wiki = response.get('parse').get('wikitext').get('*')
    wiki = re.sub('\n', '', wiki)  # newlines removed so '{{TeamCard\n|' matches '{{TeamCard|'
    squad = {}
    prize = {}
    for t in wtp.parse(wiki).templates:
        text = str(t)
        if text[:12].lower() == '{{prize pool':
            # Reset per template: previously a slot without its own
            # place/usdprize silently reused the previous slot's values (or
            # raised UnboundLocalError on the first template).
            place = unknown
            usdprize = unknown
            for ar in t.arguments:
                raw = str(ar)
                if raw[:6] == '|place':
                    place = ar.value
                if raw[:9] == '|usdprize':
                    usdprize = ar.value
            for ar in t.arguments:
                # Positional arguments render as '|value'; each one is a team.
                if str(ar)[1:] == ar.value:
                    teamshort = re.sub(r' ', '', ar.value)
                    teamSim = teamEasy.get(teamshort, teamshort)
                    teamSim = re.sub(r'\W+|_', '', teamSim).lower()
                    prize[teamSim] = {'usdprize': usdprize, 'place': place}
        if text[:11] == '{{TeamCard|':
            for ar in t.arguments:
                raw = str(ar)
                # NOTE(review): the prefix test also matches arguments such as
                # '|teamlink' or '|team2', which would restart the roster
                # under another key -- confirm against the wiki markup.
                if raw[:5] == '|team':
                    teamname = re.sub(r'\W+', '', ar.value).lower()
                    squad[teamname] = {}
                # NOTE(review): a '|pN=' argument appearing before '|team'
                # writes to the previous card's team (or fails on the first
                # card); behaviour kept as in the original.
                for i in range(1, 6):
                    slot = 'pos' + str(i)
                    if raw[:4] == '|p' + str(i) + '=':
                        alias = re.sub(r'\W+', '', ar.value).lower()
                        pid = AlternateIdDict.get(alias, alias)
                        squad[teamname][slot] = re.sub(r'\W+', '', pid).lower()
                    if raw[:7] == '|p' + str(i) + 'link':
                        squad[teamname][slot] = squad[teamname][slot] + ' haslink'
    return squad, prize

In [156]:
# Load the pickled lookup tables; load_obj reads each one from 'obj/' + name + '.pkl'.
# alltour: iterated with .items() in the main loop; each value supplies 'title', 'prize', 'start', 'end'.
alltour = load_obj('/dict/touranddate')
# comparedict: looked up in compareAndPut with the squad's team key to get the team name used for matching.
comparedict = load_obj('/dict/lqtodat')
# AlternateIdDict: looked up in parseSquad with the normalised player id.
AlternateIdDict = load_obj('/dict/AlternateIdDict')
# teamEasy: looked up in parseSquad with the space-stripped prize-pool team name.
teamEasy = load_obj('/dict/teamEasy2')

In [177]:
# Manual check on one tournament: load its cached dict (file name is the
# title with non-word characters removed, lower-cased) and parse it.
tourdict = load_obj('forglicko/' + re.sub(r'\W+', '', 'Dota2 Professional League Season 2 - Secondary').lower() )
# The next five lines repeat the first steps of parseSquad, presumably so the
# intermediate objects (wiki, parsed, templates) can be inspected by hand.
json = tourdict['wikitext']
wiki = json.get('parse').get('wikitext').get('*')
wiki = re.sub('\n', '', wiki)
parsed = wtp.parse(wiki)
templates = parsed.templates
squad, prize = parseSquad(tourdict)

In [180]:
# Manual alias patch: parseSquad will map the prize-pool name 'forthedream' to 'ftdcluba'.
# The change lives only in memory; it is not written back to the pickle here.
teamEasy['forthedream'] = 'ftdcluba'

In [48]:
def getEloDict(tourdict):
    """Return the tournament's ``(startelodict, endelodict)`` pair.

    Both values are read unchanged from ``tourdict``; a missing key raises
    ``KeyError``.
    """
    start_ratings = tourdict['startelodict']
    end_ratings = tourdict['endelodict']
    return start_ratings, end_ratings

In [170]:
# Sanity check of the substring rule used in compareAndPut: with 'team'
# removed, 'teamvgj' is contained in 'vgjthunder'.
re.sub(r'team', '', 'teamvgj') in 'vgjthunder'

True

In [162]:
def noTeam(text):
    """Return ``text`` with every occurrence of the substring 'team' removed."""
    team_pattern = re.compile(r'team')
    return team_pattern.sub('', text)

In [171]:
def _normTeam(name):
    """Lower-case ``name`` after removing every run of non-word characters."""
    return re.sub(r'\W+', '', name).lower()


def _groupByTeam(entries):
    """Map normalised team name -> list of rating entries, keeping input order."""
    grouped = {}
    for entry in entries:
        grouped.setdefault(_normTeam(entry['teamName']), []).append(entry)
    return grouped


def _playerRow(teamName, pos, pid, ifd, place, usdprize, srt=None, ert=None):
    """Build one output row; a rating side that is None yields NaN fields."""
    def rating(entry, system, field):
        return np.nan if entry is None else entry[system][field]

    return {
        'team': teamName,
        'pos': pos,
        'id': pid,
        'tour': ifd['title'],
        'prizepool': ifd['prize'],
        'place': place,
        'prizeusd': usdprize,
        'start': ifd['start'],
        'end': ifd['end'],
        'startelo': rating(srt, 'elo64', 'current'),
        'startglicko2mu': rating(srt, 'glicko2', 'mu'),
        'startglicko2phi': rating(srt, 'glicko2', 'phi'),
        'startglicko2rating': rating(srt, 'glicko2', 'rating'),
        'endelo': rating(ert, 'elo64', 'current'),
        'endglicko2mu': rating(ert, 'glicko2', 'mu'),
        'endglicko2phi': rating(ert, 'glicko2', 'phi'),
        'endglicko2rating': rating(ert, 'glicko2', 'rating'),
    }


def compareAndPut(df, squad, prize, ifd, startelodict, endelodict):
    """Append one row per (team, roster position) of a tournament to ``df``.

    Parameters
    ----------
    df : DataFrame the new rows are appended to (not modified in place).
    squad : {team key: {position: player id}}, as returned by parseSquad.
    prize : {team name: {'place': ..., 'usdprize': ...}}, as returned by parseSquad.
    ifd : tournament info; 'title', 'prize', 'start' and 'end' are copied
        into every row.
    startelodict, endelodict : iterables of rating entries, each with
        'teamName', ['elo64']['current'] and ['glicko2']['mu'/'phi'/'rating'].

    Returns
    -------
    DataFrame with the old rows followed by the new ones and a fresh
    0..n-1 index. ``df`` itself is returned when nothing was added.

    Behaviour
    ---------
    * The squad key is translated through the module-level ``comparedict``
      (unchanged when absent).
    * Place and prize come from ``prize[team]``. Otherwise they default to
      '0.00' and are overwritten by any prize key that contains / is contained
      in the team name, with or without the substring 'team' (``noTeam``);
      the last matching key wins.
    * Every (start entry, end entry) pair whose normalised 'teamName' equals
      the team name yields rows with ratings. Without such a pair the rows
      get NaN ratings and empty player ids are skipped.
    """
    # Index the rating entries once instead of rescanning start x end (with a
    # regex per comparison) for every team.
    startsByTeam = _groupByTeam(startelodict)
    endsByTeam = _groupByTeam(endelodict)

    # Rows are collected and concatenated once: DataFrame.append copied the
    # whole frame per row and no longer exists in pandas >= 2.0.
    rows = []
    for lqn, pdc in squad.items():
        teamName = comparedict.get(lqn, lqn)
        try:
            place = prize[teamName]['place']
            usdprize = prize[teamName]['usdprize']
        except KeyError:
            place = '0.00'
            usdprize = '0.00'
            # NOTE(review): an empty prize key (or the key 'team') matches
            # every team because '' is a substring of any string.
            for pk in prize.keys():
                if (pk in teamName) or (teamName in pk) or (noTeam(pk) in noTeam(teamName)) or (noTeam(teamName) in noTeam(pk)):
                    place = prize[pk]['place']
                    usdprize = prize[pk]['usdprize']

        matched = False
        for srt in startsByTeam.get(teamName, []):
            for ert in endsByTeam.get(teamName, []):
                matched = True
                for pos, pid in pdc.items():
                    rows.append(_playerRow(teamName, pos, pid, ifd, place, usdprize, srt, ert))
        if not matched:
            for pos, pid in pdc.items():
                if pid == '':
                    continue
                rows.append(_playerRow(teamName, pos, pid, ifd, place, usdprize))

    if not rows:
        return df
    new_rows = pd.DataFrame(rows)
    if len(df) == 0:
        # Keep df's column order (plus any column only the new rows have)
        # without concatenating an empty frame.
        columns = list(df.columns) + [c for c in new_rows.columns if c not in df.columns]
        return new_rows.reindex(columns=columns)
    return pd.concat([df, new_rows], ignore_index=True, sort=False)

In [172]:
# Re-load the four pickled lookup tables (same statements as the earlier load cell).
# Re-running this discards in-memory edits such as manual teamEasy patches.
alltour = load_obj('/dict/touranddate')
comparedict = load_obj('/dict/lqtodat')
AlternateIdDict = load_obj('/dict/AlternateIdDict')
teamEasy = load_obj('/dict/teamEasy2')

In [173]:
# Date window for the main loop: a tournament is processed when it starts
# strictly after endti2 and ends on or before endti7.
# NOTE(review): the names suggest the end dates of The International 2 and 7 -- confirm.
endti2 = '2012-09-02'
endti7 = '2017-08-12'

In [181]:
%%time
# Build the player-level table: one row per (tournament, team, roster slot)
# for every tournament inside the (endti2, endti7] date window.
errortitle = []  # keys of tournaments whose processing raised an exception
df = pd.DataFrame(columns=['team', 'pos', 'id', 'tour', 'prizepool', 'place', 'prizeusd', 'start', 'end',
                           'startelo', 'startglicko2mu', 'startglicko2phi', 'startglicko2rating', 'endelo', 'endglicko2mu', 'endglicko2phi', 'endglicko2rating'])
# The bounds are constants, so parse them once instead of on every iteration.
window_start = pd.to_datetime(endti2)
window_end = pd.to_datetime(endti7)
for turl, ifd in alltour.items():
    try:
        tour_start = pd.to_datetime(ifd['start'], format='%d/%m/%Y')
        tour_end = pd.to_datetime(ifd['end'], format='%d/%m/%Y')
        if not (tour_start > window_start and tour_end <= window_end):
            continue
        try:
            tourdict = load_obj('forglicko/' + re.sub(r'\W+', '', ifd['title']).lower() )
        except Exception:
            # Best effort: tournaments without a loadable cache file are skipped.
            continue
        ptc, prz = parseSquad(tourdict)
        startelodict, endelodict = getEloDict(tourdict)
        df = compareAndPut(df, ptc, prz, ifd, startelodict, endelodict)
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C still interrupts the run.
        print(turl)
        # list.append returns None; the former `errortitle = errortitle.append(turl)`
        # replaced the list with None and made the next failure crash here.
        errortitle.append(turl)

Wall time: 1min 36s


In [184]:
# Display the whole team alias table (the repr is long; it has several hundred entries).
teamEasy

{'.aspera.': 'aspera',
 '1stvn': '1stvn',
 '3dclan': '3dclan',
 '3dmax': '3dmax',
 '4 friends + chrillee': '4friendschrillee',
 '4anchorsseacaptain': '4anchorsseacaptain',
 '4cloverlepricon': '4cloverlepricon',
 '4fc': '4friendschrillee',
 '4p5': '4protectfive',
 '4vikingsdane': '4vikingsdane',
 '5eva.': '5eva',
 '5inq': '5inqo',
 '5jungz': '5jungz',
 '90s': '90sgaming',
 '???': 'redefiningmadness',
 'ASUS.polar': 'asuspolar',
 'Fiskestanga': 'fiskestanga',
 'Flip.sid3 Tactics': 'flipsid3tactics',
 'MUFC': 'mufc',
 'ROOT Gaming': 'rootgaming',
 'Team Finland': 'finland',
 'aaa': 'teamaaa',
 'abc': 'applebananacucumber',
 'absolutelegends': 'absolutelegends',
 'aces': 'acesgaming',
 'acionarena': 'acionarena',
 'adfinem': 'adfinem',
 'affection': 'affection',
 'aftershock': 'aftershockgaming',
 'ahead': 'aheadgaming',
 'ahead.kz': 'aheadgaming',
 'al': 'absolutelegends',
 'alternate': 'alternateattax',
 'ancientwarriors': 'ancientwarriors',
 'animal kingdom': 'kingdomdota',
 'anji': 'cy

In [182]:
# Number of distinct teams that still carry the '0.00' placeholder placement.
unplaced_teams = df.loc[df['place'] == '0.00', 'team'].unique()
len(unplaced_teams)

90

In [144]:
# Rows with the '0.00' placeholder placement that nevertheless have a start
# rating, reduced to one row per (team, tournament).
no_place = df['place'] == '0.00'
has_start_rating = df['startelo'].notnull()
df[no_place & has_start_rating].drop_duplicates(['team', 'tour'])

Unnamed: 0,team,pos,id,tour,prizepool,place,prizeusd,start,end,startelo,startglicko2mu,startglicko2phi,startglicko2rating,endelo,endglicko2mu,endglicko2phi,endglicko2rating
105,absolutelegends,pos1,comewithme,StarLadder StarSeries Season 3,15000,0.0,0.0,17/09/2012,21/10/2012,966.952981,1569.803678,69.256178,1396.663234,1074.657967,1601.045548,68.869853,1428.870915
1946,quanticgaming,pos1,paris,RaidCall EMS One Summer Season,35000,0.0,0.0,19/06/2013,14/07/2013,1287.303037,1754.629711,62.89718,1597.386762,1033.806893,1691.628221,56.749221,1549.755168
1971,alliance,pos1,loda,RaidCall EMS One Summer Season,35000,0.0,0.0,19/06/2013,14/07/2013,1435.67634,1859.546032,35.62236,1770.490131,1383.078043,1859.767365,34.226496,1774.201125
1981,4friendschrillee,pos1,strangby,RaidCall EMS One Summer Season,35000,0.0,0.0,19/06/2013,14/07/2013,1069.343115,1597.397177,52.022019,1467.342129,958.811598,1558.745927,50.373164,1432.813017
3451,superstrongdinosaurs,pos1,sneyking,Netolic Pro League #4 West,10000,0.0,0.0,04/12/2013,12/12/2013,1192.149559,1757.036317,61.27149,1603.857592,1183.967678,1742.756167,59.406079,1594.240969
3626,firstdeparture,pos1,chibix,Asian Cyber Games,30000,0.0,0.0,27/12/2013,29/12/2013,1197.219201,1685.632556,53.396601,1552.141055,1232.284973,1690.947476,50.018212,1565.901945
5276,mvpphoenix,pos1,march haslink,The International 2014,10923977,0.0,0.0,08/07/2014,21/07/2014,1120.723836,1734.114424,43.767038,1624.69683,1106.668213,1734.114424,46.190279,1618.638726
5281,cisgame,pos1,black,The International 2014,10923977,0.0,0.0,08/07/2014,21/07/2014,1035.834722,1604.60769,52.315209,1473.819668,987.88149,1604.60769,54.356509,1468.716418
5331,virtuspro,pos1,illidan,The International 2014,10923977,0.0,0.0,08/07/2014,21/07/2014,1078.389525,1633.691578,42.254745,1528.054714,1020.115306,1633.691578,44.798671,1521.694899
5866,teamempire,pos1,silent,ASUS ROG DreamLeague Season 2,115560,0.0,0.0,06/10/2014,29/11/2014,1128.216105,1824.331128,36.611157,1732.803236,994.481579,1807.859016,38.802532,1710.852685


In [129]:
# Manual alias patch: parseSquad will map the prize-pool name 'fnatic' to 'fnaticeu'.
# NOTE(review): the KeyError recorded below cannot come from an assignment to a
# plain dict, so the saved output presumably belongs to an earlier version of this cell.
teamEasy['fnatic'] = 'fnaticeu'

KeyError: 'fnatic'

In [62]:
# Number of entries in the team alias table.
len(teamEasy)

833

In [54]:
# Inspect ptc, the squad dict returned by parseSquad ({team: {'posN': player id}}).
ptc

{'hellraisers': {'pos1': 'rmn',
  'pos2': 'afoninje',
  'pos3': 'gorec',
  'pos4': 'goddam',
  'pos5': 'dread'},
 'invictusgaming': {'pos1': 'burning',
  'pos2': 'ferrari_430',
  'pos3': 'luo',
  'pos4': 'chuan',
  'pos5': 'faith'},
 'summersrift': {'pos1': 'bananaslamjamma',
  'pos2': 'brax',
  'pos3': 'ixmike88',
  'pos4': 'demon',
  'pos5': 'whitebeard'},
 'teammalaysia': {'pos1': 'kyxy',
  'pos2': 'kecikimba',
  'pos3': 'ohaiyo',
  'pos4': 'johnny',
  'pos5': 'mushi'},
 'teamsecret': {'pos1': 'arteezy',
  'pos2': 's4',
  'pos3': 'zai',
  'pos4': 'puppey',
  'pos5': 'kuroky'}}

In [55]:
# Inspect prz, the prize dict returned by parseSquad ({team: {'place', 'usdprize'}}).
prz

{'hellraisers': {'place': '4', 'usdprize': '8,798'},
 'invictusgaming': {'place': '2', 'usdprize': '25,239'},
 'summersrift': {'place': '5', 'usdprize': '5,000'},
 'teammalaysia': {'place': '3', 'usdprize': '10,697'},
 'teamsecret': {'place': '1', 'usdprize': '38,561'}}

In [56]:
# Display the assembled player table (one row per team and roster position).
df

Unnamed: 0,team,pos,id,tour,prizepool,place,prizeusd,start,end,startelo,startglicko2mu,startglicko2phi,startglicko2rating,endelo,endglicko2mu,endglicko2phi,endglicko2rating
0,teammalaysia,pos1,kyxy,2015 Red Bull Battle Grounds: Dota 2,88290,3,10697,06/04/2015,10/05/2015,1218.115314,1877.024101,55.925833,1737.209517,1086.465776,1877.482587,46.073665,1762.298425
1,teammalaysia,pos2,kecikimba,2015 Red Bull Battle Grounds: Dota 2,88290,3,10697,06/04/2015,10/05/2015,1218.115314,1877.024101,55.925833,1737.209517,1086.465776,1877.482587,46.073665,1762.298425
2,teammalaysia,pos3,ohaiyo,2015 Red Bull Battle Grounds: Dota 2,88290,3,10697,06/04/2015,10/05/2015,1218.115314,1877.024101,55.925833,1737.209517,1086.465776,1877.482587,46.073665,1762.298425
3,teammalaysia,pos4,johnny,2015 Red Bull Battle Grounds: Dota 2,88290,3,10697,06/04/2015,10/05/2015,1218.115314,1877.024101,55.925833,1737.209517,1086.465776,1877.482587,46.073665,1762.298425
4,teammalaysia,pos5,mushi,2015 Red Bull Battle Grounds: Dota 2,88290,3,10697,06/04/2015,10/05/2015,1218.115314,1877.024101,55.925833,1737.209517,1086.465776,1877.482587,46.073665,1762.298425
5,invictusgaming,pos1,burning,2015 Red Bull Battle Grounds: Dota 2,88290,2,25239,06/04/2015,10/05/2015,1192.221152,1913.478548,37.402883,1819.971339,1042.873249,1916.930558,35.472331,1828.24973
6,invictusgaming,pos2,ferrari_430,2015 Red Bull Battle Grounds: Dota 2,88290,2,25239,06/04/2015,10/05/2015,1192.221152,1913.478548,37.402883,1819.971339,1042.873249,1916.930558,35.472331,1828.24973
7,invictusgaming,pos3,luo,2015 Red Bull Battle Grounds: Dota 2,88290,2,25239,06/04/2015,10/05/2015,1192.221152,1913.478548,37.402883,1819.971339,1042.873249,1916.930558,35.472331,1828.24973
8,invictusgaming,pos4,chuan,2015 Red Bull Battle Grounds: Dota 2,88290,2,25239,06/04/2015,10/05/2015,1192.221152,1913.478548,37.402883,1819.971339,1042.873249,1916.930558,35.472331,1828.24973
9,invictusgaming,pos5,faith,2015 Red Bull Battle Grounds: Dota 2,88290,2,25239,06/04/2015,10/05/2015,1192.221152,1913.478548,37.402883,1819.971339,1042.873249,1916.930558,35.472331,1828.24973


    #Clean Data
    # (Same statements as the cleaning code cell at the end of the notebook.)
    # Drop rows without a player id.
    df = df[df['id'] != '']
    # Remove the ' haslink' marker from these two ids only.
    df.loc[(df['id'] == 'g haslink'), 'id'] = 'g'
    df.loc[(df['id'] == 'noone haslink'), 'id'] = 'noone'
    # Collapse every id that starts with 'paparazi' to exactly 'paparazi'.
    df.loc[(df['id'].str[:8] == 'paparazi'), 'id'] = 'paparazi'
    # Turn 'pos1'..'pos5' into the numbers 1..5 (last character of the label).
    df.pos = pd.to_numeric(df.pos.str[-1])

In [190]:
# Earlier Elo-only export, kept for reference:
#    df[['team', 'pos', 'id', 'tour', 'prizeusd', 'start', 'startelo', 'end', 'endelo']].to_csv('elo.csv', encoding='utf-8', index=False)
# Current export: prize information plus the Glicko-2 ratings at tournament start and end.
export_columns = [
    'team', 'pos', 'id', 'tour', 'prizepool', 'place', 'prizeusd',
    'start', 'startglicko2mu', 'startglicko2phi', 'startglicko2rating',
    'end', 'endglicko2mu', 'endglicko2phi', 'endglicko2rating',
]
df[export_columns].to_csv('glickoandprize.csv', encoding='utf-8', index=False)

In [189]:
# Clean the player table before export.
# Drop rows without a player id. `.copy()` makes the result an independent
# frame, so the `.loc` assignments below cannot trigger SettingWithCopyWarning.
df = df[df['id'] != ''].copy()
# Remove the ' haslink' marker from these two ids only.
df.loc[(df['id'] == 'g haslink'), 'id'] = 'g'
df.loc[(df['id'] == 'noone haslink'), 'id'] = 'noone'
# Collapse every id that starts with 'paparazi' to exactly 'paparazi'.
df.loc[(df['id'].str[:8] == 'paparazi'), 'id'] = 'paparazi'
# 'pos1'..'pos5' -> 1..5. Casting to str first keeps the cell re-runnable:
# once `pos` is numeric the `.str` accessor would otherwise raise.
df['pos'] = pd.to_numeric(df['pos'].astype(str).str[-1])