In [5]:
from __future__ import division, print_function
import os
import preprocessing
import sys
import numpy as np
import pandas as pd

In [2]:
# add ../data (relative to the working directory) to the import path so that
# the build_db module can be imported below
data_dir = os.path.join('..', 'data')
sys.path.append(data_dir)
from build_db import connect

In [3]:
# connect to db
# Look up the entry for `db_name` in ~/.pgpass (one host:port:db:user:password
# record per line) and open a connection with those credentials.
db_name = 'dota_db'
with open(os.path.expanduser('~/.pgpass')) as f:
    for line in f:
        line = line.strip()
        # skip blank lines and comments instead of failing to unpack them
        if not line or line.startswith('#'):
            continue
        # split on the first four colons only, so a ':' inside the password
        # field no longer breaks the 5-way unpack (escapes are not decoded)
        fields = [x.strip() for x in line.split(':', 4)]
        if len(fields) != 5:
            continue
        host, port, db, user, password = fields
        if db == db_name:
            con, meta = connect(user=user, password=password, db=db, host=host, port=port)
            break
    else:
        # no matching entry: fail here with a clear message rather than with
        # a NameError on `con` in a later cell
        raise RuntimeError('no entry for database %r found in ~/.pgpass' % db_name)

In [7]:
# total number of rows currently in the matches table
pd.read_sql('SELECT COUNT(*) FROM matches', con).at[0, 'count']

32565

# Data Background
Our data consists of matches played from 08/20/2017 to 09/26/2017, i.e. during patch 7.06f. At the time of writing we have 32,565 matches and counting.

In [8]:
# loading toy data set: 1000 matches in random order, so the sample
# (and every output below) differs from run to run
toy_query = 'SELECT * FROM matches ORDER BY RANDOM() LIMIT 1000;'
df_toy = pd.read_sql(toy_query, con)

In [9]:
# list the columns available for each sampled match
df_toy.columns

Index([u'match_id', u'duration', u'game_mode', u'match_seq_num', u'picks_bans',
       u'players', u'radiant_win', u'start_time'],
      dtype='object')

In [11]:
# confirm that all games are cm (Captains Mode) games: only one game_mode value should appear
df_toy.loc[:, 'game_mode'].value_counts()

2    1000
Name: game_mode, dtype: int64

In [13]:
# confirm that matches are unique: the number of distinct ids should equal the sample size
np.unique(np.asarray(df_toy['match_id'])).shape

(1000,)

In [22]:
# check whether picks_bans always has 20 entries by listing the distinct lengths
np.unique(df_toy['picks_bans'].map(len).values)

array([10, 15, 17, 18, 19, 20])

In [27]:
# examining partial picks_bans: take the first draft with only 15 entries
example = df_toy.loc[df_toy['picks_bans'].map(len) == 15, 'picks_bans'].iloc[0]
example

[{u'hero_id': 57, u'is_pick': False, u'order': 0, u'team': 0},
 {u'hero_id': 81, u'is_pick': False, u'order': 1, u'team': 0},
 {u'hero_id': 39, u'is_pick': True, u'order': 2, u'team': 1},
 {u'hero_id': 75, u'is_pick': True, u'order': 3, u'team': 0},
 {u'hero_id': 4, u'is_pick': True, u'order': 4, u'team': 0},
 {u'hero_id': 61, u'is_pick': True, u'order': 5, u'team': 1},
 {u'hero_id': 34, u'is_pick': False, u'order': 6, u'team': 0},
 {u'hero_id': 93, u'is_pick': False, u'order': 7, u'team': 0},
 {u'hero_id': 33, u'is_pick': True, u'order': 8, u'team': 0},
 {u'hero_id': 53, u'is_pick': True, u'order': 9, u'team': 1},
 {u'hero_id': 74, u'is_pick': True, u'order': 10, u'team': 0},
 {u'hero_id': 114, u'is_pick': True, u'order': 11, u'team': 1},
 {u'hero_id': 1, u'is_pick': False, u'order': 12, u'team': 0},
 {u'hero_id': 17, u'is_pick': True, u'order': 13, u'team': 1},
 {u'hero_id': 102, u'is_pick': True, u'order': 14, u'team': 0}]

Some games have fewer than 10 bans for some reason, but all have 10 picks. When we build our model, we can keep 10 ban slots per game if we allow some of them to be None. In some games Dire picks first rather than Radiant.

In [24]:
# how many games have a full 20-entry picks_bans list?
df_toy.loc[df_toy['picks_bans'].map(len) == 20, 'picks_bans'].shape

(980,)

In [31]:
# keep only the draft and the outcome, indexed by match_id
df_toy_pb = df_toy.set_index('match_id')[['picks_bans', 'radiant_win']]
df_toy_pb.head()

Unnamed: 0_level_0,picks_bans,radiant_win
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3443069468,"[{u'is_pick': False, u'hero_id': 87, u'order':...",True
3389751992,"[{u'is_pick': False, u'hero_id': 75, u'order':...",False
3391239235,"[{u'is_pick': False, u'hero_id': 37, u'order':...",True
3441934391,"[{u'is_pick': False, u'hero_id': 75, u'order':...",True
3390656894,"[{u'is_pick': False, u'hero_id': 50, u'order':...",False


<table cellpadding="4">
<caption><b>Captains Mode picking order</b><br>(<i>This example assumes <a href="/File:Radiant_icon.png" class="image"><img alt="Radiant icon.png" src="https://dota2.gamepedia.com/media/dota2.gamepedia.com/thumb/2/2a/Radiant_icon.png/16px-Radiant_icon.png?version=ef34c7f53dc1972609aef0114763ede1" width="16" height="16" srcset="https://dota2.gamepedia.com/media/dota2.gamepedia.com/thumb/2/2a/Radiant_icon.png/24px-Radiant_icon.png?version=ef34c7f53dc1972609aef0114763ede1 1.5x, https://dota2.gamepedia.com/media/dota2.gamepedia.com/thumb/2/2a/Radiant_icon.png/32px-Radiant_icon.png?version=ef34c7f53dc1972609aef0114763ede1 2x"></a> <span style="color:#42A51C">Radiant</span> is the starting team. Invert teams if <a href="/File:Dire_icon.png" class="image"><img alt="Dire icon.png" src="https://dota2.gamepedia.com/media/dota2.gamepedia.com/thumb/0/0e/Dire_icon.png/16px-Dire_icon.png?version=c5abc3405cda4390e2338d8eb787e38d" width="16" height="16" srcset="https://dota2.gamepedia.com/media/dota2.gamepedia.com/thumb/0/0e/Dire_icon.png/24px-Dire_icon.png?version=c5abc3405cda4390e2338d8eb787e38d 1.5x, https://dota2.gamepedia.com/media/dota2.gamepedia.com/thumb/0/0e/Dire_icon.png/32px-Dire_icon.png?version=c5abc3405cda4390e2338d8eb787e38d 2x"></a> <span style="color:#DE3909">Dire</span> is the starting team.</i>)
</caption>
<tbody><tr>
<td style="color:#EEE; background:#42A51C"><b>Ban</b>
</td>
<td style="color:#EEE; background:#DE3909"><b>Ban</b>
</td>
<td style="color:#EEE; background:#42A51C"><b>Ban</b>
</td>
<td style="color:#EEE; background:#DE3909"><b>Ban</b>
</td>
<td>
</td>
<td style="color:#EEE; background:#42A51C"><b>Pick</b>
</td>
<td style="color:#EEE; background:#DE3909"><b>Pick</b>
</td>
<td style="color:#EEE; background:#DE3909"><b>Pick</b>
</td>
<td style="color:#EEE; background:#42A51C"><b>Pick</b>
</td>
<td>
</td>
<td style="color:#EEE; background:#DE3909"><b>Ban</b>
</td>
<td style="color:#EEE; background:#42A51C"><b>Ban</b>
</td>
<td style="color:#EEE; background:#DE3909"><b>Ban</b>
</td>
<td style="color:#EEE; background:#42A51C"><b>Ban</b>
</td>
<td>
</td>
<td style="color:#EEE; background:#DE3909"><b>Pick</b>
</td>
<td style="color:#EEE; background:#42A51C"><b>Pick</b>
</td>
<td style="color:#EEE; background:#DE3909"><b>Pick</b>
</td>
<td style="color:#EEE; background:#42A51C"><b>Pick</b>
</td>
<td>
</td>
<td style="color:#EEE; background:#DE3909"><b>Ban</b>
</td>
<td style="color:#EEE; background:#42A51C"><b>Ban</b>
</td>
<td>
</td>
<td style="color:#EEE; background:#42A51C"><b>Pick</b>
</td>
<td style="color:#EEE; background:#DE3909"><b>Pick</b>
</td></tr></tbody></table>

# Preprocessing

In [94]:
# parse a draft that is one entry short (19 instead of 20) to see how the gap is handled
example = df_toy.loc[df_toy['picks_bans'].map(len) == 19, 'picks_bans'].iloc[0]
parsed = preprocessing.parse_pb(example)

In [95]:
# show what parse_pb returned for the 19-entry example
parsed

{'t1_bans': (4, 2, 35, 99, 47),
 't1_picks': (75, 37, 39, 12, 21),
 't2_bans': (93, 50, 97, 71, None),
 't2_picks': (87, 27, 55, 56, 43),
 'team1': 0,
 'team2': 1}

In [96]:
# show the raw picks_bans entries that were passed to parse_pb, for comparison
example

[{u'hero_id': 4, u'is_pick': False, u'order': 0, u'team': 0},
 {u'hero_id': 93, u'is_pick': False, u'order': 1, u'team': 1},
 {u'hero_id': 2, u'is_pick': False, u'order': 2, u'team': 0},
 {u'hero_id': 50, u'is_pick': False, u'order': 3, u'team': 1},
 {u'hero_id': 75, u'is_pick': True, u'order': 4, u'team': 0},
 {u'hero_id': 87, u'is_pick': True, u'order': 5, u'team': 1},
 {u'hero_id': 27, u'is_pick': True, u'order': 6, u'team': 1},
 {u'hero_id': 37, u'is_pick': True, u'order': 7, u'team': 0},
 {u'hero_id': 97, u'is_pick': False, u'order': 8, u'team': 1},
 {u'hero_id': 35, u'is_pick': False, u'order': 9, u'team': 0},
 {u'hero_id': 71, u'is_pick': False, u'order': 10, u'team': 1},
 {u'hero_id': 99, u'is_pick': False, u'order': 11, u'team': 0},
 {u'hero_id': 55, u'is_pick': True, u'order': 12, u'team': 1},
 {u'hero_id': 39, u'is_pick': True, u'order': 13, u'team': 0},
 {u'hero_id': 56, u'is_pick': True, u'order': 14, u'team': 1},
 {u'hero_id': 12, u'is_pick': True, u'order': 15, u'team': 

In [98]:
# time parse_pb applied to every draft in the toy sample
%timeit df_toy['picks_bans'].map(preprocessing.parse_pb)

1 loop, best of 3: 3.62 s per loop


In [99]:
# apply parse_pb to every draft in the toy sample, keeping df_toy's index
pb_series = df_toy['picks_bans'].map(preprocessing.parse_pb)

In [106]:
# pull the fields needed for modelling out of each parsed draft, one Series per field
t1_picks, t2_picks, team1 = (
    pb_series.map(lambda pb, key=key: pb[key])
    for key in ('t1_picks', 't2_picks', 'team1')
)

In [107]:
# gather the extracted fields into a single frame that shares df_toy's index
picks_df = pd.DataFrame(dict(t1_picks=t1_picks, t2_picks=t2_picks, team1=team1))

In [128]:
# attach the parsed pick columns to the raw match columns (both frames share one index)
concat_df = df_toy.join(picks_df)

In [129]:
# https://stackoverflow.com/questions/29034928/pandas-convert-a-column-of-list-to-dummies
def _pick_dummies(picks, prefix):
    """One-hot encode a Series of hero-id sequences, one indicator row per input row.

    picks  -- Series whose values are sequences of hero ids
    prefix -- column prefix; output columns are named '<prefix>_<hero_id>'
    """
    # expand each sequence into columns, then stack into one long Series
    # indexed by (original row label, slot)
    stacked = picks.apply(pd.Series).stack()
    dummies = pd.get_dummies(stacked, prefix=prefix, prefix_sep='_')
    # collapse the per-slot rows back to one row per original label;
    # groupby(level=0).sum() replaces .sum(level=0), which was removed in pandas 2.0
    return dummies.groupby(level=0).sum()


t1_dummies = _pick_dummies(picks_df['t1_picks'], 't1')
t2_dummies = _pick_dummies(picks_df['t2_picks'], 't2')

In [130]:
# XOR flips radiant_win on the rows where team1 == 1 and leaves the rest unchanged
# (presumably team id 1 is Dire, so the result is "did team1 win" -- confirm against parse_pb)
concat_df['team1_win'] = concat_df['radiant_win'] ^ concat_df['team1'].eq(1)

In [132]:
# assemble the final table: match id and label first, then both teams' hero indicators
full_df = pd.concat(
    [concat_df.loc[:, ['match_id', 'team1_win']], t1_dummies, t2_dummies],
    axis=1,
)

Unnamed: 0,match_id,team1_win,t1_1,t1_2,t1_3,t1_4,t1_5,t1_6,t1_7,t1_8,...,t2_104,t2_106,t2_107,t2_108,t2_109,t2_110,t2_111,t2_112,t2_113,t2_114
0,3443069468,False,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3389751992,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3391239235,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3441934391,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3390656894,False,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [138]:
# run the packaged preprocessing.preprocess on the raw toy sample
# (presumably the same steps as the manual cells above -- compare the two outputs)
processed_df = preprocessing.preprocess(df_toy)

In [139]:
# preview the first rows of the preprocessed frame
processed_df.head()

Unnamed: 0,match_id,team1_win,t1_1,t1_2,t1_3,t1_4,t1_5,t1_6,t1_7,t1_8,...,t2_104,t2_106,t2_107,t2_108,t2_109,t2_110,t2_111,t2_112,t2_113,t2_114
0,3443069468,False,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3389751992,False,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3391239235,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3441934391,True,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3390656894,False,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
