In [1]:
import bz2
import ujson as json

In [2]:
def iterate_matches(matches_filename, report_every=1000):
    """Yield ``(match, line)`` pairs from a bz2-compressed JSON-lines file.

    Parameters
    ----------
    matches_filename : str
        Path to a bz2 archive holding one JSON document per line.
    report_every : int, optional
        Print a progress message each time this many lines have been
        consumed (default 1000). Pass 0 or None to stay silent.

    Yields
    ------
    tuple
        ``(match, line)`` where ``match`` is the decoded JSON value and
        ``line`` is the raw line exactly as it was read from the archive.
    """
    with bz2.BZ2File(matches_filename) as f:
        for n, line in enumerate(f, 1):
            yield json.loads(line), line
            # Reported after the yield, so the message is only printed once
            # the consumer has asked for the next match.
            if report_every and n % report_every == 0:
                # Single-argument call form: prints the same text under the
                # Python 2 print statement and the Python 3 print function.
                print('Processed %d matches' % n)

In [3]:
import pandas as pd

TIME_LIMIT = 300 # 5 min
# Floor division keeps the step an int even under true division
# (Python 3 or `from __future__ import division`); range() needs an int step.
TIME_STEPSIZE = TIME_LIMIT // 5


def _load_id_index(csv_path):
    """Map every value of the CSV's "id" column to its 0-based row position.

    Kept in a function so the loop variables stay local; the earlier
    module-level loops rebound the builtin name `id` for the whole notebook.
    """
    id_index = pd.read_csv(csv_path, index_col="id").index
    return dict((raw_id, position) for position, raw_id in enumerate(id_index))


# id -> dense column index, one lookup table per dictionary file
abilities_dict = _load_id_index("data/dictionaries/abilities.csv")
heroes_dict = _load_id_index("data/dictionaries/heroes.csv")
items_dict = _load_id_index("data/dictionaries/items.csv")

# Widths of the per-id feature groups written to the CSV
MAX_ITEMS = len(items_dict)
MAX_HEROES = len(heroes_dict)
MAX_ABILS = len(abilities_dict)

In [4]:
# Parse func
def _new_team_stats(bucket_times):
    """Return empty per-team accumulators with one counter per time bucket."""
    return {
        'abils': {},
        'items': {},
        'creep_kills': [],
        'xp': [],
        'gold': [],
        'kills': dict((t, 0) for t in bucket_times),
        'wards': dict((t, 0) for t in bucket_times),
    }


def _tally(counts, ids):
    """Increment ``counts[i]`` once for every ``i`` in ``ids``."""
    for i in ids:
        counts[i] = counts.get(i, 0) + 1


def _add_cumulative(counters, events):
    """Count each event in every bucket whose time is >= the event's time."""
    for event in events:
        for bucket_time in counters:
            if event['time'] <= bucket_time:
                counters[bucket_time] += 1


def _in_bucket_order(counters):
    """Return the counter values ordered by ascending bucket time."""
    return [counters[t] for t in sorted(counters)]


def parse_match(match, time_limit=None, time_stepsize=None):
    """Reduce one raw match dict to the early-game features used for the CSV.

    Parameters
    ----------
    match : dict
        Decoded match record. Keys read here: 'lobby_type', 'match_id',
        'start_time', 'times', 'picks_bans', 'players' and, optionally,
        'finish' (for 'radiant_win').
    time_limit : number, optional
        Only entries of ``match['times']`` that are <= this value are used.
        Defaults to the module-level TIME_LIMIT.
    time_stepsize : int, optional
        Spacing between time buckets. Defaults to the module-level
        TIME_STEPSIZE.

    Returns
    -------
    dict or None
        None when no entry of ``match['times']`` is within the limit.
        Otherwise a dict with, for each of the 'radiant_'/'dire_' prefixes:
        'picks' and 'bans' (sets of hero ids), 'abils' and 'items'
        ({id: count}), 'kills' and 'wards' (cumulative counts per bucket),
        'gold', 'xp' and 'creep_kills' (sorted per-player values at the last
        bucket); plus 'fb', 'radiant_win' (1, 0 or None), 'start_time',
        'lobby_type' and 'match_id'.
    """
    if time_limit is None:
        time_limit = TIME_LIMIT
    if time_stepsize is None:
        time_stepsize = TIME_STEPSIZE

    # Main features/target
    lobby_type = match['lobby_type']
    if lobby_type == 7:
        lobby_type = -1
    radiant_win = match['finish']['radiant_win'] if 'finish' in match else None
    match_id = match['match_id']
    start_time = match['start_time']

    # Time series with limit: how many recorded time points fall in the window
    n_steps = sum(1 for t in match['times'] if t <= time_limit)
    if not n_steps:
        return None
    last_idx = n_steps - 1
    # Bucket i is taken to sit at i * time_stepsize.
    # NOTE(review): this presumes match['times'] is 0, step, 2*step, ... - confirm.
    cutoff = last_idx * time_stepsize
    bucket_times = list(range(0, n_steps * time_stepsize, time_stepsize))

    # Picks & bans, collected in a single pass.
    # NOTE(review): team == 1 is treated as Radiant and team == 0 as Dire -
    # confirm this encoding against the data description.
    radiant_picks, radiant_bans = set(), set()
    dire_picks, dire_bans = set(), set()
    for entry in match['picks_bans']:
        if entry['team'] == 1:
            picks, bans = radiant_picks, radiant_bans
        elif entry['team'] == 0:
            picks, bans = dire_picks, dire_bans
        else:
            continue
        is_pick = entry['is_pick']
        # Membership uses ==, so anything that is neither True nor False
        # (e.g. a JSON null) lands in neither set.
        if is_pick in (True, False):
            (picks if is_pick else bans).add(entry['hero_id'])

    radiant = _new_team_stats(bucket_times)
    dire = _new_team_stats(bucket_times)

    for p in match['players']:
        hero_id = p['hero_id']
        upgrades = [au['ability'] for au in p['ability_upgrades'] if au['time'] <= cutoff]
        purchases = [t['item_id'] for t in p['purchase_log'] if t['time'] <= cutoff]

        # A player whose hero is not among the Radiant picks is counted as Dire.
        team = radiant if hero_id in radiant_picks else dire

        _tally(team['abils'], upgrades)
        _tally(team['items'], purchases)
        # Snapshot of the per-player series at the last bucket inside the window
        team['creep_kills'].append(p['lh_t'][last_idx])
        team['xp'].append(p['xp_t'][last_idx])
        team['gold'].append(p['gold_t'][last_idx])
        _add_cumulative(team['kills'], p['kills_log'])
        _add_cumulative(team['wards'], p['obs_log'])

    radiant_kills = _in_bucket_order(radiant['kills'])
    dire_kills = _in_bucket_order(dire['kills'])

    # 'fb' is taken from the first bucket where the cumulative kill counts
    # differ: positive when Radiant leads, negative when Dire leads, with a
    # magnitude of 6 minus the bucket index; 0 when they never differ.
    fb = 0
    for i, (rk, dk) in enumerate(zip(radiant_kills, dire_kills)):
        if rk > dk:
            fb = 6 - i
            break
        elif rk < dk:
            fb = -6 + i
            break

    return {
        "radiant_picks" : radiant_picks,
        "dire_picks" : dire_picks,
        "radiant_bans" : radiant_bans,
        "dire_bans" : dire_bans,
        "radiant_abils" : radiant['abils'],
        "dire_abils" : dire['abils'],
        "radiant_kills" : radiant_kills,
        "dire_kills" : dire_kills,
        "radiant_items" : radiant['items'],
        "dire_items" : dire['items'],
        "radiant_gold" : sorted(radiant['gold']),
        "dire_gold" : sorted(dire['gold']),
        "radiant_xp" : sorted(radiant['xp']),
        "dire_xp" : sorted(dire['xp']),
        "radiant_creep_kills" : sorted(radiant['creep_kills']),
        "dire_creep_kills" : sorted(dire['creep_kills']),
        "radiant_wards" : _in_bucket_order(radiant['wards']),
        "dire_wards" : _in_bucket_order(dire['wards']),
        "fb" : fb,
        "radiant_win" : None if radiant_win is None else (1 if radiant_win else 0),
        "start_time": start_time,
        "lobby_type": lobby_type,
        "match_id" : match_id
    }

In [5]:
def train_columns():
    fst_columns = ['match_id', 'start_time', 'lobby_type']

    rhcolumns = ['rh_%d' % (i + 1) for i in xrange(MAX_HEROES)]
    dhcolumns = ['dh_%d' % (i + 1) for i in xrange(MAX_HEROES)]
    ricolumns = ['ri_%d' % (i + 1) for i in xrange(MAX_ITEMS)]
    dicolumns = ['di_%d' % (i + 1) for i in xrange(MAX_ITEMS)]
    racolumns = ['ra_%d' % (i + 1) for i in xrange(MAX_ABILS)]
    dacolumns = ['da_%d' % (i + 1) for i in xrange(MAX_ABILS)]

    rkcolumns = ['rk_%d' % (i + 1) for i in xrange(6)]
    dkcolumns = ['dk_%d' % (i + 1) for i in xrange(6)]
    rwcolumns = ['rw_%d' % (i + 1) for i in xrange(6)]
    dwcolumns = ['dw_%d' % (i + 1) for i in xrange(6)]

    rckcolumns = ['rck_%d' % (i + 1) for i in xrange(5)]
    dckcolumns = ['dck_%d' % (i + 1) for i in xrange(5)]
    rxcolumns = ['rx_%d' % (i + 1) for i in xrange(5)]
    dxcolumns = ['dx_%d' % (i + 1) for i in xrange(5)]
    rgcolumns = ['rg_%d' % (i + 1) for i in xrange(5)]
    dgcolumns = ['dg_%d' % (i + 1) for i in xrange(5)]

    fb_columns = ['fb']
    y_columns = ['radiant_win']

    total_columns = (fst_columns + rhcolumns + dhcolumns + ricolumns +
                     dicolumns + racolumns + dacolumns +
                     rkcolumns + dkcolumns + rwcolumns +
                     dwcolumns + rckcolumns + dckcolumns + 
                     rxcolumns + dxcolumns + rgcolumns +
                     dgcolumns + fb_columns + y_columns)

    header = ",".join(total_columns) + '\n'
    return total_columns, header

def test_columns():
    fst_columns = ['match_id', 'start_time', 'lobby_type']

    rhcolumns = ['rh_%d' % (i + 1) for i in xrange(MAX_HEROES)]
    dhcolumns = ['dh_%d' % (i + 1) for i in xrange(MAX_HEROES)]
    ricolumns = ['ri_%d' % (i + 1) for i in xrange(MAX_ITEMS)]
    dicolumns = ['di_%d' % (i + 1) for i in xrange(MAX_ITEMS)]
    racolumns = ['ra_%d' % (i + 1) for i in xrange(MAX_ABILS)]
    dacolumns = ['da_%d' % (i + 1) for i in xrange(MAX_ABILS)]

    rkcolumns = ['rk_%d' % (i + 1) for i in xrange(6)]
    dkcolumns = ['dk_%d' % (i + 1) for i in xrange(6)]
    rwcolumns = ['rw_%d' % (i + 1) for i in xrange(6)]
    dwcolumns = ['dw_%d' % (i + 1) for i in xrange(6)]

    rckcolumns = ['rck_%d' % (i + 1) for i in xrange(5)]
    dckcolumns = ['dck_%d' % (i + 1) for i in xrange(5)]
    rxcolumns = ['rx_%d' % (i + 1) for i in xrange(5)]
    dxcolumns = ['dx_%d' % (i + 1) for i in xrange(5)]
    rgcolumns = ['rg_%d' % (i + 1) for i in xrange(5)]
    dgcolumns = ['dg_%d' % (i + 1) for i in xrange(5)]

    fb_columns = ['fb']

    total_columns = (fst_columns + rhcolumns + dhcolumns + ricolumns +
                     dicolumns + racolumns + dacolumns +
                     rkcolumns + dkcolumns + rwcolumns +
                     dwcolumns + rckcolumns + dckcolumns + 
                     rxcolumns + dxcolumns + rgcolumns +
                     dgcolumns + fb_columns)

    header = ",".join(total_columns) + '\n'
    return total_columns, header

In [6]:
def _scatter(size, values_by_id, index_by_id, skip_unknown=False):
    """Place ``{id: value}`` entries into a zero-filled list of ``size`` slots.

    ``index_by_id`` maps each id to its slot. With ``skip_unknown`` an id
    missing from ``index_by_id`` is ignored; otherwise it raises KeyError.
    """
    vector = [0] * size
    for raw_id, value in values_by_id.items():
        if skip_unknown:
            slot = index_by_id.get(raw_id, -1)
            if slot < 0:
                continue
        else:
            slot = index_by_id[raw_id]
        vector[slot] = value
    return vector


def _first_n(values, n):
    """Return the first ``n`` values, right-padded with zeros to length ``n``."""
    return (list(values) + [0] * n)[:n]


def parse_bz2_to_csv(input_file, output_file, is_train=True):
    """Convert a bz2 JSON-lines match dump into a flat feature CSV.

    Each match is reduced with `parse_match` and written as one row whose
    layout follows `train_columns()` (or `test_columns()` when ``is_train``
    is false). The number of columns is printed once before any row is
    written.

    Parameters
    ----------
    input_file : str
        Path handed to `iterate_matches`.
    output_file : str
        CSV path; opened with mode 'w', so an existing file is overwritten.
    is_train : bool, optional
        When true, the 'radiant_win' target is appended to every row.

    Returns
    -------
    list
        Raw input lines of the matches for which `parse_match` returned a
        falsy value; those matches are not written to the CSV.

    Raises
    ------
    KeyError
        If a hero or item id is missing from `heroes_dict` / `items_dict`.
        Unknown ability ids are skipped instead.
    AssertionError
        If a row's length differs from the number of header columns.
    """
    unparsed = []
    with open(output_file, 'w') as f:
        total_columns, header = train_columns() if is_train else test_columns()
        print(len(total_columns))

        f.write(header)
        for raw_match, line in iterate_matches(input_file):
            # Parsing
            match = parse_match(raw_match)
            if not match:
                unparsed.append(line)
                continue

            # fst columns
            row_data = [match['match_id'], match['start_time'], match['lobby_type']]

            # The per-id groups are built as positional lists so the column
            # order never depends on dict iteration order.

            # Radiant / dire picks: 1 in the slot of every picked hero
            for key in ('radiant_picks', 'dire_picks'):
                row_data.extend(_scatter(MAX_HEROES, dict.fromkeys(match[key], 1), heroes_dict))

            # Radiant / dire items: purchase counts
            for key in ('radiant_items', 'dire_items'):
                row_data.extend(_scatter(MAX_ITEMS, match[key], items_dict))

            # Radiant / dire abilities: upgrade counts, unknown ids ignored
            for key in ('radiant_abils', 'dire_abils'):
                row_data.extend(_scatter(MAX_ABILS, match[key], abilities_dict, skip_unknown=True))

            # Kills and wards time series, zero-padded to the 6 header columns
            for key in ('radiant_kills', 'dire_kills', 'radiant_wards', 'dire_wards'):
                row_data.extend(_first_n(match[key], 6))

            # Per-player snapshots (header reserves 5 columns per group).
            # NOTE(review): these lists are written at whatever length
            # parse_match produced; an uneven radiant/dire player split would
            # still pass the total-length check below while shifting columns.
            for key in ('radiant_creep_kills', 'dire_creep_kills',
                        'radiant_xp', 'dire_xp',
                        'radiant_gold', 'dire_gold'):
                row_data.extend(match[key])

            # FB
            row_data.append(match['fb'])

            # y
            if is_train:
                row_data.append(match['radiant_win'])

            assert len(row_data) == len(total_columns)

            f.write(",".join(str(value) for value in row_data) + "\n")

    return unparsed

In [10]:
# Build the training CSV; keep the raw lines of matches that could not be parsed.
unparsed = parse_bz2_to_csv(
    input_file='data/matches.jsonlines.bz2',
    output_file='data/new_features.csv',
    is_train=True,
)

1931
Processed 1000 matches
Processed 2000 matches
Processed 3000 matches
Processed 4000 matches
Processed 5000 matches
Processed 6000 matches
Processed 7000 matches
Processed 8000 matches
Processed 9000 matches
Processed 10000 matches
Processed 11000 matches
Processed 12000 matches
Processed 13000 matches
Processed 14000 matches
Processed 15000 matches
Processed 16000 matches
Processed 17000 matches
Processed 18000 matches
Processed 19000 matches
Processed 20000 matches
Processed 21000 matches
Processed 22000 matches
Processed 23000 matches
Processed 24000 matches
Processed 25000 matches
Processed 26000 matches
Processed 27000 matches
Processed 28000 matches
Processed 29000 matches
Processed 30000 matches
Processed 31000 matches
Processed 32000 matches
Processed 33000 matches
Processed 34000 matches
Processed 35000 matches
Processed 36000 matches
Processed 37000 matches
Processed 38000 matches
Processed 39000 matches
Processed 40000 matches
Processed 41000 matches
Processed 42000 matc

In [8]:
# Match ids of the lines collected in `unparsed`. A list comprehension gives
# a real list on Python 3 as well, where map() would return a lazy iterator.
failed_matches = [json.loads(raw_line)['match_id'] for raw_line in unparsed]

In [9]:
# Display the match ids taken from the lines collected in `unparsed`.
failed_matches

[1811, 6336, 13048, 27418]