This notebook extracts the relevant information from the previously scraped individual game records and writes one summary CSV per summoner.

In [16]:
import glob
import json
import pandas as pd
import os
from collections import defaultdict
import math

In [17]:
# Dataset switch: when truthy, the cells below read from and write to the
# '*_plat' data folders; otherwise they use the unsuffixed folders.
plat = True

In [19]:
# Input/output locations for the selected dataset: the '*_plat' folders when
# `plat` is truthy, the unsuffixed ones otherwise.
# game_folder   - one raw JSON file per game
# output_folder - one summary CSV per summoner (written by this notebook)
# match_folder  - one match-history CSV per summoner
game_folder = 'data/raw_game_jsons_plat/' if plat else 'data/raw_game_jsons/'
output_folder = (
    'data/stats_summoner_champ_history_plat/'
    if plat
    else 'data/stats_summoner_champ_history/'
)
# Note: the non-plat match folder has no trailing slash, unlike the others.
match_folder = 'data/match_history_plat/' if plat else 'data/match_history'

In [6]:
# Make sure the output folder exists before any CSV is written.
# makedirs(exist_ok=True) avoids the check-then-create race of
# `os.path.exists` + `os.mkdir` and also creates missing parent folders.
os.makedirs(output_folder, exist_ok=True)

In [7]:
# Paths of every scraped game file, kept as a set for fast membership tests.
all_games = set(glob.iglob('{}*'.format(game_folder)))

In [8]:
# Paths of the summary CSVs that already exist in the output folder.
# Stored as a set (like `all_games`) because it is only used for `in` checks,
# once per summoner; a list would be scanned linearly on every check.
already_parsed = set(glob.glob('{}*'.format(output_folder)))

In [9]:
# 'data/champ_json' is a local copy of the champion list. It can be
# (re)created once with the snippet below (requires `requests`):
#     champion_json = requests.get('http://ddragon.leagueoflegends.com/cdn/8.23.1/data/en_US/champion.json').json()
#     with open('data/champ_json', 'w') as f:
#         json.dump(champion_json, f)
# Load the cached copy; its 'data' entry is what get_champion() searches.
with open('data/champ_json', 'r') as champ_file:
    champion_json = json.loads(champ_file.read())
#champion id -> champion name
def get_champion(key, champions=None):
    """Return the name of the champion whose id is ``key``.

    Args:
        key: Champion id. It is compared as a string against each entry's
            ``'key'`` field, so both ``62`` and ``'62'`` match ``'62'``.
        champions: Optional champion table shaped like the loaded champion
            JSON, i.e. ``{'data': {<any>: {'key': str, 'name': str}, ...}}``.
            Defaults to the module-level ``champion_json``.

    Returns:
        The matching entry's ``'name'``, or ``None`` when no entry has that
        key.
    """
    if champions is None:
        champions = champion_json
    # Convert once instead of on every loop iteration.
    wanted = str(key)
    for champion in champions['data'].values():
        if champion['key'] == wanted:
            return champion['name']
    # Unknown id: callers treat None as "no champion".
    return None

In [None]:
# Build one summary CSV per summoner ("hero") from their match history.
#
# For every ranked game of the hero that was scraped, one row is recorded:
# the champions of the lane opponents ("villains"), the bans, the hero's
# champion, the stats of the hero and of the first villain, all ten account
# ids, the game timestamp, the hero's lane, whether the hero won and whether
# the hero picked before every villain. Champion ids are converted to
# champion names just before the CSV is written.

# Per-participant stats copied out of each game record.
all_stats = ['kills', 'deaths', 'assists', 'pentaKills', 'firstBloodKill', 'physicalDamageDealtToChampions', 'magicDamageDealtToChampions']
# sep 1, 2018 (unix milliseconds) (we only scraped stuff after this)
first_scraped_ts = 1535785200000
# participantIds in the order they are assumed to pick during the draft.
# NOTE(review): this assumes participantId corresponds to the draft slot -
# confirm against the API documentation.
order = [1, 6, 7, 2, 3, 8, 9, 4, 5, 10]

for mh in glob.glob('{}/*'.format(match_folder)):
    # The match-history file is named after the summoner's account id.
    main_account_id = os.path.basename(mh).split('.')[0]
    write_filename = '{}{}.csv'.format(output_folder, main_account_id)
    if write_filename in already_parsed:
        print('We already parsed the match history of this summoner.')
        continue

    # Read the CSV once and take both columns from the same frame.
    match_history = pd.read_csv(mh)
    games = list(match_history['gameId'])
    tss = list(match_history['timestamp'])

    # Queue ids of every game file opened for this summoner (not used further
    # in this cell).
    all_q_ids = []
    # Column-wise accumulators; every list gains exactly one entry per kept
    # game so that the columns stay row-aligned.
    villain_dict = defaultdict(list)       # slot 0..4 -> villain champion id
    villain_stat_dict = defaultdict(list)  # stat -> value for the first villain
    hero_stat_dict = defaultdict(list)     # stat -> value for the hero
    ban_dict = {}                          # pickTurn -> banned champion id
    account_id_dict = defaultdict(list)    # identity position 0..9 -> account id
    heroes = []
    hero_win = []
    hero_picked_first = []
    timestamps = []
    main_character_list = []
    hero_role = []

    for (g, ts) in zip(games, tss):
        if ts < first_scraped_ts:
            continue
        single = '{}{}.json'.format(game_folder, g)
        if single not in all_games:
            continue
        with open(single, 'r') as game_file:
            game = json.load(game_file)
        all_q_ids.append(game['queueId'])
        if game['queueId'] != 420:  # We only want ranked
            continue
        if len(game['participants']) != 10:
            print('There are not 10 players recorded in this game, so skipping.')
            print('The game mode is {}'.format(game['gameMode']))
            continue

        # Index the participants by participantId.
        all_participant_stats = {}
        id_to_team = {}
        id_to_lane = {}
        champ_ids = {}
        for p in game['participants']:
            pid = p['participantId']
            s = p['stats']
            # A stat missing from the record is stored as None.
            all_participant_stats[pid] = {stat: s.get(stat) for stat in all_stats}
            id_to_team[pid] = p['teamId']
            id_to_lane[pid] = p['timeline']['lane']
            champ_ids[pid] = p['championId']

        # What champs were banned?
        pick_order_to_ban = {}
        for team in game['teams']:
            for ban in team['bans']:
                pick_order_to_ban[ban['pickTurn']] = ban['championId']

        # Which team won?
        winning_team_id = None
        for team in game['teams']:
            if team['win'] == 'Win':
                winning_team_id = team['teamId']
        if winning_team_id is None:
            print('The winning team is not recorded.')
            continue

        # Who is our protagonist?
        if len(game['participantIdentities']) != 10:
            print('There are not 10 players recorded in this game, so skipping.')
            continue
        main_pid = None
        for identity in game['participantIdentities']:
            try:
                if str(identity['player']['accountId']) == main_account_id:
                    main_pid = identity['participantId']
            except (KeyError, TypeError):
                # This identity carries no usable player/account information.
                print('Failed to find our main character, skipping')
        if not main_pid:  # Info not provided
            continue

        # What is our protagonist's (the player whose games we're scraping) lane?
        protag_lane = id_to_lane[main_pid]
        if protag_lane == 'NONE':  # Lane not recorded
            continue

        # Who are our villains? (players on the opposing team in the hero's lane)
        villains = [
            pid for pid in id_to_lane
            if pid != main_pid
            and id_to_lane[pid] == protag_lane
            and id_to_team[pid] != id_to_team[main_pid]
        ]

        # participantIds are unique dict keys, so the hero has exactly one
        # stats record; only the first villain's stats are kept.
        hero_stats = all_participant_stats[main_pid]
        if villains:
            villain_stats = all_participant_stats[villains[0]]
        else:
            villain_stats = dict.fromkeys(all_stats)

        # Did the hero pick before every villain?
        hero_rank = order.index(main_pid)
        hero_first = 0 if any(order.index(v) < hero_rank for v in villains) else 1

        # ---- Record the row. Nothing below skips the game, so every
        # ---- accumulator receives exactly one entry.
        for i in range(5):
            villain_dict[i].append(champ_ids[villains[i]] if i < len(villains) else None)
        for stat in all_stats:
            hero_stat_dict[stat].append(hero_stats[stat])
            villain_stat_dict[stat].append(villain_stats[stat])

        # Who was banned? Keep one entry per game for every pick turn seen so
        # far: a turn first seen now is back-filled with None for earlier
        # games, and a turn missing from this game gets None.
        games_so_far = len(heroes)
        for turn in pick_order_to_ban:
            if turn not in ban_dict:
                ban_dict[turn] = [None] * games_so_far
        for turn in ban_dict:
            ban_dict[turn].append(pick_order_to_ban.get(turn))

        for i, identity in enumerate(game['participantIdentities']):
            # An identity without player information is recorded as None.
            account_id_dict[i].append((identity.get('player') or {}).get('accountId'))

        hero_role.append(protag_lane)
        hero_picked_first.append(hero_first)
        # Did hero win?
        hero_win.append(1 if id_to_team[main_pid] == winning_team_id else 0)
        heroes.append(champ_ids[main_pid])
        # When was the game played?
        timestamps.append(game['gameCreation'])
        main_character_list.append(main_account_id)

    # Assemble the per-summoner table; the column order is: villains, bans,
    # hero, first-villain stats, hero stats, account ids, then the scalars.
    villain_df = pd.DataFrame.from_dict(villain_dict).add_prefix('villain_')
    ban_df = pd.DataFrame.from_dict(ban_dict).add_prefix('ban_')
    hero_df = pd.DataFrame({'hero': heroes})
    single_villain_stats = pd.DataFrame.from_dict(villain_stat_dict).add_prefix('villain_0_')
    single_hero_stats = pd.DataFrame.from_dict(hero_stat_dict).add_prefix('hero_')
    account_id_df = pd.DataFrame.from_dict(account_id_dict).add_prefix('account_id_')
    single_champ_history = pd.concat(
        [villain_df, ban_df, hero_df, single_villain_stats, single_hero_stats, account_id_df],
        axis=1,
    )
    single_champ_history['timestamp'] = timestamps
    single_champ_history['hero_account_id'] = main_character_list
    single_champ_history['hero_role'] = hero_role
    single_champ_history['hero_win'] = hero_win
    single_champ_history['hero_picked_first'] = hero_picked_first

    # Replace champion ids by champion names. The columns are selected by
    # name rather than by position, so stat columns can never be converted by
    # mistake when fewer ban columns than usual are present.
    champion_columns = list(villain_df.columns) + list(ban_df.columns) + ['hero']
    for cn in champion_columns:
        # Missing entries show up as None or NaN; both become None.
        single_champ_history[cn] = [
            get_champion(int(champ_id)) if champ_id and not math.isnan(champ_id) else None
            for champ_id in single_champ_history[cn]
        ]
    single_champ_history.to_csv(write_filename, index=False)