# Overwatch Battletag Scraper

### Imports

In [2]:
from bs4 import BeautifulSoup as bs
import urllib2
from pathlib2 import Path
import codecs
import time
import threading

### Hero Files

In [3]:
#Directory that holds the per-hero data files
hero_dir = "../data/hero/"
#Every hero name handled by the scraper, in alphabetical order
hero_list = [
    "ana", "bastion", "d.va", "doomfist", "genji", "hanzo", "junkrat",
    "lucio", "mccree", "mei", "mercy", "moira", "orisa", "pharah",
    "reaper", "reinhardt", "roadhog", "soldier76", "sombra", "symmetra",
    "torbjorn", "tracer", "widowmaker", "winston", "zarya", "zenyatta",
]
#Membership map: hero name -> 1
hero_dict = dict.fromkeys(hero_list, 1)

### Hero Helpers

In [4]:
def add_key(tds, info):
    """
    Adds one key/value pair, read from a pair of table cells, to the info map.

    tds  -- sequence of at least two cells exposing getText(); tds[0] holds the
            feature name and tds[1] holds its value.
    info -- dictionary that is updated in place; nothing is returned.

    The key is normalized with td_to_feat(). The value is converted as follows:
      * "a:b:c" times become a number of seconds and the key gets "_seconds"
      * "N hours" becomes the integer N and the key gets "_hours"
      * "N%" becomes the fraction N / 100
      * anything else is parsed as an integer after removing (,) characters

    Raises ValueError if the text cannot be parsed as a number.
    """
    key = td_to_feat(tds[0].getText())
    val = tds[1].getText()
    #Process times with : to seconds
    if ":" in val:
        #Multipliers list to convert to seconds
        #days, hours, minutes, seconds
        mults = [86400, 3600, 60, 1]
        #Pair the fields with the multipliers from the right, so that the
        #last field is always seconds, the one before it minutes, and so on
        #no matter how many fields are present ("1:30" is 90 seconds).
        #Fields beyond days, if any, are ignored.
        seconds = 0
        for field, mult in zip(reversed(val.split(":")), reversed(mults)):
            seconds += int(field) * mult
        val = seconds
        key += "_seconds"
    #Time played
    elif "hours" in val.lower():
        val = int(val.lower().replace(" hours", "").replace(",", ""))
        key += "_hours"
    #Process percentages to decimal
    elif "%" in val:
        val = int(val.replace("%", "")) * 1.0 / 100
    #Process numbers without ,'s
    else:
        val = int("".join(val.split(",")))
    #Add kvp
    info[key] = val


#Multipliers list to convert to seconds
#days, hours, minutes, seconds
mults = [86400, 3600, 60, 1]
mult_length = len(mults)
def _stat_text_to_value(val):
    """
    Converts one lowercased stat string to a (number, is_duration) tuple.
    is_duration is True when the number is a duration expressed in seconds.
    Raises ValueError if the text cannot be parsed as a number.
    """
    #Process times with : to seconds
    if ":" in val:
        #Pair fields with multipliers from the right so the last field is
        #always seconds. Fields beyond days, if any, are ignored.
        seconds = 0
        for field, mult in zip(reversed(val.split(":")), reversed(mults)):
            seconds += int(field) * mult
        return seconds, True
    #Time played hours ("hour" also matches "hours")
    if "hour" in val:
        #int(float("2.4")) works whereas int("2.4") doesn't
        return int(float(val.replace(" hour","").replace("s","").replace(",",""))) * 60**2, True
    #Time played minutes
    if "minute" in val:
        return int(float(val.replace(" minute","").replace("s","").replace(",",""))) * 60, True
    #Time played seconds
    if "second" in val:
        return int(float(val.replace(" second","").replace("s","").replace(",",""))), True
    #Process percentages to decimal (float() also accepts "45.5%")
    if "%" in val:
        return float(val.replace("%", "")) / 100, False
    #Process decimals to floats
    if "." in val:
        return float(val.replace(",", "")), False
    #Bug case where players have no time on a character but get a dropdown in
    #competitive career stats with a time played = "--" like
    #https://playoverwatch.com/en-us/career/pc/burritoh-1532 on 1/28/2018 7:25am PST
    if "-" in val:
        return 0, False
    #Process integers without ,'s
    return int(val.replace(",", "")), False

def add_keys_from_tbody(tbody, info):
    """
    Adds one key/value pair to the info map for every row of a stats table.

    tbody -- element exposing find_all("tr"); every row must hold at least two
             "td" cells, the feature name followed by its value.
    info  -- dictionary that is updated in place; nothing is returned.

    Keys are normalized with td_to_feat(). Values are lowercased and converted
    to numbers: times ("a:b:c", "N hours", "N minutes", "N seconds") become
    seconds and their key gets a "_seconds" suffix, percentages become
    fractions, decimals become floats, "--" becomes 0 and everything else is
    parsed as an integer after removing (,) characters.
    """
    for trow in tbody.find_all("tr"):
        tds = trow.find_all("td")
        key = td_to_feat(tds[0].getText())
        val, is_duration = _stat_text_to_value(tds[1].getText().lower())
        if is_duration:
            key += "_seconds"
        #Add kvp
        info[key] = val

def td_to_feat(td_str):
    """
    Returns the feature-name form of a table label: lowercased, with every
    single space turned into an underscore.
    td_to_feat("Pulse Bomb Kills") returns pulse_bomb_kills
    """
    lowered = td_str.lower()
    return lowered.replace(" ", "_")
def td_to_feat_list(td_str_list):
    """
    List version of td_to_feat: rewrites every element of td_str_list in place
    (lowercase, spaces to underscores) and returns nothing. The conversion is
    written out here rather than shared with td_to_feat because td_to_feat is
    called MANY times, whereas this is only called a few times.
    """
    for idx, label in enumerate(td_str_list):
        td_str_list[idx] = label.lower().replace(" ", "_")

### Hero HTML Handlers

This code block is responsible for filling entry values for scraped battletags. The important part is in fill_entries().

In [5]:
#NOTE on the "... SECONDS" labels in the lists below: time features get
#"_seconds" appended to their key while the page is parsed. Listing them here
#with " SECONDS" already attached keeps the un-suffixed name (for example
#"OBJECTIVE TIME") from being treated as a missing feature and 0 filled later.
combat_entries = [
    "ELIMINATIONS",
    "DEATHS",
    "FINAL BLOWS",
    "SOLO KILLS",
    "ALL DAMAGE DONE",
    "OBJECTIVE KILLS",
    "OBJECTIVE TIME SECONDS",
    "MULTIKILLS",
    "ENVIRONMENTAL KILLS",
    "MELEE FINAL BLOWS",
    "TIME SPENT ON FIRE SECONDS",
    "CRITICAL HITS",
    "HERO DAMAGE DONE",
    "BARRIER DAMAGE DONE",
    "QUICK MELEE ACCURACY",
    "CRITICAL HIT ACCURACY",
    "WEAPON ACCURACY"
]
#The 3 averages (offensive assists, healing done, defensive assists) are ignored
assist_entries = [
    "TELEPORTER PADS DESTROYED",
    "TURRETS DESTROYED",
    #Recon assists are specific to widow, hanzo, or any character with wall hack abilities
    "RECON ASSISTS",
    "RECON ASSISTS - MOST IN GAME",
    "OFFENSIVE ASSISTS",
    "OFFENSIVE ASSISTS - MOST IN GAME",
    "DEFENSIVE ASSISTS",
    "DEFENSIVE ASSISTS - MOST IN GAME",
    "HEALING DONE",
    "HEALING DONE - MOST IN GAME"
]
#Kept for reference only: this list is neither normalized nor part of
#general_entries below
best_entries = [
    "ELIMINATIONS - MOST IN LIFE",
    "ALL DAMAGE DONE - MOST IN LIFE",
    "WEAPON ACCURACY - BEST IN GAME",
    "KILL STREAK - BEST",
    "ALL DAMAGE DONE - MOST IN GAME",
    "ELIMINATIONS - MOST IN GAME",
    "FINAL BLOWS - MOST IN GAME",
    "OBJECTIVE KILLS - MOST IN GAME",
    "OBJECTIVE TIME - MOST IN GAME SECONDS",
    "MULTIKILL - BEST",
    "SOLO KILLS - MOST IN GAME",
    "CRITICAL HITS - MOST IN GAME",
    "CRITICAL HITS - MOST IN LIFE",
    "MELEE FINAL BLOWS - MOST IN GAME",
    "TIME SPENT ON FIRE - MOST IN GAME SECONDS",
    "HERO DAMAGE DONE - MOST IN GAME",
    "HERO DAMAGE DONE - MOST IN LIFE",
    "BARRIER DAMAGE DONE - MOST IN GAME"
]
#Averages are ignored
match_awards_entries = [
    "MEDALS - BRONZE",
    "MEDALS - SILVER",
    "MEDALS - GOLD",
    "MEDALS",
    "CARDS"
]
game_entries = [
    "TIME PLAYED",
    "GAMES PLAYED",
    "GAMES WON",
    "GAMES TIED",
    "GAMES LOST",
    "WIN PERCENTAGE"
]
miscellaneous_entries = [
    "shield_generators_destroyed"
]

#Used for ensuring all keys are present after scraping
#(some keys can be missing for 0-valued features)
general_entries = [
    combat_entries,
    assist_entries,
    #best_entries,
    match_awards_entries,
    game_entries,
    miscellaneous_entries
]

#Convert every general entry list, in place, to the feature-key style
for entry_group in general_entries:
    td_to_feat_list(entry_group)

def fill_entries(tbodies, info, hero_key):
    """
    Fills info with the cleaned-up features of one hero.

    tbodies  -- the hero's tbody elements; each one must be preceded by a
                sibling element that contains the table's h5 heading.
    info     -- dictionary that is updated in place; nothing is returned.
    hero_key -- data-category-id of the hero; must be a key of
                hero_data_category_id_to_hero_specific.

    Tables headed "Average" or "Best" are skipped. Keys are pluralized,
    missing general and hero specific features are filled in with 0, and
    average, malformed, "time_played", "...in_game" and non-applicable keys
    are removed or renamed.
    """
    #Get h5s and drop the Average and Best tbodies (as well as any tbody
    #with an empty heading)
    headings = [tbody.previous_sibling.find("h5").getText() for tbody in tbodies]
    tbodies = [tbody for tbody, heading in zip(tbodies, headings)
               if heading not in ("", u'Average', u'Best')]

    #Handle edge case for Winston Miscellaneous tbody having conflicting
    #elements with Hero Specific tbody.
    #Iterating backwards will have the Melee Kills field written twice.
    #However, the problematic entry will be overwritten (it's written
    #first), so it fixes the incorrect value issue. The winston-specific
    #miscellaneous fields are deleted further down.
    is_winston = hero_key == hero_to_hash["winston"]
    if is_winston:
        tbodies = list(reversed(tbodies))
    #Add information for relevant tbodies
    for tbody in tbodies:
        add_keys_from_tbody(tbody, info)

    #Pluralize
    pluralize(info, hero_key)

    #Add missing general entries
    for entry_list in general_entries:
        for k in entry_list:
            info.setdefault(k, 0)
    #Add missing hero specific
    for k in hero_data_category_id_to_hero_specific[hero_key]["entries"]:
        info.setdefault(k, 0)

    #Delete any key containing the word average or avg
    for k in [k for k in info if "average" in k or "avg" in k]:
        del info[k]

    #Fix any key like
    #NANO {COUNT, PLURAL, ONE {BOOST} OTHER {BOOSTS}} APPLIED - MOST IN GAME
    #nano_{count,_plural,_one_{boost}_other_{boosts}}_applied_-_most_in_game
    #by keeping only the text inside the last {...} that precedes the "}}"
    for old_key in [k for k in info if "}}" in k]:
        end_idx = old_key.find("}}")
        start_idx = old_key.rfind("{", 0, end_idx)
        #No opening brace before the "}}": nothing sensible to rebuild, so
        #the key is left untouched
        if start_idx == -1:
            continue
        new_key = (old_key[:old_key.find("{")]
                   + old_key[start_idx + 1:end_idx]
                   + old_key[end_idx + 2:])
        info[new_key] = info.pop(old_key)

    #One player had a unique key on Bastion that looked like some hash value with
    #a "." in it and a bunch of numbers. Will just remove any key with a . in it
    for k in [k for k in info if "." in k]:
        del info[k]

    #Delete time_played (replaced by time_played_seconds). The key is always
    #present here because it is part of game_entries and was 0 filled above.
    del info["time_played"]

    #Delete hero specific keys for most/best in game
    for k in [k for k in info if k.endswith("in_game")]:
        del info[k]

    #Delete recon assists for all characters except hanzo, widow, and genji (can deflect sonar i think)
    recon_heroes = (hero_to_hash["genji"], hero_to_hash["hanzo"], hero_to_hash["widowmaker"])
    if hero_key not in recon_heroes:
        del info["recon_assists"]
    #Delete winston Miscellaneous attributes
    if is_winston:
        info.pop("jump_kills", None)
        info.pop("weapon_kills", None)
    #Replace symmetra shield generators (incorrectly pluralized if they have shield generator
    #uptime from Shield Generators Destroyed attribute for General Features) uptime
    if hero_key == hero_to_hash["symmetra"]:
        if "shield_generators_uptime" in info:
            info["shield_generator_uptime"] = info.pop("shield_generators_uptime")
        if "sentry_turrets_kills" in info:
            info["sentry_turret_kills"] = info.pop("sentry_turrets_kills")

    #Delete deflected dragonstrike and graviton surge kills for genji
    #Note: May have to do others like tac visor, concussion mine, etc.
    if hero_key == hero_to_hash["genji"]:
        info.pop("graviton_surge_kills", None)
        info.pop("dragonstrike_kills", None)

    #Replace any singular medal or game keys
    singular_groups = [
        ("medal", "medals", ["medal_-_bronze", "medal_-_silver", "medal_-_gold", "medal"]),
        ("game", "games", ["game_won", "game_played", "game_lost"])
    ]
    for singular, plural, singular_keys in singular_groups:
        for old_key in singular_keys:
            if old_key in info:
                info[old_key.replace(singular, plural)] = info.pop(old_key)

#### Hero Specific Entries
This is just a listing of what each hero *should* have in terms of hero-specific features.

In [6]:
#Maps each hero's data-category-id to the hero's name and to the labels of
#the hero specific features that hero is expected to have.
hero_data_category_id_to_hero_specific = {
    #Ana
    "0x02E000000000013B": {
        "name": "ana",
        "entries": [
            "SCOPED ACCURACY - BEST IN GAME",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME",
            "ENEMIES SLEPT",
            "NANO BOOSTS APPLIED",
            "NANO BOOST ASSISTS",
            "UNSCOPED ACCURACY - BEST IN GAME",
            "ENEMIES SLEPT - MOST IN GAME",
            "NANO BOOST ASSISTS - MOST IN GAME",
            #NANO {COUNT, PLURAL, ONE {BOOST} OTHER {BOOSTS}} APPLIED - MOST IN GAME regex
            "BIOTIC GRENADE KILLS",
            "SCOPED ACCURACY",
            "UNSCOPED ACCURACY"
        ]
    },
    #Bastion
    "0x02E0000000000015": {
        "name": "bastion",
        "entries": [
            "RECON KILLS",
            "SENTRY KILLS",
            "TANK KILLS",
            "SENTRY KILLS - MOST IN GAME",
            "RECON KILLS - MOST IN GAME",
            "TANK KILLS - MOST IN GAME",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME"
            #OVERWATCH.GUID.0X0860000000000033
        ]
    },
    #Doomfist
    "0x02E000000000012F": {
        "name": "doomfist",
        "entries": [
            "ABILITY DAMAGE DONE",
            "ABILITY DAMAGE DONE - MOST IN GAME",
            "METEOR STRIKE KILLS",
            "METEOR STRIKE KILLS - MOST IN GAME",
            "SHIELDS CREATED",
            "SHIELDS CREATED - MOST IN GAME"
        ]
    },
    #D.Va
    "0x02E000000000007A": {
        "name": "d.va",
        "entries": [
            "SELF-DESTRUCT KILLS",
            "SELF-DESTRUCT KILLS - MOST IN GAME",
            "MECHS CALLED",
            "MECHS CALLED - MOST IN GAME",
            "DAMAGE BLOCKED - MOST IN GAME",
            "DAMAGE BLOCKED",
            "MECH DEATHS"
        ]
    },
    #Genji
    "0x02E0000000000029": {
        "name": "genji",
        "entries": [
            "DRAGONBLADE KILLS",
            "DRAGONBLADE KILLS - MOST IN GAME",
            "DAMAGE REFLECTED",
            "DAMAGE REFLECTED - MOST IN GAME",
            "DRAGONBLADES"
        ]
    },
    #Hanzo
    "0x02E0000000000005": {
        "name": "hanzo",
        "entries": [
            "DRAGONSTRIKE KILLS",
            "DRAGONSTRIKE KILLS - MOST IN GAME",
            "SCATTER ARROW KILLS",
            "SCATTER ARROW KILLS - MOST IN GAME"
        ]
    },
    #Junkrat
    "0x02E0000000000065": {
        "name": "junkrat",
        "entries": [
            "ENEMIES TRAPPED - MOST IN GAME",
            "ENEMIES TRAPPED",
            "RIP-TIRE KILLS - MOST IN GAME",
            "RIP-TIRE KILLS",
            "CONCUSSION MINE KILLS",
            "CONCUSSION MINE KILLS - MOST IN GAME"
        ]
    },
    #Lucio
    "0x02E0000000000079": {
        "name": "lucio",
        "entries": [
            "SOUND BARRIERS PROVIDED",
            "SOUND BARRIERS PROVIDED - MOST IN GAME",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME"
        ]
    },
    #McCree
    "0x02E0000000000042": {
        "name": "mccree",
        "entries": [
            "DEADEYE KILLS",
            "DEADEYE KILLS - MOST IN GAME",
            "FAN THE HAMMER KILLS",
            "FAN THE HAMMER KILLS - MOST IN GAME"
        ]
    },
    #Mei
    "0x02E00000000000DD": {
        "name": "mei",
        "entries": [
            "ENEMIES FROZEN",
            "ENEMIES FROZEN - MOST IN GAME",
            "BLIZZARD KILLS - MOST IN GAME",
            "BLIZZARD KILLS",
            "DAMAGE BLOCKED - MOST IN GAME",
            "DAMAGE BLOCKED",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME"
        ]
    },
    #Mercy
    "0x02E0000000000004": {
        "name": "mercy",
        "entries": [
            "BLASTER KILLS",
            "BLASTER KILLS - MOST IN GAME",
            "PLAYERS RESURRECTED",
            "PLAYERS RESURRECTED - MOST IN GAME",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME",
            "DAMAGE AMPLIFIED",
            "DAMAGE AMPLIFIED - MOST IN GAME"
        ]
    },
    #Moira
    "0x02E00000000001A2": {
        "name": "moira",
        "entries": [
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME",
            "COALESCENCE KILLS",
            "COALESCENCE KILLS - MOST IN GAME",
            "COALESCENCE HEALING",
            "COALESCENCE HEALING - MOST IN GAME",
            "SECONDARY FIRE ACCURACY"
        ]
    },
    #Orisa
    "0x02E000000000013E": {
        "name": "orisa",
        "entries": [
            "DAMAGE AMPLIFIED",
            "DAMAGE AMPLIFIED - MOST IN GAME",
            "DAMAGE BLOCKED",
            "DAMAGE BLOCKED - MOST IN GAME",
            "SUPERCHARGER ASSISTS",
            "SUPERCHARGER ASSISTS - MOST IN GAME"
        ]
    },
    #Pharah
    "0x02E0000000000008": {
        "name": "pharah",
        "entries": [
            "ROCKET DIRECT HITS",
            "BARRAGE KILLS",
            "ROCKET DIRECT HITS - MOST IN GAME",
            "BARRAGE KILLS - MOST IN GAME"
        ]
    },
    #Reaper
    "0x02E0000000000002": {
        "name": "reaper",
        "entries": [
            "DEATH BLOSSOM KILLS",
            "DEATH BLOSSOM KILLS - MOST IN GAME",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME"
        ]
    },
    #Reinhardt
    "0x02E0000000000007": {
        "name": "reinhardt",
        "entries": [
            "DAMAGE BLOCKED",
            "DAMAGE BLOCKED - MOST IN GAME",
            "CHARGE KILLS",
            "CHARGE KILLS - MOST IN GAME",
            "FIRE STRIKE KILLS",
            "FIRE STRIKE KILLS - MOST IN GAME",
            "EARTHSHATTER KILLS",
            "EARTHSHATTER KILLS - MOST IN GAME",
            "ROCKET HAMMER MELEE ACCURACY"
        ]
    },
    #Roadhog
    "0x02E0000000000040": {
        "name": "roadhog",
        "entries": [
            "ENEMIES HOOKED - MOST IN GAME",
            "ENEMIES HOOKED",
            "HOOKS ATTEMPTED",
            "WHOLE HOG KILLS - MOST IN GAME",
            "WHOLE HOG KILLS",
            "HOOK ACCURACY - BEST IN GAME",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME",
            "HOOK ACCURACY"
        ]
    },
    #Soldier: 76
    "0x02E000000000006E": {
        "name": "soldier76",
        "entries": [
            "HELIX ROCKETS KILLS - MOST IN GAME",
            "HELIX ROCKETS KILLS",#pluralize rocket
            "TACTICAL VISOR KILLS",
            "TACTICAL VISOR KILLS - MOST IN GAME",
            "BIOTIC FIELDS DEPLOYED",#pluralize field
            "BIOTIC FIELD HEALING DONE",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME"
        ]
    },
    #Sombra
    "0x02E000000000012E": {
        "name": "sombra",
        "entries": [
            "ENEMIES HACKED",
            "ENEMIES EMP'D",
            "ENEMIES HACKED - MOST IN GAME",
            "ENEMIES EMP'D - MOST IN GAME"
        ]
    },
    #Symmetra
    "0x02E0000000000016": {
        "name": "symmetra",
        "entries": [
            "SENTRY TURRET KILLS",
            "SENTRY TURRET KILLS - MOST IN GAME",
            "PLAYERS TELEPORTED",#pluralize player
            "PLAYERS TELEPORTED - MOST IN GAME",
            "TELEPORTER UPTIME",
            "TELEPORTER UPTIME - BEST IN GAME",
            "DAMAGE BLOCKED",
            "DAMAGE BLOCKED - MOST IN GAME",
            "SHIELD GENERATOR UPTIME",
            "PRIMARY FIRE ACCURACY"
        ]
    },
    #Torbjorn
    "0x02E0000000000006": {
        "name": "torbjorn",
        "entries": [
            "ARMOR PACKS CREATED",
            #NOTE: This is a pain in the ass. These are different
            #characters and we're using ö.
            #Ö is not the same as what we're using.
            #Second NOTE: Those both suck try this instead: \xf6. I really
            #swear the big o with the dots worked yesterday lmao.
            #Third NOTE: No good
            "TORBJÖRN KILLS",
            "TURRET KILLS",
            "TORBJÖRN KILLS - MOST IN GAME",
            "ARMOR PACKS CREATED - MOST IN GAME",
            "TURRET KILLS - MOST IN GAME",
            "MOLTEN CORE KILLS",
            "MOLTEN CORE KILLS - MOST IN GAME"
        ]
    },
    #Tracer
    "0x02E0000000000003": {
        "name": "tracer",
        "entries": [
            "PULSE BOMB KILLS",
            "PULSE BOMB KILLS - MOST IN GAME",
            "PULSE BOMBS ATTACHED - MOST IN GAME",
            "PULSE BOMBS ATTACHED",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME",
            "HEALTH RECOVERED",
            "HEALTH RECOVERED - MOST IN GAME",
        ]
    },
    #Widowmaker
    "0x02E000000000000A": {
        "name": "widowmaker",
        "entries": [
            "VENOM MINE KILLS",
            "SCOPED CRITICAL HITS",
            "SCOPED CRITICAL HITS - MOST IN GAME",
            "VENOM MINE KILLS - MOST IN GAME",
            "SCOPED ACCURACY"
        ]
    },
    #Winston
    "0x02E0000000000009": {
        "name": "winston",
        "entries": [
            "PLAYERS KNOCKED BACK",
            "DAMAGE BLOCKED",
            "DAMAGE BLOCKED - MOST IN GAME",
            "PLAYERS KNOCKED BACK - MOST IN GAME",
            "MELEE KILLS",
            "MELEE KILLS - MOST IN GAME",
            "JUMP PACK KILLS",
            "JUMP PACK KILLS - MOST IN GAME",
            "PRIMAL RAGE KILLS",
            "PRIMAL RAGE KILLS - MOST IN GAME",
            "PRIMAL RAGE MELEE ACCURACY",
            "TESLA CANNON ACCURACY"
        ]
    },
    #Zarya
    "0x02E0000000000068": {
        "name": "zarya",
        "entries": [
            "DAMAGE BLOCKED",
            "DAMAGE BLOCKED - MOST IN GAME",
            "GRAVITON SURGE KILLS",
            "GRAVITON SURGE KILLS - MOST IN GAME",
            "HIGH ENERGY KILLS - MOST IN GAME",
            "HIGH ENERGY KILLS",
            "PROJECTED BARRIERS APPLIED",
            "AVERAGE ENERGY - BEST IN GAME",
            "PROJECTED BARRIERS APPLIED - MOST IN GAME",
            "AVERAGE ENERGY",
            "PRIMARY FIRE ACCURACY"
        ]
    },
    #Zenyatta
    "0x02E0000000000020": {
        "name": "zenyatta",
        "entries": [
            "TRANSCENDENCE HEALING - BEST",
            "TRANSCENDENCE HEALING",
            "SELF HEALING",
            "SELF HEALING - MOST IN GAME"
        ]
    }
}

#Map hero names to data-category-id
hero_to_hash = {}
for hero_hash, hero_spec in hero_data_category_id_to_hero_specific.iteritems():
    #Convert the labels to the feature-key style in place
    td_to_feat_list(hero_spec["entries"])
    #Turn the list of entries into an entry -> 1 lookup map. Doing this here
    #rather than rewriting the literal above by hand.
    hero_spec["entries"] = dict.fromkeys(hero_spec["entries"], 1)
    hero_to_hash[hero_spec["name"]] = hero_hash

#### Hero Data Extractor
This code block contains the function responsible for managing the extraction of hero entries for every possible hero. If a hero is not in the competitive profile on the HTML page, it's skipped.

In [7]:
def hero_data_extractor(comp_soup):
    """
    Extracts data from specified BeautifulSoup competitive div for all
    heroes available through PlayOverwatch for the player under consideration.

    comp_soup -- element that is searched, via find(), for one
                 div[data-category-id] per known hero.

    Returns a list of (hero_name, info, original_info) tuples, one per hero
    that has a div and a "time_played_seconds" feature. info is the
    dictionary filled by fill_entries() and original_info is the value
    returned by handle_averages(info, hero_key).
    """
    hero_infos = []
    for hero_key in hero_data_category_id_to_hero_specific:
        #Get hero specific data; heroes without a div are skipped
        hero_soup = comp_soup.find("div", {"data-category-id": hero_key})
        if hero_soup is None:
            continue
        info = {}
        fill_entries(hero_soup.find_all("tbody"), info, hero_key)
        #Weird bug where player isn't reported to have time on a character but
        #but has a dropdown in competitive career stats
        #https://playoverwatch.com/en-us/career/pc/slin-1353 Lucio for 1/28/2018 7:07AM PST
        if "time_played_seconds" not in info:
            continue

        original_info = handle_averages(info, hero_key)

        hero_name = hero_data_category_id_to_hero_specific[hero_key]["name"]
        hero_infos.append((hero_name, info, original_info))
    return hero_infos

#### Pluralizer
This code block is only responsible for pluralizing features. PlayOverwatch uses pluralizing JS so that most entries end in s for integer values > 1. Features with a value of 1 do not get an s added, which causes great difficulties for words like enemy vs. enemies, etc.

In [8]:
#Ends with keys: a key that ends in one of these words gets an "s" appended
ends_with = ["kill", "blow", "death", "elimination", "hit", "assist", "card", "medal"]
len_ends_with = len(ends_with)
#Contains keys: a key that contains one of these words (and not already its
#plural) gets the word replaced by its plural
contains = ["generator", "pad", "turret", "assist"]
len_contains = len(contains)

#Special cases (instances for hero-specific information that doesn't need to be checked
#for all heroes). Probably could be done better, but it's not a priority right now.
#Each tuple is (preferred phrase, occurring phrase, number of possible occurrences)
problem_keys_to_phrases = {
    #Ana
    "0x02E000000000013B":
        [
            ("enemies_slept", "enemy_slept", 1),
            ("nano_boosts_applied", "nano_boost_applied", 1)
        ],
    #Doomfist
    "0x02E000000000012F":
        [("shields_created", "shield_created", 1)],
    #D.Va
    "0x02E000000000007A":
        [
            ("mechs_called", "mech_called", 1),
            ("mech_deaths", "mech_death", 1)
        ],
    #Genji
    "0x02E0000000000029":
        [("dragonblades", "dragonblade", 1)],
    #Junkrat
    "0x02E0000000000065":
        [("enemies_trapped", "enemy_trapped", 1)],
    #Lucio
    "0x02E0000000000079":
        [("sound_barriers_provided", "sound_barrier_provided", 1)],
    #Mei
    "0x02E00000000000DD":
        [("enemies_frozen", "enemy_frozen", 1)],
    #Mercy
    "0x02E0000000000004":
        [("players_resurrected", "player_resurrected", 1)],
    #Roadhog
    "0x02E0000000000040":
        [
            ("enemies_hooked", "enemy_hooked", 1),
            ("hooks_attempted", "hook_attempted", 1)
        ],
    #Soldier: 76
    "0x02E000000000006E":
        [
            ("biotic_fields_deployed", "biotic_field_deployed", 1),
            ("helix_rockets_kills", "helix_rocket_kills", 1)
        ],
    #Sombra
    "0x02E000000000012E":
        [
            ("enemies_hacked", "enemy_hacked", 1),
            ("enemies_emp'd", "enemy_emp'd", 1)
        ],
    #Symmetra
    "0x02E0000000000016":
        [("players_teleported", "player_teleported", 1)],
    #Torbjorn
    "0x02E0000000000006":
        [("armor_packs_created", "armor_pack_created", 1)],
    #Tracer
    "0x02E0000000000003":
        [("pulse_bombs_attached", "pulse_bomb_attached", 1)],
    #Winston
    "0x02E0000000000009":
        [("players_knocked_back", "player_knocked_back", 1)],
    #Zarya
    "0x02E0000000000068":
        [("projected_barriers_applied", "projected_barrier_applied", 1)]
}

def pluralize(info, hero_key):
    """
    Renames, in place, the singular keys of the info dictionary to their
    plural form. Nothing is returned.

    info     -- dictionary of feature key -> value.
    hero_key -- data-category-id of the hero the features belong to.

    Three rules are applied in order:
      1. a key ending in one of the ends_with words gets an "s" appended
      2. otherwise, a key containing one of the contains words, but not its
         plural, gets that word replaced by its plural
      3. for heroes listed in problem_keys_to_phrases, a key equal to an
         occurring phrase is renamed to the preferred phrase
    """
    singular_suffixes = tuple(ends_with)
    #Renames are collected first and applied AFTER iterating through the
    #dictionary, which is also iterated through a snapshot of its items
    suffix_renames = []
    infix_renames = []
    for key, value in list(info.items()):
        #Check ends with
        if key.endswith(singular_suffixes):
            suffix_renames.append((key, value))
            continue
        #Check contains
        for word in contains:
            #If it is already pluralized, don't replace pads with padss
            #valid example: pad in "teleporter_pad_destroyed" and pads not in it
            #NOTE(review): this rule also turns "turret_kills" into
            #"turrets_kills"; fill_entries only undoes that for Symmetra's
            #sentry turrets - confirm Torbjorn is handled elsewhere.
            if word in key and (word + "s") not in key:
                infix_renames.append((key, value, word))
                break

    #Pluralize keys
    for key, value in suffix_renames:
        info[key + "s"] = value
        del info[key]
    for key, value, word in infix_renames:
        info[key.replace(word, word + "s")] = value
        del info[key]

    #Only do work if this hero is known to have an issue
    phrase_tups = problem_keys_to_phrases.get(hero_key, [])
    swap_keys = [(preferred, occurring)
                 for preferred, occurring, _count in phrase_tups
                 if occurring in info]
    for preferred, occurring in swap_keys:
        info[preferred] = info[occurring]
        del info[occurring]

#### Average Maps
If an entry is in one of these maps, it has to be averaged before writing to file.

In [9]:
#Per life and second stats
#Every key of this map is turned into "<key>_per_life" and "<key>_per_second"
#features by handle_averages(). Only membership matters, the value 1 is a placeholder.
per_life_and_second_average_map = {}
#Hero Specific
#Entries ending in best/game/energy/accuracy are left as they are
for k, v in hero_data_category_id_to_hero_specific.iteritems():
    for entry in v["entries"]:
        if entry.endswith(("best", "game", "energy", "accuracy")):
            continue
        per_life_and_second_average_map[entry] = 1
#General
#Entries ending in game/accuracy/deaths are left as they are
for entry_list in (combat_entries, assist_entries, miscellaneous_entries):
    for entry in entry_list:
        if entry.endswith(("game", "accuracy", "deaths")):
            continue
        per_life_and_second_average_map[entry] = 1
#Handle Torbjorn fancy o's
per_life_and_second_average_map["torbjorn_kills"] = 1

#Per game stats
#Every key of this map is turned into a "<key>_per_game" feature by handle_averages()
per_game_average_map = {}
for entry_list in (match_awards_entries,):
    for entry in entry_list:
        per_game_average_map[entry] = 1

#### Hero Specific Average Helpers
These helpers return True or False depending on whether the hero key meets a criterion.

In [10]:
#Healing data is not recorded for Sombra on PlayOverwatch
healer_names = ["ana", "lucio", "mercy", "moira", "soldier76", "zenyatta"]
def is_healer(hero_key):
    """
    Tells whether hero_key is the hero_to_hash value of one of the heroes
    listed in healer_names.
    """
    #Names are checked in list order and the search stops at the first match
    return any(hero_key == hero_to_hash[name] for name in healer_names)

blocker_names = ["mei", "symmetra", "d.va", "orisa", "reinhardt", "winston", "zarya"]
def is_blocker(hero_key):
    """
    Tells whether hero_key is the hero_to_hash value of one of the heroes
    listed in blocker_names (heroes that can block damage).
    """
    #Names are checked in list order and the search stops at the first match
    return any(hero_key == hero_to_hash[name] for name in blocker_names)

booper_names = ["lucio", "pharah", "roadhog", "orisa", "winston", "doomfist", "d.va", "junkrat"]
def is_booper(hero_key):
    """
    Tells whether hero_key is the hero_to_hash value of one of the heroes
    listed in booper_names (heroes that can boop, Reinhardt excluded).
    """
    #Names are checked in list order and the search stops at the first match
    return any(hero_key == hero_to_hash[name] for name in booper_names)

#### Average Handler
Averages all necessary features prior to file output.

In [11]:
def handle_averages(info, hero_key):
    """
    Converts the running-sum features of one hero's info map into averages and ratios, in place.

    info: map of feature name to numeric value for a single hero. It is modified as follows:
        - every key found in per_life_and_second_average_map gains a "<key>_per_life"
          entry (value / (deaths + 1)) and a "<key>_per_second" entry
          (value / time_played_seconds), and is deleted at the end,
        - every key found in per_game_average_map gains a "<key>_per_game" entry
          (value / games_played), and is deleted at the end,
        - general, role specific and hero specific ratio features are added.
      Divisors taken from time played, games played, eliminations, damage and the
      hero specific counters are raised to at least 1 so that no division by zero occurs.
    hero_key: value compared against the entries of hero_to_hash to pick the role
      specific and hero specific features.

    Returns a copy of info taken after the Torbjorn key clean-up but before any
    average is added or any key is deleted (used for the raw data output).

    Raises KeyError when a feature that is read directly (for example "deaths",
    "time_played_seconds", "games_played" or a hero specific counter) is missing.
    """
    #Torb's fancy o's (Ö) cause an issue with working with strings...this can be solved
    #by uncommenting the Handle utf-8 section. However, uncommenting this section forces
    #jupyter to output everything to the command prompt as opposed to under the executed
    #cell. It's best to comment out this torb section unless scraping.
    #Instead of replacing the o's, every key ending in "rn_kills" is collapsed into a
    #single plain "torbjorn_kills" key holding the largest of their values.
    if hero_key == hero_to_hash["torbjorn"]:
        torb_keys = []
        for k in info:
            if k.endswith("rn_kills"):
                torb_keys.append(k)
        #Stays at -1 when no matching key exists
        max_torb_val = -1
        for k in torb_keys:
            if max_torb_val < info[k]:
                max_torb_val = info[k]
            del info[k]
        info["torbjorn_kills"] = max_torb_val
        
        #If turrets was accidentally pluralized because of "Turrets Destroyed"
        #feature pluralization from "Turret Destroyed"
        if "turrets_kills" in info:
            info["turret_kills"] = info["turrets_kills"]
            del info["turrets_kills"]
        
    
    #Cloning original info for output to raw (original data minus some problematic features) data
    #This has to happen before any derived feature is added below
    original_data = {}
    for k in info:
        original_data[k] = info[k]
    
    #Calculate averages based on time played and lifes
    per_life_and_second_keys = []
    #Number of lives is deaths + 1, kept as a float so the divisions below are not truncated
    lifes = int(info["deaths"]) + 1.0
    #avoid divide by 0 in strange case where data was scraped for a
    #hero with a time played = 0 seconds
    seconds = 1.0 * max(info["time_played_seconds"], 1)
    #Keys are collected first because info gains new keys in the loop after this one
    for k in info:
        if k in per_life_and_second_average_map:
            per_life_and_second_keys.append(k)
    for k in per_life_and_second_keys:
        #Stats per life
        info[k + "_per_life"] = info[k] / lifes
        #Stats per second
        info[k + "_per_second"] = info[k] / seconds
    
    #Calculates averages based on games played
    per_game_keys = []
    games = 1.0 * max(info["games_played"], 1)
    for k in info:
        if k in per_game_average_map:
            per_game_keys.append(k)
    for k in per_game_keys:
        info[k + "_per_game"] = info[k] / games
        
    #General custom averages
    elims = 1.0 * max(info["eliminations"], 1)
    info["final_blows_per_elimination"] = info["final_blows"] / elims
    info["solo_kills_per_elimination"] = info["solo_kills"] / elims
    info["objective_kills_per_elimination"] = info["objective_kills"] / elims
    info["hero_to_barrier_damage_ratio"] = info["hero_damage_done"] / float(max(info["barrier_damage_done"], 1))
    info["average_life"] = seconds / lifes
    
    #Role specific averages
    damage = 1.0 * max(info["all_damage_done"], 1)
    #Healers
    if is_healer(hero_key):
        info["healing_to_damage_ratio"] = info["healing_done"] / damage
    #Blockers
    if is_blocker(hero_key):
        info["damage_blocked_to_output_ratio"] = info["damage_blocked"] / damage
    #Boopers
    if is_booper(hero_key):
        info["environmental_kills_to_eliminations_ratio"] = info["environmental_kills"] / elims
    
    #Hero specific averages
    #Exactly one branch of the chain below runs. Orisa has no hero specific features.
    #Ana
    if hero_key == hero_to_hash["ana"]:
        info["grenade_kills_to_eliminations_ratio"] = info["biotic_grenade_kills"] / elims
        info["nano_assists_to_applied_ratio"] = info["nano_boost_assists"] / float(max(info["nano_boosts_applied"], 1))
    #Bastion
    elif hero_key == hero_to_hash["bastion"]:
        info["recon_kills_to_eliminations_ratio"] = info["recon_kills"] / elims
        info["sentry_kills_to_eliminations_ratio"] = info["sentry_kills"] / elims
        info["tank_kills_to_eliminations_ratio"] = info["tank_kills"] / elims
    #Doomfist
    elif hero_key == hero_to_hash["doomfist"]:
        info["meteor_strike_kills_to_eliminations_ratio"] = info["meteor_strike_kills"] / elims
    #D.Va
    elif hero_key == hero_to_hash["d.va"]:
        info["self-destruct_kills_to_eliminations_ratio"] = info["self-destruct_kills"] / elims
        info["mech_deaths_to_deaths_ratio"] = info["mech_deaths"] / float(max(info["deaths"], 1))
    #Genji
    elif hero_key == hero_to_hash["genji"]:
        info["dragonblade_kills_to_eliminations_ratio"] = info["dragonblade_kills"] / elims
        info["damage_reflected_to_output_ratio"] = info["damage_reflected"] / damage
        info["dragonblade_kills_to_dragonblades_ratio"] = info["dragonblade_kills"] / float(max(info["dragonblades"], 1))
    #Hanzo
    elif hero_key == hero_to_hash["hanzo"]:
        info["dragonstrike_kills_to_eliminations_ratio"] = info["dragonstrike_kills"] / elims
        info["scatter_arrow_kills_to_eliminations_ratio"] = info["scatter_arrow_kills"] / elims
    #Junkrat
    elif hero_key == hero_to_hash["junkrat"]:
        info["enemies_trapped_to_eliminations_ratio"] = info["enemies_trapped"] / elims
        info["rip-tire_kills_to_eliminations_ratio"] = info["rip-tire_kills"] / elims
        info["concussion_mine_kills_to_eliminations_ratio"] = info["concussion_mine_kills"] / elims
    #Lucio
    elif hero_key == hero_to_hash["lucio"]:
        info["healing_done_to_self_healing_ratio"] = info["healing_done"] / float(max(info["self_healing"], 1))
    #McCree
    elif hero_key == hero_to_hash["mccree"]:
        info["deadeye_kills_to_eliminations_ratio"] = info["deadeye_kills"] / elims
        info["fan_the_hammer_kills_to_eliminations_ratio"] = info["fan_the_hammer_kills"] / elims
    #Mei
    elif hero_key == hero_to_hash["mei"]:
        info["enemies_frozen_to_eliminations_ratio"] = info["enemies_frozen"] / elims
        info["blizzard_kills_to_eliminations_ratio"] = info["blizzard_kills"] / elims
    #Mercy
    elif hero_key == hero_to_hash["mercy"]:
        info["damage_amplified_to_healing_done"] = info["damage_amplified"] / float(max(info["healing_done"], 1))
    #Moira
    elif hero_key == hero_to_hash["moira"]:
        info["coalescence_kills_to_eliminations_ratio"] = info["coalescence_kills"] / elims
        info["coalescence_healing_to_healing_done"] = info["coalescence_healing"] / float(max(info["healing_done"], 1))
    #Orisa
    #Pharah
    elif hero_key == hero_to_hash["pharah"]:
        info["barrage_kills_to_eliminations_ratio"] = info["barrage_kills"] / elims
    #Reaper
    elif hero_key == hero_to_hash["reaper"]:
        info["death_blossom_kills_to_eliminations_ratio"] = info["death_blossom_kills"] / elims
    #Reinhardt
    #Note: unlike the other heroes, these feature names carry no "_ratio" suffix
    elif hero_key == hero_to_hash["reinhardt"]:
        info["charge_kills_to_eliminations"] = info["charge_kills"] / elims
        info["fire_strike_kills_to_eliminations"] = info["fire_strike_kills"] / elims
        info["earthshatter_kills_to_eliminations"] = info["earthshatter_kills"] / elims
    #Roadhog
    elif hero_key == hero_to_hash["roadhog"]:
        info["enemies_hooked_to_eliminations_ratio"] = info["enemies_hooked"] / elims
        info["whole_hog_kills_to_eliminations_ratio"] = info["whole_hog_kills"] / elims
    #Soldier: 76
    elif hero_key == hero_to_hash["soldier76"]:
        info["helix_rockets_kills_to_eliminations_ratio"] = info["helix_rockets_kills"] / elims
        info["tactical_visor_kills_to_eliminations_ratio"] = info["tactical_visor_kills"] / elims
        info["biotic_field_healing_done_to_self_healing_ratio"] = info["biotic_field_healing_done"] / float(max(info["self_healing"], 1))
        info["biotic_field_healing_done_to_biotic_fields_deployed_ratio"] = info["biotic_field_healing_done"] / float(max(info["biotic_fields_deployed"], 1))
    #Sombra
    elif hero_key == hero_to_hash["sombra"]:
        info["enemies_hacked_to_eliminations_ratio"] = info["enemies_hacked"] / elims
        info["enemies_emp'd_to_eliminations_ratio"] = info["enemies_emp'd"] / elims
        info["enemies_hacked_to_enemies_emp'd_ratio"] = info["enemies_hacked"] / float(max(info["enemies_emp'd"], 1))
    #Symmetra
    elif hero_key == hero_to_hash["symmetra"]:
        info["sentry_turret_kills_to_eliminations_ratio"] = info["sentry_turret_kills"] / elims
    #Tracer
    elif hero_key == hero_to_hash["tracer"]:
        info["pulse_bomb_kills_to_eliminations_ratio"] = info["pulse_bomb_kills"] / elims
        info["pulse_bombs_attached_to_pulse_bomb_kills_ratio"] = info["pulse_bombs_attached"] / float(max(info["pulse_bomb_kills"], 1))
    #Widowmaker
    elif hero_key == hero_to_hash["widowmaker"]:
        info["venom_mine_kills_to_eliminations_ratio"] = info["venom_mine_kills"] / elims
    #Winston
    elif hero_key == hero_to_hash["winston"]:
        #Winston gets a miscellaneous entry for all melee and jump pack. The melee
        #entries do not match up. This has been handled in fill_entries()
        info["melee_kills_to_eliminations_ratio"] = info["melee_kills"] / elims
        info["jump_pack_kills_to_eliminations_ratio"] = info["jump_pack_kills"] / elims
        info["primal_rage_kills_to_eliminations_ratio"] = info["primal_rage_kills"] / elims
    #Zarya
    elif hero_key == hero_to_hash["zarya"]:
        info["graviton_surge_kills_to_eliminations_ratio"] = info["graviton_surge_kills"] / elims
        info["high_energy_kills_to_eliminations_ratio"] = info["high_energy_kills"] / elims
    #Zenyatta
    elif hero_key == hero_to_hash["zenyatta"]:
        info["transcendence_healing_to_healing_done_ratio"] = info["transcendence_healing"] / float(max(info["healing_done"], 1))
    #Torbjorn
    #Find matching COMMENT THIS OUT section to comment out import sys code if
    #1. getting ascii character at position errors or
    #2. print() output is being printed to commandline instead of jupyter
#    """
    elif hero_key == hero_to_hash["torbjorn"]:
        info["turret_kills_to_torbjorn_kills_ratio"] = info["turret_kills"] / float(max(info["torbjorn_kills"], 1))
        info["torbjorn_kills_to_eliminations_ratio"] = info["torbjorn_kills"] / elims
        info["turret_kills_to_eliminations_ratio"] = info["turret_kills"] / elims
        info["molten_core_kills_to_eliminations_ratio"] = info["molten_core_kills"] / elims
#    """
    
    #Delete converted keys
    #Note: can't delete keys before creating general/hero specific custom averages
    for l in [per_life_and_second_keys, per_game_keys]:
        for k in l:
            del info[k]
            
    #Note: the value returned is the snapshot taken before averaging, not the
    #averaged map. The averaged features live in info, which was updated in place.
    return original_data

### Handle utf-8
This is necessary because Overwatch has 1 character with a fancy o in his name, Torbjorn. Outputting or working with that string is just a pain in the ass. Unfortunately, running this code block forces all output to go to the console, but it is the only known fix for the issue.

In [12]:
#Makes utf8 the default string encoding so that feature names containing
#Torbjorn's fancy o can be handled without ascii codec errors.
#Known side effect (see the notes for this section): once this has run, print()
#output goes to the command line instead of the jupyter cell.
#Find matching COMMENT THIS OUT section to comment out Torbjorn data if
#1. getting ascii character at position errors or
#2. print() output is being printed to commandline instead of jupyter
#"""
import sys
#NOTE(review): reload(sys) is presumably needed to make setdefaultencoding
#available again on the already imported sys module (Python 2) - confirm.
reload(sys)
sys.setdefaultencoding("utf8")
#"""

### Scraping function

In [13]:
def scrape_data_for_battletag(battletag):
    """
    Scrapes and preprocesses the data for a specified battletag from
    Blizzard's PlayOverwatch website. Note this function does not
    write the data to a file.

    battletag: battletag of the form Name#1234. The "#" is replaced with "-"
    to build the career page URL.

    Returns the list produced by hero_data_extractor() with an "SR" entry added
    to the map at index 1 of every element. Returns None when the page shows no
    competitive rank or when the career page does not exist (HTTP 404).

    Every other failed request (504s, connection errors, non-200 responses) is
    retried forever. The wait between attempts doubles after each failure up to
    max_retry_time seconds.
    """
    #Build request
    battletag_url = "https://playoverwatch.com/en-us/career/pc/{}"
    req = urllib2.Request(
        battletag_url.format("-".join(battletag.split("#"))),
        headers={"User-Agent": "Mozilla/5.0"}
    )

    #Request until the page is delivered with a 200
    retry_time = 1
    exp_factor = 2
    #Upper bound for the wait, otherwise a long outage ends in waits of several days
    max_retry_time = 600
    while True:
        try:
            resp = urllib2.urlopen(req)
            try:
                status = resp.getcode()
                if status == 200:
                    html = resp.read()
                    break
            finally:
                #Always release the connection, also when leaving through break
                resp.close()
            #A non-200 answer that raised nothing is retried with the same backoff
            failure = "HTTP {}".format(status)
        except urllib2.HTTPError as inst:
            #A missing profile will never turn into a 200, so don't retry it
            if inst.code == 404:
                print("Got 404 for {}. Profile not found.".format(battletag))
                return None
            failure = "HTTP {}".format(inst.code)
        except Exception as inst:
            failure = repr(inst)
        print("Request failed ({}). Sleeping for {}s.".format(failure, retry_time))
        time.sleep(retry_time)
        retry_time = min(retry_time * exp_factor, max_retry_time)
        print("||||||||||")
        print("||||||||||")
        print("||||||||||")
        print("Retrying {}".format(battletag))

    #BeautifulSoup
    soup = bs(html)

    #Get current SR
    rank_div = soup.find("div", {"class": "competitive-rank"})
    if rank_div is None:
        return None
    sr = int(rank_div.find("div", {"class": "u-align-center h5"}).getText())

    #Competitive HTML div
    comp = soup.find(
        "div", {"id": "competitive"}
    ).find(
        "section", {"class": "content-box u-max-width-container career-stats-section"}
    )

    #Extract hero data
    data = hero_data_extractor(comp)

    #Add SR attribute to the info map of every hero
    for d in data:
        d[1]["SR"] = sr

    #Return data
    return data

### Files

In [14]:
#Text file holding the index of the next battletag to scrape. It is read on
#start up to resume and rewritten after every batch that is written to file.
curr_battletag_index_from_scrambled_str = "../data/curr_battletag_index_from_scrambled.txt"
#Csv file with one battletag per line, the battletag being the second column
discovery_scrambled_str = "../data/discovery_scrambled.csv"
#Directory of the <hero>.csv files with the averaged (processed) features
hero_processed_root_path = "../data/hero_processed/"
#Directory of the <hero>.csv files with the raw features
hero_raw_root_path = "../data/hero_raw/"

### Existing file data

In [15]:
#Current battletag index
#The index file is probed with is_file() instead of catching an exception from
#resolve(): resolve() only raises for a missing file in strict mode, so with a
#non-strict resolve() a missing file would go unnoticed until open() fails.
path = Path(curr_battletag_index_from_scrambled_str)
if path.is_file():
    with open(curr_battletag_index_from_scrambled_str, "r") as f:
        curr_battletag_index = int(f.readline())
    print("Current page file FOUND. Resuming scraping from battle index {}.".format(curr_battletag_index))
else:
    print("Current battletag index file NOT FOUND. Using current battletag index value of 0.")
    curr_battletag_index = 0

#All battletags
#The battletag is the second comma separated column of every line
battletags = []
with codecs.open(discovery_scrambled_str, "r", encoding="utf8") as f:
    for line in f:
        battletags.append(line.split(",")[1])

### Hero data writer

In [16]:
#Stored data (below) has the following structure
#{
#    "ana": [(battletag_0, info_0_ana, original_info_0_ana), (battletag_1, info_1_ana, original_info_1_ana), ...],
#    "bastion": [(battletag_0, info_0_bastion, original_info_0_bastion), (battletag_3, info_3_bastion, original_info_3_bastion), ...],
#    "d.va": [(battletag_5, info_5_d.va, original_info_5_d.va)],
#    ...
#}
stored_data = {}
def write_hero_data():
    """
    Appends everything buffered in stored_data to the per hero .csv files and
    then empties the buffer.

    Each hero with buffered entries gets two files named <hero>.csv, one under
    hero_processed_root_path (built from the info map at tuple index 1) and one
    under hero_raw_root_path (built from the info map at tuple index 2). Every
    row is battletag,SR followed by the remaining features in sorted key order.
    The key order is taken from the first buffered entry of the hero. A file that
    does not exist yet first receives a "battletag,SR,<keys>" label row.

    All rows are built before any file is touched. If a dictionary error occurs
    (Tracer entry trying to be added with Soldier76 features) the KeyError is
    raised before that player's info is written to file for other heroes.
    """
    #Historical note from the original author: these changes were made at hero index 9771
    #Holds one (hero, path, label row, data rows) tuple per file with pending rows
    pending_files = []

    #Build the output for each hero, in sorted hero order
    for hero in sorted(stored_data.keys()):
        entries = stored_data[hero]
        #Don't write if no information
        if len(entries) == 0:
            continue
        #Presort keys, should be the same keys for each battletag (unless anomaly)
        sorted_keys_processed = sorted(entries[0][1].keys())
        #SR is always written as the second column, so it is not a regular feature
        sorted_keys_processed.remove("SR")
        sorted_keys_raw = sorted(entries[0][2].keys())

        #(root path, index of the info map inside the stored tuple, ordered keys)
        targets = [
            (hero_processed_root_path, 1, sorted_keys_processed),
            (hero_raw_root_path, 2, sorted_keys_raw)
        ]
        for root_path, info_index, keys in targets:
            labels = ",".join(["battletag", "SR"] + keys)
            rows = []
            #For each battletag-info pair
            for tup in entries:
                #First 2 line items are battletag (id) and SR (target class)
                fields = [tup[0], str(tup[1]["SR"])]
                #Throws a KeyError if this entry lacks one of the expected features
                fields.extend(str(tup[info_index][key]) for key in keys)
                rows.append(",".join(fields))
            pending_files.append((hero, root_path + hero + ".csv", labels, rows))

    #Write to file
    for hero, path, labels, rows in pending_files:
        #exists() is used because resolve() only raises for a missing file in
        #strict mode, which would silently skip the label row otherwise
        is_new_file = not Path(path).exists()
        if is_new_file:
            print("Current {} file DOES NOT exist. Writing feature labels.".format(hero))
        with codecs.open(path, "a", encoding="utf8") as f:
            if is_new_file:
                f.write(labels + "\n")
            for row in rows:
                f.write(row + "\n")

    #Empty stored data
    for hero in stored_data:
        stored_data[hero] = []

This just separates output from the rest of the console when it starts.

In [17]:
#Blank lines to set the scraping output apart from whatever the console shows already
print("\n"*5)

## Main
Below is the main codeblock which runs the entire scraping/writing process. The large comment sections to start are notes on problematic battletag links.

This will crash when PlayOverwatch has a poorly formatted competitive hero entry. This is out of our control.

In [18]:
#Clear stored data when restarting scrape
#Every hero known to hero_to_hash starts over with its own empty list
stored_data.update({hero: [] for hero in hero_to_hash})

#max_count = 13
#count = 0
#Notes:

#https://playoverwatch.com/en-us/career/pc/Tocom-1468
#had biotic field data for tracer on 1/28/2018 2:47pm PT
#->Removed from data set

#https://playoverwatch.com/en-us/career/pc/Persocon-2391
#had teleporter uptime features for non-symmetra characters
#on 1/31/2018 3:27pm PT
#->Removed from data set

#https://playoverwatch.com/en-us/career/pc/ohhsnap-21969
#had 50k biotic field healing done on genji on 2/2/2018 8:25am PT
#->Removed from data set

#https://playoverwatch.com/en-us/career/pc/Dragin-1467
#had 225 enemies frozen on lucio on 2/2/2018 9:06am PT
#->Removed from data set, but was a duplicate entry appearing early.
#Earlier entry was left in database (line 260)

#https://playoverwatch.com/en-us/career/pc/GansiHXD-11210
#81 turret kills on soldier76 on 2/2/2018 6:36pm PT
#->Removed from dataset

#https://playoverwatch.com/en-us/career/pc/SirManselot-2339
#21800 self healing on pharah on 2/5/2018 7:21am PT
#->Removed from dataset

#https://playoverwatch.com/en-us/career/pc/Gibmemore-1472
#379 seconds of teleporter uptime on widowmaker on 2/5/2018 8:02am PT
#-> Removed from dataset
#Earlier entry was left in database, removed entry at index 92086, 2nd entry at 65150

#Shuuya#2847
#biotic field healing done on lucio on 2/6/2018 5:26am PT
#-> Removed from dataset
#(Both occurrences removed from dataset, they were a duplicate later beyond 90k
#around 160k)

#You just need to write this battletag down because...yeah: p00pCream#1155
battletag_count = len(battletags)

def _flush_stored_data(next_battletag_index):
    """
    Writes the buffered hero data to file and stores next_battletag_index in the
    current battletag index file so that a restart resumes from there.
    """
    print("Writing to file.")
    write_hero_data()
    #Update current battletag index
    with open(curr_battletag_index_from_scrambled_str, "w") as f:
        f.write(str(next_battletag_index))
    print("Data stored.")
    print("\n-------------------\n")

#Only open up hero files every n battletags
battletags_stored = 0
write_every_n_battletags = 4
#Iterate over each battletag
print("\nResuming scraping.\n")
for i in range(curr_battletag_index, battletag_count):
    #Scrape data
    print("({}/{}) Scraping battletag {}: {}".format(
        battletags_stored,
        write_every_n_battletags,
        i,
        battletags[i]
    ))
    start_time = time.time()
    recent_data_list = scrape_data_for_battletag(battletags[i])
    print("Took {}s".format(time.time() - start_time))
    
    #In event where a battletag was scraped and known to have an SR, but
    #because of OverwatchTracker or PlayOverwatch misinformation,
    #PlayOverwatch does not have an SR for the player at this time.
    if recent_data_list is None:
        print("No SR...moving on.")
        continue
    
    #Recent_data_list of form [(hero_0, map_hero_0, raw_map_hero_0), (hero_1, map_hero_1, raw_map_hero_1), ...]
    for tup in recent_data_list:
        stored_data[tup[0]].append((battletags[i], tup[1], tup[2]))
    battletags_stored += 1
    
    #Store data if necessary
    if battletags_stored == write_every_n_battletags:
        curr_battletag_index = i+1
        _flush_stored_data(curr_battletag_index)
        
        #Reset battletags stored
        battletags_stored = 0
    
    #count += 1
    #if count == max_count:
    #    break

#Write the last, incomplete batch. Without this the final battletags
#(fewer than write_every_n_battletags) would be scraped but never stored.
if battletags_stored > 0:
    curr_battletag_index = battletag_count
    _flush_stored_data(curr_battletag_index)
    battletags_stored = 0

KeyError: u'biotic_field_healing_done_per_life'

In [19]:
stored_data

{'ana': [(u'Microcity#1653',
   {'SR': 2830,
    u'all_damage_done_per_life': 640.5901639344262,
    u'all_damage_done_per_second': 10.854444444444445,
    'average_life': 59.01639344262295,
    'barrier_damage_done_per_life': 0.0,
    'barrier_damage_done_per_second': 0.0,
    'biotic_grenade_kills_per_life': 0.0,
    'biotic_grenade_kills_per_second': 0.0,
    'cards_per_game': 0.0,
    'critical_hit_accuracy': 0,
    'critical_hits_per_life': 0.0,
    'critical_hits_per_second': 0.0,
    u'deaths': 60,
    u'defensive_assists_per_life': 1.2459016393442623,
    u'defensive_assists_per_second': 0.021111111111111112,
    u'eliminations_per_life': 1.5737704918032787,
    u'eliminations_per_second': 0.02666666666666667,
    u'enemies_slept_per_life': 1.0819672131147542,
    u'enemies_slept_per_second': 0.018333333333333333,
    'environmental_kills_per_life': 0.0,
    'environmental_kills_per_second': 0.0,
    'final_blows_per_elimination': 0.40625,
    u'final_blows_per_life': 0.6393442