In [1]:
import pandas as pd 
import numpy as np
import json
import os
import re
from datetime import datetime
from datetime import date 
import requests
from bs4 import BeautifulSoup
import sys

#from .. import Helpers
sys.path.append(os.path.join(os.path.abspath(''),'..'))

from helpers import PageParser

## Helpers

In [2]:
# Static HTTP headers sent with every request in this notebook; they imitate a
# desktop Chrome browser. There is no header/user-agent rotation.
# NOTE(review): 'sec-ch-ua' advertises Chrome 89 while 'User-Agent' reports
# Chrome/88, and 'Referer' is empty — confirm both are intentional.
headers = {
    'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
    'Referer': '',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36'
}

In [3]:
def build_api_endpoint(user_id: int, start_info: int = 0) -> str:

    """ Build the OPGG match-history URL for one user.

        OPGG uses AJAX XHR requests to fetch data. We can send a GET
        request to the same public endpoint per user and fetch data
        all the same.

        @user_id: integer value found in data-summoner-id param of OPGG user pages
        @start_info: integer value used for pagination;
                     first page is 0, next is Unix epoch
                     time (https://www.epochconverter.com/)

        Returns the endpoint URL as a string, restricted to solo-ranked games.
    """

    base_url = 'https://na.op.gg/summoner/matches/ajax/averageAndList/'

    # The parameters are appended directly to the path (no '?'); this is the
    # exact URL shape the notebook has always requested.
    return f'{base_url}startInfo={start_info}&summonerId={user_id}&type=soloranked'

In [4]:
def parse(payload: dict, user_name: str) -> 'pd.DataFrame':

    """ Get game data (last 10 ranked games) per user

        See: git issue ...
        Note: normally this would pose an error for unranked players;
        however, by looking @ usernames.ipynb we can confirm
        that we only track user IDs for currently ranked players
        who own a division. By default, this means they've played 10
        games (placements).

        @payload: decoded JSON response; must contain 'lastInfo'
                  (Unix epoch seconds) and 'html' (markup of the game list)
        @user_name: name stored in the 'user_name' column of every row

        Returns a DataFrame built from PageParser.parse_page() output plus
        the 'last_updated' and 'user_name' columns.
        Raises KeyError if 'lastInfo' or 'html' is missing from payload.
    """
    # Local import: the notebook's import cell only brings in datetime/date.
    from datetime import timezone

    recency = payload['lastInfo']
    # Convert epoch seconds -> readable UTC date. fromtimestamp(..., tz=utc)
    # replaces utcfromtimestamp(), which is deprecated since Python 3.12;
    # the formatted string is identical.
    readable_date = (datetime.fromtimestamp(recency, tz=timezone.utc)
                     .strftime('%Y-%m-%d %H:%M:%S'))

    # Parse HTML contained in XHR requests. The parser is named explicitly so
    # bs4 does not guess one (GuessedAtParserWarning) and results do not
    # depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(payload['html'], 'html.parser')
    games = soup.find_all("div", {"class": "GameItemList"})

    # presumably parse_page returns a dict of column name -> list of values;
    # verify against helpers.PageParser
    parsed = PageParser().parse_page(games)

    parsed['last_updated'] = readable_date
    parsed['user_name'] = user_name

    return pd.DataFrame.from_dict(parsed)

## Load Data

In [5]:
# Show the notebook's working directory; the data folder below is resolved relative to it.
os.path.abspath('')

'/Users/liamisaacs/Desktop/Personal Github Repositories/metis-project3/v0.0.0'

In [6]:
# Locate the user-ID export produced by the previous pipeline step.
search_params = 's2-user_ids'
fldr = os.path.join(os.path.abspath(''), '..', 'data')

# Sort the candidates so the choice is deterministic: os.listdir() returns
# names in arbitrary order, and the files carry a YYYYMMDD suffix, so the
# last name after sorting is the most recent export.
matches = sorted(f for f in os.listdir(fldr) if search_params in f)

# Fail loudly instead of leaving `fp` undefined (or stale from an earlier run).
if not matches:
    raise FileNotFoundError(f"No file containing '{search_params}' found in {fldr}")

for f in matches:
    print(f)

fp = os.path.join(fldr, matches[-1])

fp

s2-user_ids_20210927.csv


'/Users/liamisaacs/Desktop/Personal Github Repositories/metis-project3/v0.0.0/../data/s2-user_ids_20210927.csv'

In [7]:
# Load the user-ID file found above; later cells read its user_id and user_name columns.
df = pd.read_csv(fp)

In [8]:
# Number of distinct user IDs in the loaded file.
df.user_id.nunique()

43263

In [9]:
# (rows, columns) of the loaded file; more rows than distinct IDs means some IDs are duplicated.
df.shape

(43265, 2)

In [10]:
# Number of distinct user names (nunique() ignores missing values by default).
df.user_name.nunique()

43264

In [11]:
def print_progress(iteration, total, prefix ='', suffix ='', decimals =1, length =100, fill ='█', printEnd="\r"):
    """Draw a one-line text progress bar on stdout.

    @iteration: number of completed steps
    @total: total number of steps; 0 is rendered as a full (100%) bar
            instead of raising ZeroDivisionError
    @prefix: text printed before the bar
    @suffix: text printed after the percentage
    @decimals: number of decimal places shown in the percentage
    @length: width of the bar in characters
    @fill: character used for the completed part of the bar
    @printEnd: line terminator; the default "\\r" lets the next call overwrite the bar

    A newline is printed once iteration equals total.
    """
    if total:
        fraction = iteration / float(total)
        filled_length = int(length * iteration // total)
    else:
        # Nothing to do: show a complete bar rather than dividing by zero.
        fraction = 1.0
        filled_length = length

    percent = f"{100 * fraction:.{decimals}f}"
    bar = fill * filled_length + '-' * (length - filled_length)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end=printEnd)

    # Move to a fresh line when the work is finished.
    if iteration == total:
        print()

In [12]:
# Drop rows without a user_name; the scrape loop labels every fetched row with it.
df = df.dropna(subset=['user_name'])

In [14]:
# Fetch and parse the match history of every user in `df`, collecting the
# per-user DataFrames into `main`. Failures are counted and logged, and the
# loop moves on to the next user.

REQUEST_TIMEOUT = 30  # seconds; without a timeout one stalled connection hangs the whole run

total = len(df)

# Skip the initial bar for an empty frame (a total of 0 cannot be rendered).
if total:
    print_progress(0, total, prefix ='', suffix ='Complete')

failed = 0

# Collect the frames and concatenate once at the end: DataFrame.append copied
# the whole frame on every call and no longer exists in pandas >= 2.0.
# This also removes the old `i == 0` special case, which left `main`
# undefined (and made every later user "fail") when the first request failed.
game_frames = []

try:
    for i, (user_id, user_name) in enumerate(zip(df.user_id.values,
                                                 df.user_name.values)):

        user_id = int(user_id)

        #Build api endpoint
        api_endpoint = build_api_endpoint(user_id)

        #Fetch payload
        try:
            r = requests.get(api_endpoint, headers=headers, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()  # treat HTTP error responses as failures up front

            payload = json.loads(r.text)
            game_frames.append(parse(payload=payload, user_name=user_name))

        # Best effort per user, but `Exception` (not a bare except) lets
        # KeyboardInterrupt stop the run; the cause is logged for diagnosis.
        except Exception as exc:
            failed += 1
            print(f'Error fetching {user_name} with id {user_id} @ {api_endpoint}: {exc!r}')

        if i%1000==0:
            print("Scraped {} of {}, {} remaining, failed {}".format(i,
                                                                     total,
                                                                     total-i,
                                                                     failed
                                                                    ))

        # i + 1 users are done at this point, so the bar reaches 100% on the last one.
        print_progress(i + 1, total, prefix ='', suffix ='Complete')

finally:
    # Build `main` even if the run is interrupted, so partial results are kept.
    main = pd.concat(game_frames) if game_frames else pd.DataFrame()

Scraped 0 of 43264, 43264 remaining, failed 0---------------------------------------------------------| 0.0% Complete
Error fetching SMD Mtnops with id 101241265 @ https://na.op.gg/summoner/matches/ajax/averageAndList/startInfo=0&summonerId=101241265&type=soloranked
Scraped 1000 of 43264, 42264 remaining, failed 1------------------------------------------------------| 2.3% Complete
Error fetching Chynx with id 112397782 @ https://na.op.gg/summoner/matches/ajax/averageAndList/startInfo=0&summonerId=112397782&type=soloranked
Error fetching Vyvie with id 110886473 @ https://na.op.gg/summoner/matches/ajax/averageAndList/startInfo=0&summonerId=110886473&type=soloranked
Error fetching Erlkönig with id 106110162 @ https://na.op.gg/summoner/matches/ajax/averageAndList/startInfo=0&summonerId=106110162&type=soloranked
Scraped 2000 of 43264, 41264 remaining, failed 4------------------------------------------------------| 4.6% Complete
Error fetching I love Kayn with id 112332800 @ https://na.op.g

In [15]:
# Work on a copy so the scraped results in `main` stay untouched.
frame = main.copy()

In [16]:
# (rows, columns) of the scraped game data.
frame.shape

(585455, 18)

In [17]:
# Inspect one value of the 'items' column (the saved output shows a list of item names).
frame['items'].values[1]

['Eclipse', 'Refillable Potion', 'Serrated Dirk', 'Stealth Ward', 'Boots']

In [18]:
# Assemble the export filename: <data_folder><base_filename>_<YYYYMMDD><ext>.
# data_folder is empty, so the file is written to the current working directory.
data_folder = ''
base_filename = 's3-user_data'
ext = '.csv'
timestamp = date.today().strftime('%Y%m%d')
filepath = f'{data_folder}{base_filename}_{timestamp}{ext}'

# Write the scraped games to CSV with a header row and without the index.
frame.to_csv(filepath, index=False, header=True)

In [20]:
# Spot-check one random row.
# NOTE(review): the saved output has an 'Unnamed: 0' column and execution
# count 19 is missing, so `frame` was presumably reassigned (e.g. re-read from
# the CSV) in a removed cell — confirm this cell reproduces on a fresh kernel.
frame.sample(1)

Unnamed: 0,game_id,result,game_length,champion,spell_0,spell_1,rune_0,rune_1,kills,deaths,assists,levels,cs,kp,items,wards,last_updated,user_name
16,4036251327,Remake,3m 25s,Akali,Flash,Ignite,Electrocute,Precision,0,0,0,Level3,10 (2.9) CS,P/Kill 0%,"[Dark Seal, Refillable Potion, Stealth Ward]",0,2021-09-07 17:26:55,AGGELMAN24
