In [2]:
import requests
from bs4 import BeautifulSoup
from datetime import date
import time
import ast
import os
import json
import pandas as pd

## Config

In [2]:
api_key = os.getenv('RIOT_API_KEY')

In [3]:
tier_config = {
             'CHALLENGER': ['I'],
             'GRANDMASTER': ['I'],
             'MASTER': ['I'],
             'DIAMOND': ['I','II','III','IV'],
             'PLATINUM': ['I','II','III','IV'],            
             'GOLD': ['I','II','III','IV'],
             'SILVER': ['I','II','III','IV'],
             'IRON': ['I','II','III','IV'],
            }

## API Endpoint

In [4]:
def build_api_endpoint(tier, division, api_key, page=1,  region='na1', queue='RANKED_SOLO_5x5'):
    
    """Constructs the Riot Games league v4 entries endpoint for a tier, division
       Note that API endpoints differ for challenger, master and grandmaster tiers
    """
    
    assert api_key != '', "No API key configured"
    
    base_url = 'https://{}.api.riotgames.com/lol/league/v4/'.format(region)

    if tier not in ['CHALLENGER', 'GRANDMASTER', 'MASTER']:
        api_substring = f'entries/{queue}/{tier}/{division}?page={page}'
    
    else:
        #Change to lowercase
        tier = tier.lower() 
        #Populate individualized api endpoint
        api_substring = f'{tier}leagues/by-queue/{queue}?page={page}'

    api_url = base_url + api_substring

    return api_url

In [5]:
headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Accept-Charset": "application/x-www-form-urlencoded; charset=UTF-8",
            "X-Riot-Token": api_key
          }

## Scrape data

10-20 min

In [6]:
for tier in tier_config: 
    
    conf = tier_config[tier] 
    dfs = []
    scraped =0
    
    for division in conf:
        
        #10 pages
        for p in range(1, 11):
            
            api = build_api_endpoint(tier=tier, division=division, api_key=api_key, page=p)
            r = requests.get(api, headers=headers)
    
            if r.ok:
            
                d = json.loads(r.text)
                scraped += len(d)
                dfs.append(pd.DataFrame(d))


            else:
                print(r.status_code, r.text)
                
        print(f'Scraped {scraped} summoner name info for {tier} (on division {division})')
        time.sleep(60)
    
    frame = pd.concat(dfs)

    # setting up filepath 
    data_folder = 'data/'
    base_filename = 'api-{}-{}'.format(tier,division)
    timestamp = str(date.today()).replace('-','')
    ext = '.csv'
    filepath = '_'.join([data_folder+base_filename,timestamp+ext])
    
    frame.to_csv(filepath,index=False,header=True)

Scraped 50 summoner name info for CHALLENGER (on division I)
Scraped 50 summoner name info for GRANDMASTER (on division I)
Scraped 50 summoner name info for MASTER (on division I)
Scraped 2050 summoner name info for DIAMOND (on division I)
Scraped 4100 summoner name info for DIAMOND (on division II)
Scraped 6150 summoner name info for DIAMOND (on division III)
Scraped 8200 summoner name info for DIAMOND (on division IV)
Scraped 2050 summoner name info for PLATINUM (on division I)
Scraped 4100 summoner name info for PLATINUM (on division II)
Scraped 6150 summoner name info for PLATINUM (on division III)
Scraped 8200 summoner name info for PLATINUM (on division IV)
Scraped 2050 summoner name info for GOLD (on division I)
Scraped 4100 summoner name info for GOLD (on division II)
Scraped 6150 summoner name info for GOLD (on division III)
Scraped 8200 summoner name info for GOLD (on division IV)
Scraped 2050 summoner name info for SILVER (on division I)
Scraped 4100 summoner name info for S

## Concatenate

In [3]:
os.path.abspath('')

'C:\\Users\\yeqiu\\Desktop\\opgg'

In [4]:
fldr = os.path.join(os.path.abspath(''), 'data/')

In [5]:
usernames = []

for f in os.listdir(fldr):
    
    fp = os.path.join(fldr,f)
    
    assert fp.endswith('csv'), 'Expected csv file'
    
    print("Reading {} of size {} MB".format(fp, round(os.path.getsize(fp)*10e-7 ,2) ))
    
    #Challenger, master and grandmaster
    if any(i in fp for i in ['CHALLENGER', 'MASTER', 'GRANDMASTER']):
        
        df = pd.read_csv(fp)
        
        #Str dict -> dict 
        df.entries = df.entries.map(lambda x: ast.literal_eval(x))
        #Normalize on entries 
        df = pd.concat([df.drop(
                                    ['entries'], axis=1
                                ),
                        pd.json_normalize(df.entries)
                       ], axis=1
                      )

        df = df.rename(columns={'queueType': 'queue'})
        
    else:

        df = pd.read_csv(fp)
    
    usernames.append(df)

Reading C:\Users\yeqiu\Desktop\opgg\data/api-CHALLENGER-I_20210923.csv of size 0.97 MB
Reading C:\Users\yeqiu\Desktop\opgg\data/api-DIAMOND-IV_20210923.csv of size 1.33 MB
Reading C:\Users\yeqiu\Desktop\opgg\data/api-GOLD-IV_20210923.csv of size 1.29 MB
Reading C:\Users\yeqiu\Desktop\opgg\data/api-GRANDMASTER-I_20210923.csv of size 2.26 MB
Reading C:\Users\yeqiu\Desktop\opgg\data/api-IRON-IV_20210923.csv of size 1.3 MB
Reading C:\Users\yeqiu\Desktop\opgg\data/api-MASTER-I_20210923.csv of size 8.41 MB
Reading C:\Users\yeqiu\Desktop\opgg\data/api-PLATINUM-IV_20210923.csv of size 1.33 MB
Reading C:\Users\yeqiu\Desktop\opgg\data/api-SILVER-IV_20210923.csv of size 1.31 MB


In [6]:
frame = pd.concat(usernames)

In [7]:
frame.shape

(77380, 16)

In [8]:
frame['tier'].value_counts()

MASTER         26380
DIAMOND         8200
GOLD            8200
IRON            8200
PLATINUM        8200
SILVER          8200
GRANDMASTER     7000
CHALLENGER      3000
Name: tier, dtype: int64

In [9]:
frame.nunique()

tier                8
leagueId         9597
queue               1
name                3
summonerId      43738
summonerName    43731
leaguePoints      900
rank                4
wins              917
losses            901
veteran             2
inactive            1
freshBlood          2
hotStreak           2
queueType           1
miniSeries         19
dtype: int64

In [10]:
frame[frame['summonerName'].duplicated()].nunique()

tier               8
leagueId         803
queue              1
name               3
summonerId      4542
summonerName    4539
leaguePoints     900
rank               4
wins             785
losses           763
veteran            2
inactive           1
freshBlood         2
hotStreak          2
queueType          1
miniSeries        12
dtype: int64

### Duplicates

In [11]:
frame[frame['summonerName'].duplicated()].groupby('tier').count()['summonerId']

tier
CHALLENGER      2700
DIAMOND          182
GOLD             180
GRANDMASTER     6301
IRON             181
MASTER         23742
PLATINUM         182
SILVER           180
Name: summonerId, dtype: int64

In [12]:
frame.drop_duplicates(subset=['summonerName']).groupby('tier').count()['summonerId']

tier
CHALLENGER      300
DIAMOND        8018
GOLD           8020
GRANDMASTER     699
IRON           8019
MASTER         2638
PLATINUM       8018
SILVER         8020
Name: summonerId, dtype: int64

MASTER, GRANDMASTER, and CHALLENGER leagues have <8020 players. So the data makes sense.

### Drop Duplicates, Export

In [13]:
frame = frame.drop_duplicates(subset=['summonerName'])

In [14]:
frame.shape

(43732, 16)

In [15]:
frame.nunique()

tier                8
leagueId         9597
queue               1
name                3
summonerId      43732
summonerName    43731
leaguePoints      900
rank                4
wins              917
losses            901
veteran             2
inactive            1
freshBlood          2
hotStreak           2
queueType           1
miniSeries         19
dtype: int64

In [16]:
frame.queue.value_counts()

RANKED_SOLO_5x5    3637
Name: queue, dtype: int64

In [17]:
frame.tier.value_counts()

GOLD           8020
SILVER         8020
IRON           8019
DIAMOND        8018
PLATINUM       8018
MASTER         2638
GRANDMASTER     699
CHALLENGER      300
Name: tier, dtype: int64

In [18]:
# setting up filepath 
data_folder = 'data/'
base_filename = 'api-all'
timestamp = str(date.today()).replace('-','')
ext = '.csv'
filepath = '_'.join([data_folder+base_filename,timestamp+ext])

#export
frame.to_csv(filepath,index=False,header=True)