In [1]:
import requests
import json
import time
import numpy as np
import pandas as pd
from statistics import mean
from sklearn import preprocessing, impute
from collections import Counter

In [2]:
# Base URL that get_request() prefixes to every endpoint path
api_url = "https://api.opendota.com/api"

In [3]:
# Mutable bookkeeping shared by every get_request() call:
#   count     - calls made in the current rate-limit window
#   curr_time - timestamp at which the current window started
#   total_cnt - calls made since this cell was run
rate_limit = dict(
    count=0,
    curr_time=time.time(),
    total_cnt=0,
)

In [4]:
def get_request(url_extensiton, rate_limit, params=None):
    """Send a rate-limited GET request to the API rooted at `api_url`.

    Parameters
    ----------
    url_extensiton : str
        Endpoint path appended to `api_url` (e.g. '/heroes').
    rate_limit : dict
        Shared counter dict with keys 'count', 'curr_time' and 'total_cnt';
        it is updated in place on every call.
    params : dict, optional
        Query-string parameters forwarded to `requests.get`.

    Returns
    -------
    dict
        {'response_code': 200, 'body': <decoded JSON>} on success, otherwise
        {'response_code': <status>, 'error': <response text>}.
    """
    # Ensure rate limit of 60 calls/min is not exceeded: after every 60th call,
    # wait out the remainder of the window (plus a 2 s margin) before continuing.
    if rate_limit['count'] == 60:
        rate_limit['count'] = 0
        time_elapsed = time.time() - rate_limit['curr_time']
        if time_elapsed < 60:
            time.sleep(62 - time_elapsed)
        rate_limit['curr_time'] = time.time()

    rate_limit['count'] += 1
    rate_limit['total_cnt'] += 1
    if rate_limit['total_cnt'] == 50000:
        print('monthly limit reached')

    # make get request
    response = requests.get(api_url + url_extensiton, params=params)
    if response.status_code != 200:
        return {'response_code': response.status_code,
                'error': response.text}

    return {'response_code': 200,
            'body': json.loads(response.text)}

In [5]:
# Build a lookup from hero id to the hero's display name
heroes_json = get_request('/heroes', rate_limit)

heroes = {entry['id']: entry['localized_name'] for entry in heroes_json['body']}

len(heroes)

122

## Large Instances and Small dimensionality

### Build the dataset with API SQL query

#### Create single SQL Query

In [30]:
# Function that makes the full SQL query to the DOTA2 API
def sql_query(match_id=None):
    """Fetch one batch (at most 20000 rows) of matches through the '/explorer' endpoint.

    Parameters
    ----------
    match_id : int, optional
        When given, only matches with a strictly smaller match_id are returned,
        which lets the caller page backwards through the results.

    Returns
    -------
    pandas.DataFrame or None
        One row per match (newest first), or None when the request did not
        return status 200 (the status and error text are printed).
    """
    # int() guarantees that only a number is interpolated into the SQL text;
    # `is not None` keeps a legitimate id of 0 from being treated as "no filter".
    id_filter = ""
    if match_id is not None:
        id_filter = "AND matches.match_id < {}".format(int(match_id))

    # Define full query.
    # player_slot encodes the side: 0-127 is Radiant, 128-255 is Dire, so the
    # Dire picks are the rows with player_slot >= 128 and the Radiant picks
    # are the rows with player_slot < 128.
    query = """
    SELECT

    matches.match_id,
    matches.radiant_team_id,
    matches.dire_team_id,
    matches.game_mode,
    matches.cluster,
    matches.lobby_type,
    matches.radiant_win,
    dire_team.her_o_ids as dire_heros,
    radiant_team.her_o_ids as radiant_heros,
    leagues.tier,
    tr1.rating as dire_rating,
    tr1.wins as dire_wins,
    tr1.losses as dire_losses,
    tr2.rating as radiant_rating,
    tr2.wins as radiant_wins,
    tr2.losses as radiant_losses
    FROM matches
    JOIN (SELECT match_id, string_agg(pl.hero_id::text, ',') as her_o_ids FROM player_matches as pl where player_slot >= 128 group by match_id) as dire_team using (match_id)
    JOIN (SELECT match_id, string_agg(pl.hero_id::text, ',') as her_o_ids FROM player_matches as pl where player_slot < 128 group by match_id) as radiant_team using (match_id)
    JOIN leagues using(leagueid)
    JOIN team_rating as tr1 ON tr1.team_id = matches.dire_team_id
    JOIN team_rating as tr2 ON tr2.team_id = matches.radiant_team_id
    WHERE matches.human_players = 10
    AND matches.radiant_team_id IS NOT NULL
    AND matches.dire_team_id IS NOT NULL
    %s
    ORDER BY matches.match_id DESC
    LIMIT 20000;
    """ % id_filter

    # Request data matching query
    response = get_request('/explorer', rate_limit, {'sql': query})

    # ensure error is not returned
    if response['response_code'] != 200:
        print("{} {}".format(response['response_code'], response['error']))
        return None

    return pd.DataFrame(response['body']['rows'])

#### Fetch all possible matches that fit the query

**Note:** this cell may need to be run a couple of times before it succeeds (the API is glitchy).

In [62]:
# All instances cannot be retrieved at once, so it is done in batches
# (sql_query returns at most 20k rows per call).

# The explorer endpoint fails intermittently, so each batch is retried a few
# times before giving up.
MAX_RETRIES = 5

# Batches are collected in a list and concatenated once at the end
batches = []

# Last ID that's retrieved
min_id = None

# Keep fetching until the API has no older matches left
while True:
    # Get a batch of matches (sql_query returns None when the request failed)
    query_df = None
    for attempt in range(MAX_RETRIES):
        query_df = sql_query(min_id)
        if query_df is not None:
            break
        time.sleep(2 * (attempt + 1))  # brief back-off before retrying
    if query_df is None:
        raise RuntimeError(
            "Fetching matches below id {} failed {} times in a row".format(min_id, MAX_RETRIES)
        )

    # An empty batch means every matching row has been retrieved
    if query_df.empty:
        break
    print(query_df.shape)

    # store the min match_id so next batch can be retrieved
    min_id = int(query_df['match_id'].min())
    print(min_id)

    # add batch to storage
    batches.append(query_df)

# A fresh 0..n-1 index avoids duplicate row labels across batches
df = pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()

df.shape

(20000, 16)
5627655668
(20000, 16)
5068880258
(20000, 16)
3604296148
(20000, 16)
1943364461
(20000, 16)
357589264
(4945, 16)
19150047


(104945, 16)

In [63]:
## Mirror every match (dire <-> radiant) to double the dataset

# Map each team-specific column to its counterpart on the other side
swap = {}
for feature in ('heros', 'rating', 'wins', 'losses'):
    swap['dire_' + feature] = 'radiant_' + feature
    swap['radiant_' + feature] = 'dire_' + feature

# Relabel the swapped columns, put them back in the original column order,
# and invert the outcome so it still describes the (new) radiant side
df_inv = df.rename(columns=swap).reindex(columns=df.columns)
df_inv['radiant_win'] = ~df_inv['radiant_win']

# add the mirrored instances to the original df
df = pd.concat([df, df_inv], ignore_index=True)


### Preprocess Dataset

In [64]:
# Label-encode tier, then one-hot encode cluster, game_mode and tier: their
# values are treated as arbitrary category codes rather than quantities.
le = preprocessing.LabelEncoder()
df['tier'] = le.fit_transform(df['tier'])

enc = preprocessing.OneHotEncoder()
for column in ('cluster', 'game_mode', 'tier'):
    values = np.array(df[column]).reshape(-1, 1)
    dense = enc.fit_transform(values).toarray()
    # One new column per category, named <column>_<position>
    encoded_names = ['{}_{}'.format(column, position) for position in range(len(dense[0]))]
    df.loc[:, encoded_names] = dense

In [65]:
# Snapshot the frame before hero columns are added, so the second
# (non one-hot) dataset below can start again from this state
dfe = df.copy()

#### Create dataset with one-hot encoding on heroes (dire and radiant separately)

In [66]:
# Add one-hot encoding to heros: one indicator column per hero in `heroes`,
# separately for dire and for radiant
def _hero_indicators(hero_strings):
    """Return one list of 0/1 flags per row, ordered like `heroes`.

    Each element of `hero_strings` is a comma-separated string of hero ids.
    """
    hero_ids = [str(hero_id) for hero_id in heroes.keys()]
    rows = []
    for hero_string in hero_strings:
        # Split once per row instead of once per (row, hero) pair
        picked = set(hero_string.split(','))
        rows.append([1 if hero_id in picked else 0 for hero_id in hero_ids])
    return rows


for team in ('dire', 'radiant'):
    names = ['{}_{}'.format(team, str(i)) for i in heroes.values()]
    df[names] = _hero_indicators(df['{}_heros'.format(team)])

  self[col] = igetitem(value, i)


In [67]:
# Write out every column except the identifiers and the raw fields that were
# replaced by encoded columns above
skip = ['match_id', 'radiant_team_id', 'dire_team_id', 'dire_heros', 'radiant_heros', 'cluster', 'game_mode', 'tier']
excluded = set(skip)
columns = [name for name in df.columns if name not in excluded]
df[columns].to_csv("../data/dota2_matches_large_encoded.csv", index=False)

In [68]:
# (rows, columns) of the data that was just written to CSV
df[columns].shape

(209890, 326)

#### Create dataset without encoding

In [69]:
# Start again from the snapshot stored in `dfe`, discarding the hero columns
# that were added to `df` for the one-hot dataset
df = dfe.copy()

In [70]:
# Function to determine whether hero is chosen by a player in either team
def hero_exists(hero, dires, radiants):
    """Report which team, if any, picked `hero`.

    Parameters
    ----------
    hero : str
        Hero id as a string.
    dires, radiants : str or collection of str
        Hero ids picked by each team, either as a comma-separated string
        (e.g. "12,35,101") or as an already split collection of id strings.

    Returns
    -------
    int
        1 if the hero is in `dires`, 2 if it is in `radiants`, 0 otherwise.
    """
    def _ids(team):
        # Split comma-separated strings so ids are compared whole; a plain
        # `hero in "12,35"` is a substring test and would match hero "1".
        if isinstance(team, str):
            return [part.strip() for part in team.split(',')]
        return team

    if hero in _ids(dires):
        return 1
    if hero in _ids(radiants):
        return 2
    return 0

In [71]:
# Create one column per hero holding the hero_exists() code for each match
hero_ids = [str(hero_id) for hero_id in heroes.keys()]
hero_names = list(heroes.values())
hero_choices = [
    [hero_exists(hero_id, dire, radiant) for hero_id in hero_ids]
    for dire, radiant in zip(df['dire_heros'], df['radiant_heros'])
]
df[hero_names] = hero_choices

In [72]:
# Write out every column except the identifiers and the raw fields listed in `skip`
skip = ['match_id', 'radiant_team_id', 'dire_team_id', 'dire_heros', 'radiant_heros', 'cluster', 'game_mode', 'tier']
excluded = set(skip)
columns = [name for name in df.columns if name not in excluded]
df[columns].to_csv("../data/dota2_matches_large.csv", index=False)

In [73]:
# (rows, columns) of the data that was just written to CSV
df[columns].shape

(209890, 204)