In [1]:
import pandas as pd
import numpy as np
import json
import os

In [2]:
from utils import load_json, DotaTokenizer

project_root = os.path.dirname(os.getcwd())
hero_dpath = os.path.join(project_root, 'data', 'heroes.json')
match_dpath = os.path.join(project_root, 'data', 'main_metadata_2023.csv')
pickbans_dpath = os.path.join(project_root, 'data', 'picks_bans_2023.csv')

In [3]:
picks_df = pd.read_csv(pickbans_dpath)
matches_df = pd.read_csv(match_dpath)
hero_metadata = load_json(hero_dpath)

results = np.array(matches_df)
picks = np.array(picks_df)

In [4]:
special_tks = ['[PAD]', '[PICK]', '[BAN]', '[RADSTART]', '[DIRESTART]', '[RADWIN]', '[DIREWIN]']
tokenizer = DotaTokenizer(hero_metadata, special_tks)

In [5]:
def prune_unmatching_matches(picks_data: np.array, results_data: np.array) -> np.array:
    '''
    Prunes matches that do not appear in both datasets.
    '''

    umps, umrs = np.unique(picks_data[:, 4]), np.unique(results[:, 0]) # Col 4 represents 'match_id', Col 0 representes 'match_id'
    conflicting_matches = np.setxor1d(umps, umrs)
    mask_picks, mask_results = ~np.isin(picks_data[:, 4], conflicting_matches), ~np.isin(results_data[:, 0], conflicting_matches)
    picks_data, results_data = picks_data[mask_picks], results_data[mask_results]

    return picks_data, results_data

In [6]:
def prune_underpicked_matches(picks_data: np.array, results_data: np.array) -> np.array:
    '''
    Prunes matches that do not contain a full pick/ban sequence
    (i.e. matches with less than 24 picks or bans).
    '''

    match_ids, id_counts = np.unique(picks_data[:, 4], return_counts=True)
    defective_matches = match_ids[np.where(id_counts != 24)]
    mask_picks, mask_results = ~np.isin(picks_data[:, 4], defective_matches), ~np.isin(results_data[:, 0], defective_matches)
    picks_data, results_data = picks_data[mask_picks], results_data[mask_results]

    return picks_data, results_data

In [73]:
def make_samples_arr(tokenizer: DotaTokenizer, picks: np.array, results: np.array, hero_metadata: dict) -> np.array:

    if picks.shape[0] % 24 != 0:
        raise ValueError('The picks array contains matches with a defective number of picks/bans.')
    
    sample_arrs = np.split(picks, int(picks.shape[0] / 24), axis = 0)
 
    samples = []

    for idx in range(len(sample_arrs)):
        win_label = int(results[idx][1]) # Convert from `bool` to integer
        sample = [int(hero) for hero in list(sample_arrs[idx][:, 1])]
        print(len(sample))
        print(sample)
        print(len(win_label))

    return np.array(samples)

In [74]:
def make_dataset(tokenizer: DotaTokenizer, picks: np.array, results: np.array, hero_metadata: dict) -> np.array:

    '''
    Tokenizer (DataTokenizer): custom tokenizer for encoding matches;
    picks (np.array): Array whose columns [0-4] represent `is_pick`, `hero_id`, `team`, `order`, `match_id`, respectively;
    results (np.array): Array whose columns [0-1] represent `match_id`, `radiant_win`, respectively;
    hero_metada (dictionary): Dictionary containing hero: id (int) pairs.
    '''

    picks, results = prune_unmatching_matches(picks, results)
    picks, results = prune_underpicked_matches(picks, results)
    sequence_samples = make_samples_arr(tokenizer, picks, results, hero_metadata)

    return picks, results, sequence_samples

In [75]:
apicks, aresults, asamples = make_dataset(tokenizer, picks, results, hero_metadata)

24
[100, 44, 19, 120, 31, 104, 101, 54, 10, 88, 93, 35, 70, 106, 56, 14, 121, 20, 98, 46, 6, 22, 97, 48]


TypeError: object of type 'int' has no len()

In [47]:
splitarr = np.split(apicks, int(apicks.shape[0] / 24), axis=0)

In [57]:
splitarr[0][:, 1]

array([100.0, 44.0, 19.0, 120.0, 31.0, 104.0, 101.0, 54.0, 10.0, 88.0,
       93.0, 35.0, 70.0, 106.0, 56.0, 14.0, 121.0, 20.0, 98.0, 46.0, 6.0,
       22.0, 97.0, 48.0], dtype=object)