In [1]:
import pandas as pd
import numpy as np
import torch
import json
import os

In [2]:
from utils import load_json, DotaTokenizer

# All paths are resolved relative to the repository root (the parent of the
# directory the notebook is launched from), so nothing depends on one
# particular machine's home directory.
project_root = os.path.dirname(os.getcwd())
hero_dpath = os.path.join(project_root, 'data', 'heroes.json')
match_dpath = os.path.join(project_root, 'data', 'main_metadata_2023.csv')
pickbans_dpath = os.path.join(project_root, 'data', 'picks_bans_2023.csv')

# Where to save the dataloaders. The trailing '' keeps a trailing path
# separator, so code that appends a file name by plain concatenation still works.
data_processed_path = os.path.join(project_root, 'data', 'processed', '')
specs_dict_path = os.path.join(data_processed_path, 'dota_matches.json')

In [3]:
# Hero metadata (JSON) is what the tokenizer is built from.
hero_metadata = load_json(hero_dpath)

# Raw CSV tables, kept as DataFrames for inspection.
matches_df = pd.read_csv(match_dpath)
picks_df = pd.read_csv(pickbans_dpath)

# Independent NumPy copies: the processing functions below address columns
# by position rather than by name.
picks = picks_df.to_numpy(copy=True)
results = matches_df.to_numpy(copy=True)

In [4]:
# Extra tokens handed to the tokenizer alongside the hero metadata.
# '[RADWIN]' / '[DIREWIN]' are looked up later as the match-outcome labels.
special_tks = [
    '[PICK]', '[BAN]',
    '[RADSTART]', '[DIRESTART]',
    '[RADWIN]', '[DIREWIN]',
]
tokenizer = DotaTokenizer(hero_metadata, special_tks)

In [5]:
def prune_unmatching_matches(picks_data: np.ndarray, results_data: np.ndarray) -> tuple:
    '''
    Prunes matches that do not appear in both datasets.

    picks_data (np.ndarray): pick/ban rows; column 4 is read as `match_id`;
    results_data (np.ndarray): result rows; column 0 is read as `match_id`.

    Returns a `(picks_data, results_data)` tuple containing only the rows whose
    match id occurs in both inputs. Row order is preserved and the input
    arrays are not modified.
    '''

    pick_ids, result_ids = picks_data[:, 4], results_data[:, 0]

    # Match ids found in exactly one of the two tables. This is computed from
    # the `results_data` argument, not from a notebook-level global, so the
    # function gives the same answer regardless of kernel state.
    conflicting_matches = np.setxor1d(np.unique(pick_ids), np.unique(result_ids))

    mask_picks = ~np.isin(pick_ids, conflicting_matches)
    mask_results = ~np.isin(result_ids, conflicting_matches)

    return picks_data[mask_picks], results_data[mask_results]

In [6]:
def prune_underpicked_matches(picks_data: np.ndarray, results_data: np.ndarray, draft_len: int = 24) -> tuple:
    '''
    Prunes matches that do not contain a full pick/ban sequence
    (i.e. matches whose number of pick/ban rows differs from `draft_len`,
    whether there are too few or too many).

    picks_data (np.ndarray): pick/ban rows; column 4 is read as `match_id`;
    results_data (np.ndarray): result rows; column 0 is read as `match_id`;
    draft_len (int): number of pick/ban rows a complete match must have (default 24).

    Returns a `(picks_data, results_data)` tuple with the defective matches
    removed from both arrays. Row order is preserved.
    '''

    pick_ids = picks_data[:, 4]
    match_ids, id_counts = np.unique(pick_ids, return_counts=True)

    # Any count other than `draft_len` marks the match as defective.
    defective_matches = match_ids[id_counts != draft_len]

    mask_picks = ~np.isin(pick_ids, defective_matches)
    mask_results = ~np.isin(results_data[:, 0], defective_matches)

    return picks_data[mask_picks], results_data[mask_results]

In [7]:
def make_samples_arr(tokenizer: 'DotaTokenizer', picks: np.ndarray, results: np.ndarray, hero_metadata: dict, draft_len: int = 24) -> list:
    '''
    Builds one training sample per match: the match's hero ids followed by a
    single outcome token.

    tokenizer (DotaTokenizer): must expose `simple_ttoi` with the keys
        '[RADWIN]' and '[DIREWIN]';
    picks (np.ndarray): pick/ban rows; column 1 is read as the hero id and
        column 4 as the match id;
    results (np.ndarray): result rows; column 0 is read as the match id and
        column 1 as the radiant-win flag;
    hero_metadata (dict): unused here, kept for interface compatibility;
    draft_len (int): number of pick/ban rows per match (default 24).

    Returns a list of lists of ints. Matches appear in order of first
    appearance in `picks`; hero ids keep the row order they have in `picks`.

    Raises ValueError if a match does not have exactly `draft_len` rows or has
    no row in `results`.
    '''

    if picks.shape[0] % draft_len != 0:
        raise ValueError('The picks array contains matches with a defective number of picks/bans.')

    # Outcome keyed by match id, so the label never depends on the two tables
    # happening to be in the same row order.
    radiant_win = {row[0]: row[1] for row in results}

    # Group hero ids by match id instead of cutting blind fixed-size chunks;
    # dicts preserve insertion order, so well-formed input yields the same
    # sample order as plain chunking would.
    drafts = {}
    for row in picks:
        drafts.setdefault(row[4], []).append(int(row[1]))

    samples = []

    for match_id, sample in drafts.items():
        if len(sample) != draft_len:
            raise ValueError('The picks array contains matches with a defective number of picks/bans.')
        if match_id not in radiant_win:
            raise ValueError(f'No result row found for match {match_id}.')

        # Convert from `bool` to integer before comparing, as the flag may be
        # a Python/NumPy bool.
        if int(radiant_win[match_id]) == 1:
            sample.append(tokenizer.simple_ttoi['[RADWIN]'])
        else:
            sample.append(tokenizer.simple_ttoi['[DIREWIN]'])

        samples.append(sample)

    return samples

In [8]:
def make_sequence_samples(tokenizer: DotaTokenizer, picks: np.array, results: np.array, hero_metadata: dict) -> np.array:

    '''
    Runs the whole preprocessing chain: drop matches missing from either
    table, drop matches without a full pick/ban sequence, then encode what is
    left into samples.

    tokenizer (DotaTokenizer): custom tokenizer for encoding matches;
    picks (np.array): array whose columns [0-4] represent `is_pick`, `hero_id`, `team`, `order`, `match_id`, respectively;
    results (np.array): array whose columns [0-1] represent `match_id`, `radiant_win`, respectively;
    hero_metadata (dictionary): dictionary containing hero: id (int) pairs.

    Returns whatever `make_samples_arr` produces for the pruned data
    (a list of per-match samples).
    '''

    matched_picks, matched_results = prune_unmatching_matches(picks, results)
    complete_picks, complete_results = prune_underpicked_matches(matched_picks, matched_results)

    return make_samples_arr(tokenizer, complete_picks, complete_results, hero_metadata)

In [9]:
# Prune and encode the raw tables into per-match samples.
final_samples = make_sequence_samples(
    tokenizer=tokenizer,
    picks=picks,
    results=results,
    hero_metadata=hero_metadata,
)

In [10]:
from utils import Dotaset

# Wrap the encoded samples in the project's `Dotaset` object; later cells take
# its len() and partition it with `random_split`.
dotaset = Dotaset(final_samples)

In [11]:
# 80% train / 10% validation; the test split takes whatever remains, so the
# three sizes always add up to len(dotaset) despite integer truncation.
train_size, val_size = (int(fraction * len(dotaset)) for fraction in (0.8, 0.1))
test_size = len(dotaset) - (train_size + val_size)

In [12]:
# Vocabulary sizes and per-sample sequence length, stored next to the datasets.
specs = {
    'simple_vocab_len': len(tokenizer.simple_vocab.keys()),
    'alt_vocab_len': len(tokenizer.pickban_vocab.keys()),
    'simple_match_len': len(final_samples[0]),
}

# Create the output directory on first run; without it the write below fails
# on a fresh checkout. `or '.'` covers a bare file name with no directory part.
os.makedirs(os.path.dirname(specs_dict_path) or '.', exist_ok=True)

# Save the dictionary as a JSON file
with open(specs_dict_path, 'w') as json_file:
    json.dump(specs, json_file, indent=4)

In [13]:
from torch.utils.data import random_split

# Seed the split explicitly so the train/val/test partition is the same on
# every Restart & Run All; an unseeded split changes membership on each run.
SPLIT_SEED = 42
train_dataset, val_dataset, test_dataset = random_split(
    dotaset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [14]:
# Write each split to its own file inside the processed-data directory.
for file_name, split in (
    ('dota_train.pt', train_dataset),
    ('dota_val.pt', val_dataset),
    ('dota_test.pt', test_dataset),
):
    torch.save(split, os.path.join(data_processed_path, file_name))