# receive and transform data

The data should be received, then transformed (and then saved locally).

In [1]:
import sys
import yaml
import tqdm
import numpy as np
import pandas as pd
sys.path.append('/home/joel/projects/driftlon/analysis')
sys.path.append('/home/joel/projects/driftlon')
sys.path.append('/home/joel/projects/driftlon/data_fetcher')
from get_from_db import DBReader
from write_to_db import DBWriter
from my_data_fetcher import MyDataFetcher
from analysis_utils import transform_data_to_np

load data

In [2]:
# Read the MongoDB connection settings. The file is opened in a `with` block so
# the handle is closed as soon as the YAML has been parsed.
with open('/home/joel/projects/driftlon/config.yml', 'r') as config_file:
    mongo_config = yaml.safe_load(config_file)['mongodb']

# All three helpers are constructed from the same address / username / password.
db_reader = DBReader(mongo_config['address'], mongo_config['username'], mongo_config['password'])
db_writer = DBWriter(mongo_config['address'], mongo_config['username'], mongo_config['password'])
fetcher = MyDataFetcher(mongo_config['address'], mongo_config['username'], mongo_config['password'])

In [3]:
# Stats passed to the fetcher as `additional_cumulative_stats`.
additional_cumulative_stats = [
    'totalDamageDealtToChampions',
    'kills',
    'deaths',
    'assists',
    'totalDamageTaken',
    'goldEarned',
]

# Fetch one batch (first argument 10, offset 5000) together with its targets
# and player ids.
raw_filtered_data, targets, player_ids = fetcher.get_filtered_data_batch(
    10,
    offset=5000,
    additional_cumulative_stats=additional_cumulative_stats,
)

### to pandas

In [4]:
# Transform every raw game, then transpose the per-game results into two tuples.
data, feature_names = zip(*(transform_data_to_np(game) for game in raw_filtered_data))

In [5]:
# Build a DataFrame straight from the raw fetched records.
data_pd = pd.DataFrame(data=raw_filtered_data)
data_pd.head();  # trailing ';' keeps the preview out of the cell output

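Load the config files: `meta_fields` lists the columns to drop, and `non_numeric_fields` is used to split the remaining columns into numeric and categorical ones.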

In [6]:
# Field list used further down to tell numeric columns from non-numeric ones.
# BaseLoader keeps every YAML scalar as a plain string.
non_numeric_fields_path = './non_numeric_fields.yaml'

with open(non_numeric_fields_path, 'r') as fields_stream:
    non_numeric_fields = yaml.load(fields_stream, Loader=yaml.BaseLoader)
    

In [7]:
# Columns that are dropped from the frame before the pipelines run.
# BaseLoader keeps every YAML scalar as a plain string.
meta_fields_path = './meta_fields.yaml'

with open(meta_fields_path, 'r') as fields_stream:
    meta_fields = yaml.load(fields_stream, Loader=yaml.BaseLoader)

drop columns that should not be used

In [8]:
# Remove the meta columns; `data_pd` itself is left untouched.
dropped_pd = data_pd.drop(columns=meta_fields)
dropped_pd.head();  # trailing ';' keeps the preview out of the cell output

build pipelines for categorical and numeric data

then use an imputer to fill in MISSING values

then add the player's SHARE of the team's kills/deaths/dmg etc

(then add the player's dmg/kills/deaths etc PER MIN)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from custom_transformer import TeamShareAdder, StatPerTimeAdder

In [10]:
# Split the remaining columns: those listed in `non_numeric_fields` are
# categorical, every other column is treated as numeric.
cat_fields = [name for name in dropped_pd.columns if name in non_numeric_fields]
numeric_fields = [name for name in dropped_pd.columns if name not in cat_fields]

In [11]:
def aggregation_config_builder(stats):
    """Pair each stat's player column name with its team column name.

    Args:
        stats: iterable of stat names (e.g. 'kills').

    Returns:
        A list of ``('stats_<stat>', 'team_<stat>')`` tuples, in input order.
    """
    column_pairs = []
    for stat in stats:
        column_pairs.append((f'stats_{stat}', f'team_{stat}'))
    return column_pairs

def feature_time_config_builder(stats):
    """Build the ``'stats_<stat>'`` column name for every given stat.

    Args:
        stats: iterable of stat names (e.g. 'kills').

    Returns:
        A list of ``'stats_<stat>'`` strings, in input order.
    """
    column_names = []
    for stat in stats:
        column_names.append(f'stats_{stat}')
    return column_names

# Reuse `additional_cumulative_stats` (the list passed to the fetcher) instead of
# repeating the same six stat names twice here, so the fetched stats and the
# derived-feature configuration cannot drift out of sync.
aggregation_config = aggregation_config_builder(additional_cumulative_stats)
features_per_time = feature_time_config_builder(additional_cumulative_stats)

In [12]:
# Numeric branch: median imputation, followed by the two custom adders.
# (Remove the 'stat_per_time_adder' step to run with the team-share step only.)
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('team_share_adder', TeamShareAdder(numeric_fields, aggregation_config)),
    ('stat_per_time_adder', StatPerTimeAdder(numeric_fields, features_per_time)),
])

In [13]:
# Categorical branch: a single one-hot encoding step with default settings.
cat_pipeline = Pipeline(steps=[
    ('one_hot', OneHotEncoder()),
])

In [14]:
# Route the numeric columns through `num_pipeline` and 'timeline_lane' through
# `cat_pipeline`.
# NOTE(review): `cat_fields` is computed earlier but only 'timeline_lane' is
# encoded here - confirm that this is intended.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_fields),
    ('cat', cat_pipeline, ['timeline_lane']),
])

In [15]:
# Fit the combined transformer on the cleaned frame and transform it in one go.
data_post_pipeline = full_pipeline.fit_transform(dropped_pd)

[0, 1, 2, 15, 26, 30] ['stats_totalDamageDealtToChampions', 'stats_kills', 'stats_deaths', 'stats_assists', 'stats_totalDamageTaken', 'stats_goldEarned']


In [16]:
# Wrap the pipeline output in a DataFrame; no column names are passed, so the
# columns get default integer labels.
post_df = pd.DataFrame(data=data_post_pipeline)
post_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83,84,85,86,87,88,89,90,91,92
0,3.0,2.0,8.0,2.0,1.0,1.0,492.0,0.0,0.0,0.0,...,0.171551,0.003212,0.002141,0.008565,4.369379,13.377944,6.88758,0.0,0.0,1.0
1,6.0,5.0,6.0,3.0,2.0,2.0,348.0,1.0,0.0,0.0,...,0.212545,0.00375,0.003125,0.00375,9.4175,7.874375,7.45875,1.0,0.0,0.0
2,6.0,7.0,8.0,3.0,1.0,1.0,445.0,0.0,0.0,0.0,...,0.218735,0.004202,0.004902,0.005602,8.060924,20.62395,6.764706,0.0,1.0,0.0
3,7.0,2.0,14.0,4.0,1.0,2.0,875.0,0.0,0.0,0.0,...,0.223226,0.003964,0.001133,0.007928,12.665912,17.635334,8.254813,0.0,1.0,0.0
4,6.0,5.0,8.0,2.0,2.0,2.0,629.0,1.0,0.0,0.0,...,0.205328,0.004024,0.003353,0.005366,10.553991,17.165661,6.580818,0.0,1.0,0.0


save the result

In [18]:
# Check the alignment up front: every processed row needs exactly one target and
# one player id, and failing here avoids leaving the database half-written.
n_games = len(post_df)
if not (n_games == len(targets) == len(player_ids)):
    raise ValueError(
        f'length mismatch: {n_games} processed games, '
        f'{len(targets)} targets, {len(player_ids)} player ids'
    )

# Stream the rows through tqdm; `total` gives the progress bar its length
# without first copying every row into a list.
# NOTE(review): itertuples() puts the row index first in each tuple (pandas
# default index=True) - confirm write_processed_game expects that.
for i, game in tqdm.tqdm(enumerate(post_df.itertuples()), total=n_games):
    db_writer.write_processed_game(game, targets[i], player_ids[i])

100%|██████████| 10/10 [00:01<00:00,  8.71it/s]
