# receive and transform data

the data should be received, then transformed, then saved

In [1]:
import sys
import yaml
import os
my_pwd = '/Users/joelewig'
sys.path.append(os.path.join(my_pwd, 'projects/driftlon/analysis'))
sys.path.append(os.path.join(my_pwd, 'projects/driftlon'))
sys.path.append(os.path.join(my_pwd, 'projects/driftlon/data_fetcher'))
from analysis_utils import DataFetcher, transform_data_to_np
from get_from_db import DBReader
from write_to_db import DBWriter
import numpy as np
import pandas as pd
import tqdm

load data

In [2]:
mongo_config = yaml.safe_load(open(os.path.join(my_pwd, 'projects/driftlon/config.yml'), 'r'))['mongodb']
db_reader = DBReader(mongo_config['address'], mongo_config['username'], mongo_config['password'])
db_writer = DBWriter(mongo_config['address'], mongo_config['username'], mongo_config['password'])
fetcher = DataFetcher(mongo_config['address'], mongo_config['username'], mongo_config['password'])

In [67]:
# raw_data, target = fetcher.get_raw_data_batch(10, offset=10000)
raw_filtered_data, targets, player_ids = fetcher.get_filtered_data_batch(1000, offset=4000)

### to pandas

In [68]:
data, feature_names = list(zip(*map(transform_data_to_np, raw_filtered_data)))

In [69]:
data_pd = pd.DataFrame(raw_filtered_data)
data_pd.head();

load config files, only meta_fields is used

In [70]:
non_numeric_fields_path = './non_numeric_fields.yaml'

with open(non_numeric_fields_path, 'r') as file_:
    non_numeric_fields = yaml.load(file_.read(),  Loader=yaml.BaseLoader)
    

In [71]:
meta_fields_path = './meta_fields.yaml'

with open(meta_fields_path, 'r') as file_:
    meta_fields = yaml.load(file_.read(),  Loader=yaml.BaseLoader)    

drop columns that should not be used

In [72]:
dropped_pd = data_pd.drop(meta_fields, axis=1)
dropped_pd.head()

Unnamed: 0,championId,stats_kills,stats_deaths,stats_assists,stats_largestKillingSpree,stats_largestMultiKill,stats_killingSprees,stats_longestTimeSpentLiving,stats_doubleKills,stats_tripleKills,...,timeline_damageTakenDiffPerMinDeltas_10-20,timeline_damageTakenDiffPerMinDeltas_0-10,timeline_damageTakenDiffPerMinDeltas_20-30,timeline_creepsPerMinDeltas_30-end,timeline_xpPerMinDeltas_30-end,timeline_goldPerMinDeltas_30-end,timeline_csDiffPerMinDeltas_30-end,timeline_xpDiffPerMinDeltas_30-end,timeline_damageTakenPerMinDeltas_30-end,timeline_damageTakenDiffPerMinDeltas_30-end
0,154,4,3,20,2,1,1,357,0,0,...,,,,,,,,,,
1,68,11,1,10,9,2,2,1224,2,0,...,-61.5,-70.5,-93.5,,,,,,,
2,68,13,4,7,7,2,4,282,1,0,...,,,,,,,,,,
3,64,8,1,7,4,2,2,348,1,0,...,,-128.12,,,,,,,,
4,131,4,3,19,2,1,1,519,0,0,...,-122.8,-323.0,,,,,,,,


build pipelines for categorical and numeric data

then use an imputer to add MISSING values

then add the players SHARE on kills/deaths/dmg etc

(then add the users dmg/kills/deaths etc PER MIN)

In [73]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from custom_transformer import TeamShareAdder

In [74]:
numeric_fields = [column_name for column_name in dropped_pd.columns if column_name not in non_numeric_fields]
cat_fields = [column_name for column_name in dropped_pd.columns if column_name in non_numeric_fields]

In [75]:
def aggregation_config_builder(stats):
    return [(f'stats_{stat}', f'team_{stat}') for stat in stats]

aggregation_config = aggregation_config_builder(['totalDamageDealtToChampions', 'kills', 'deaths', 'assists', 'totalDamageTaken', 'goldEarned'])

In [76]:
num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), 
                         ('team_share_adder', TeamShareAdder(numeric_fields, aggregation_config, mongo_config))])
# num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median'))])

In [77]:
cat_pipeline = Pipeline([('one_hot', OneHotEncoder())])

In [78]:
full_pipeline = ColumnTransformer([('num', num_pipeline, numeric_fields), ('cat', cat_pipeline, ['timeline_lane'])])

In [79]:
data_post_pipeline = full_pipeline.fit_transform(dropped_pd);

In [80]:
post_df = pd.DataFrame(data_post_pipeline)
post_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,4.0,3.0,20.0,2.0,1.0,1.0,357.0,0.0,0.0,0.0,...,0.057692,1.176471,272.513158,0.2845,0.171987,0.0,1.0,0.0,0.0,0.0
1,11.0,1.0,10.0,9.0,2.0,2.0,1224.0,2.0,0.0,0.0,...,0.032258,0.833333,500.86,0.169186,0.215937,0.0,1.0,0.0,0.0,0.0
2,13.0,4.0,7.0,7.0,2.0,4.0,282.0,1.0,0.0,0.0,...,0.137931,0.583333,710.852941,0.286796,0.242594,0.0,1.0,0.0,0.0,0.0
3,8.0,1.0,7.0,4.0,2.0,2.0,348.0,1.0,0.0,0.0,...,0.035714,0.777778,230.733333,0.271769,0.241316,0.0,0.0,0.0,1.0,0.0
4,4.0,3.0,19.0,2.0,1.0,1.0,519.0,0.0,0.0,0.0,...,0.09375,0.76,432.698113,0.192858,0.220034,0.0,1.0,0.0,0.0,0.0


save the result

In [81]:
lengths = []
for i, game in tqdm.tqdm(list(enumerate(post_df.itertuples()))):
    lengths.append(len(game))
print(set(lengths))


100%|██████████| 1000/1000 [00:00<00:00, 537593.44it/s]

{96}





In [82]:
for i, game in tqdm.tqdm(list(enumerate(post_df.itertuples()))):
    db_writer.write_processed_game(game, targets[i], player_ids[i])

100%|██████████| 1000/1000 [01:58<00:00,  8.47it/s]
