# receive and transform data

The data should be received, then transformed (and then saved locally).

In [1]:
import sys
import yaml
sys.path.append('/home/joel/projects/driftlon/analysis')
sys.path.append('/home/joel/projects/driftlon')
sys.path.append('/home/joel/projects/driftlon/data_fetcher')
from analysis_utils import DataFetcher, transform_data_to_np
from get_from_db import DBReader
import numpy as np
import pandas as pd

load data

In [2]:
# Read the MongoDB connection settings from the project config file.
# The file is opened in a `with` block so the handle is closed as soon as the
# YAML has been parsed, instead of being left open until garbage collection.
# NOTE(review): this absolute path only resolves on the original author's
# machine — consider making it configurable.
with open('/home/joel/projects/driftlon/config.yml', 'r') as config_file:
    mongo_config = yaml.safe_load(config_file)['mongodb']

# Both helpers are built from the same address and credentials.
db_reader = DBReader(mongo_config['address'], mongo_config['username'], mongo_config['password'])
fetcher = DataFetcher(mongo_config['address'], mongo_config['username'], mongo_config['password'])

In [3]:
# Request a raw batch from the fetcher and unpack it into records and targets.
# presumably 10 is the batch size and `offset` the starting position — verify
# against analysis_utils.DataFetcher.
# NOTE(review): the following cell rebinds both `raw_data` and `target` via
# get_filtered_data_batch before anything reads them, so this result looks
# unused — confirm before removing.
raw_data, target = fetcher.get_raw_data_batch(10, offset=10000)

In [4]:
# Request a filtered batch; this rebinds `raw_data` and `target`, and these are
# the values every later cell works with.
# presumably 100 is the batch size and `offset` the starting position — verify
# against analysis_utils.DataFetcher.
raw_data, target = fetcher.get_filtered_data_batch(100, offset=10000)

### to pandas

In [5]:
# Run transform_data_to_np on every record and transpose the resulting pairs
# into two parallel tuples: all first elements and all second elements.
# NOTE(review): neither name is read again in the visible part of the notebook.
data, features_of_each_game = zip(*(transform_data_to_np(record) for record in raw_data))

In [6]:
# Build the DataFrame directly from the fetched `raw_data` records (not from
# the `data` tuple produced in the previous cell) and preview the first rows.
data_pd = pd.DataFrame(raw_data)
data_pd.head()

Unnamed: 0,participantId,teamId,championId,spell1Id,spell2Id,stats_participantId,stats_win,stats_item0,stats_item1,stats_item2,...,timeline_damageTakenDiffPerMinDeltas_10-20,timeline_damageTakenDiffPerMinDeltas_0-10,timeline_damageTakenDiffPerMinDeltas_20-30,timeline_creepsPerMinDeltas_30-end,timeline_xpPerMinDeltas_30-end,timeline_goldPerMinDeltas_30-end,timeline_csDiffPerMinDeltas_30-end,timeline_xpDiffPerMinDeltas_30-end,timeline_damageTakenPerMinDeltas_30-end,timeline_damageTakenDiffPerMinDeltas_30-end
0,5,100,81,12,4,5,True,6632,3110,3042,...,,,,,,,,,,
1,7,200,104,11,4,7,True,2055,6676,3036,...,-769.8,-95.5,-1433.3,,,,,,,
2,6,200,76,11,4,6,False,3157,2055,4630,...,342.0,263.1,,,,,,,,
3,9,200,104,11,4,9,True,6673,2055,0,...,,-111.68,,,,,,,,
4,2,100,104,11,4,2,True,2055,0,0,...,,-207.22,,,,,,,,


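Load config files: `meta_fields` lists the columns to drop, and `non_numeric_fields` is used later to split the remaining columns into numeric and categorical ones.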

In [7]:
# Location of the YAML file, relative to the notebook's working directory.
non_numeric_fields_path = './non_numeric_fields.yaml'

# BaseLoader performs no type resolution, so every scalar is loaded as a string.
with open(non_numeric_fields_path, 'r') as fields_stream:
    non_numeric_fields = yaml.load(fields_stream, Loader=yaml.BaseLoader)
    

In [8]:
# Location of the YAML file, relative to the notebook's working directory.
meta_fields_path = './meta_fields.yaml'

# BaseLoader performs no type resolution, so every scalar is loaded as a string.
with open(meta_fields_path, 'r') as fields_stream:
    meta_fields = yaml.load(fields_stream, Loader=yaml.BaseLoader)

drop columns that should not be used

In [9]:
# Remove the columns named in meta_fields. `drop` returns a new frame, so
# data_pd itself is left unchanged.
dropped_pd = data_pd.drop(columns=meta_fields)

In [10]:
# Preview the first rows of the frame after the meta_fields columns were removed.
dropped_pd.head()

Unnamed: 0,championId,stats_kills,stats_deaths,stats_assists,stats_largestKillingSpree,stats_largestMultiKill,stats_killingSprees,stats_longestTimeSpentLiving,stats_doubleKills,stats_tripleKills,...,timeline_damageTakenDiffPerMinDeltas_10-20,timeline_damageTakenDiffPerMinDeltas_0-10,timeline_damageTakenDiffPerMinDeltas_20-30,timeline_creepsPerMinDeltas_30-end,timeline_xpPerMinDeltas_30-end,timeline_goldPerMinDeltas_30-end,timeline_csDiffPerMinDeltas_30-end,timeline_xpDiffPerMinDeltas_30-end,timeline_damageTakenPerMinDeltas_30-end,timeline_damageTakenDiffPerMinDeltas_30-end
0,81,7,5,12,5,2,2,899,1,0,...,,,,,,,,,,
1,104,14,7,10,6,2,3,311,1,0,...,-769.8,-95.5,-1433.3,,,,,,,
2,76,6,5,5,5,1,1,683,0,0,...,342.0,263.1,,,,,,,,
3,104,10,2,7,7,1,2,197,0,0,...,,-111.68,,,,,,,,
4,104,3,0,8,3,1,1,988,0,0,...,,-207.22,,,,,,,,


build pipelines for categorical and numeric data

then use an imputer to fill in missing values

then add the player's share of the team's kills/deaths/damage etc.

(then add the player's damage/kills/deaths etc. per minute)

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from custom_transformer import TeamShareAdder

In [12]:
# Split the remaining columns into two lists, preserving column order:
# names listed in non_numeric_fields are categorical, all others numeric.
# NOTE(review): `cat_fields` is not referenced by the ColumnTransformer cell
# further down, which hard-codes ['timeline_lane'] — confirm that is intended.
numeric_fields = list(filter(lambda name: name not in non_numeric_fields, dropped_pd.columns))
cat_fields = list(filter(lambda name: name in non_numeric_fields, dropped_pd.columns))

In [13]:
def aggregation_config_builder(stats):
    """Pair each stat name with its player-level and team-level column names.

    Args:
        stats: iterable of stat names, e.g. ``['kills', 'deaths']``.

    Returns:
        A list with one ``('stats_<name>', 'team_<name>')`` tuple per entry
        of ``stats``, in the same order.
    """
    column_pairs = []
    for stat in stats:
        player_column = f'stats_{stat}'
        team_column = f'team_{stat}'
        column_pairs.append((player_column, team_column))
    return column_pairs

# Stats for which a (player column, team column) pair is generated.
aggregation_config = aggregation_config_builder([
    'totalDamageDealtToChampions',
    'kills',
    'deaths',
    'assists',
    'totalDamageTaken',
    'goldEarned',
])

In [14]:
# Numeric branch: fill missing values with the per-column median, then run the
# project's TeamShareAdder (presumably appends the per-team share features —
# see custom_transformer).
# NOTE(review): SimpleImputer returns a plain array and, by default, discards
# columns that were entirely NaN during fit. TeamShareAdder is handed
# `numeric_fields`; if it uses that list to locate columns by position, an
# all-NaN column would shift the mapping — confirm.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('team_share_adder', TeamShareAdder(numeric_fields, aggregation_config, mongo_config)),
])
# Imputer-only variant, kept for reference:
# num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median'))])

In [15]:
# Categorical branch: a single one-hot encoding step with default settings.
cat_pipeline = Pipeline(steps=[
    ('one_hot', OneHotEncoder()),
])

In [16]:
# Route the numeric columns through num_pipeline and the 'timeline_lane'
# column through cat_pipeline; the two outputs are concatenated in that order.
# NOTE(review): the categorical column list is hard-coded here rather than
# taken from `cat_fields` — confirm that is intended.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_fields),
    ('cat', cat_pipeline, ['timeline_lane']),
])

In [17]:
# Fit every step of the combined pipeline on dropped_pd and transform it in
# the same call.
data_post_pipeline = full_pipeline.fit_transform(dropped_pd)

In [18]:
# Wrap the pipeline output in a DataFrame for display. No column names are
# passed, so the columns are labelled with their integer positions.
pd.DataFrame(data_post_pipeline)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,7.0,5.0,12.0,5.0,2.0,2.0,899.0,1.0,0.0,0.0,...,0.147059,0.300000,509.539683,0.140193,0.206181,1.0,0.0,0.0,0.0,0.0
1,14.0,7.0,10.0,6.0,2.0,3.0,311.0,1.0,0.0,0.0,...,0.212121,0.285714,449.867647,0.194096,0.264158,0.0,1.0,0.0,0.0,0.0
2,6.0,5.0,5.0,5.0,1.0,1.0,683.0,0.0,0.0,0.0,...,0.250000,0.172414,383.538462,0.233930,0.229351,0.0,1.0,0.0,0.0,0.0
3,10.0,2.0,7.0,7.0,1.0,2.0,197.0,0.0,0.0,0.0,...,0.071429,0.500000,442.166667,0.216201,0.227259,0.0,0.0,0.0,1.0,0.0
4,3.0,0.0,8.0,3.0,1.0,1.0,988.0,0.0,0.0,0.0,...,0.000000,2.666667,158.969697,0.190747,0.192948,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,11.0,7.0,6.0,4.0,3.0,4.0,333.0,1.0,1.0,0.0,...,0.225806,0.187500,481.021739,0.174642,0.258784,0.0,0.0,1.0,0.0,0.0
96,6.0,4.0,5.0,2.0,1.0,2.0,239.0,0.0,0.0,0.0,...,0.160000,0.625000,498.178571,0.216712,0.209303,0.0,0.0,0.0,1.0,0.0
97,6.0,4.0,8.0,5.0,2.0,1.0,608.0,1.0,0.0,0.0,...,0.153846,0.320000,403.381818,0.173610,0.215335,0.0,0.0,1.0,0.0,0.0
98,3.0,8.0,6.0,0.0,1.0,0.0,491.0,0.0,0.0,0.0,...,0.296296,0.162162,482.272727,0.231404,0.205752,0.0,0.0,0.0,0.0,1.0
