# receive and transform data

The data is first fetched, then transformed, and finally saved.

In [1]:
import sys
import yaml
import os
# Root directory that contains the `projects/driftlon` checkout. It can be
# overridden with the DRIFTLON_HOME environment variable so the notebook also
# runs on other machines; the default keeps the original hard-coded location.
my_pwd = os.environ.get('DRIFTLON_HOME', '/Users/joelewig')

# Make the project's own modules importable. The directories are appended in
# the same order as before, so module lookup precedence is unchanged.
for _subdir in ('projects/driftlon/analysis',
                'projects/driftlon',
                'projects/driftlon/data_fetcher'):
    sys.path.append(os.path.join(my_pwd, _subdir))
from preprocessing_utils import DataFetcher, transform_data_to_np
from get_from_db import DBReader
from write_to_db import DBWriter
import numpy as np
import pandas as pd
import tqdm
from sklearn.preprocessing import StandardScaler

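Load the data: read the MongoDB settings from `config.yml`, create the database clients, and fetch a batch of games.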

In [2]:
# Read the MongoDB connection settings from the project's config file.
# The `with` block closes the file handle deterministically; the previous
# `yaml.safe_load(open(...))` form left it open until garbage collection.
config_path = os.path.join(my_pwd, 'projects/driftlon/config.yml')
with open(config_path, 'r') as config_file:
    mongo_config = yaml.safe_load(config_file)['mongodb']

# Reader, writer and fetcher are all created with the same address and credentials.
mongo_credentials = (mongo_config['address'], mongo_config['username'], mongo_config['password'])
db_reader = DBReader(*mongo_credentials)
db_writer = DBWriter(*mongo_credentials)
fetcher = DataFetcher(*mongo_credentials)

In [3]:
# Fetch one batch of filtered data together with its targets and player ids.
# NOTE(review): the arguments look like (batch size, start offset) — confirm
# against DataFetcher.get_filtered_data_batch.
# Unfiltered alternative kept for reference:
# raw_data, target = fetcher.get_raw_data_batch(10, offset=10000)
raw_filtered_data, targets, player_ids = fetcher.get_filtered_data_batch(
    10000,
    offset=0,
)

### Convert to pandas

In [None]:
# Run transform_data_to_np on every record and split the resulting pairs into
# two parallel tuples: one of the first elements, one of the second.
# NOTE(review): neither `data` nor `feature_names` is used in the cells
# visible below — confirm whether this step is still needed.
data, feature_names = zip(*(transform_data_to_np(record) for record in raw_filtered_data))

In [None]:
# The DataFrame is built from the raw filtered records directly, not from the
# `data` tuples produced by transform_data_to_np.
data_pd = pd.DataFrame(data=raw_filtered_data)
# The trailing semicolon suppresses the notebook's display of the preview.
data_pd.head();

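Load the config files: `meta_fields` lists the columns that are dropped, and `non_numeric_fields` is used to separate the numeric from the categorical columns.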

In [None]:
# Path is relative to the notebook's working directory.
non_numeric_fields_path = './non_numeric_fields.yaml'

# Parse the YAML stream directly; BaseLoader keeps every scalar as a plain
# string. The result is used below to split columns into numeric and
# categorical groups.
with open(non_numeric_fields_path, mode='r') as fields_file:
    non_numeric_fields = yaml.load(fields_file, Loader=yaml.BaseLoader)
    

In [None]:
# Path is relative to the notebook's working directory.
meta_fields_path = './meta_fields.yaml'

# Parse the YAML stream directly; BaseLoader keeps every scalar as a plain
# string. The result is passed to DataFrame.drop in the next step.
with open(meta_fields_path, mode='r') as fields_file:
    meta_fields = yaml.load(fields_file, Loader=yaml.BaseLoader)

### Drop unused columns

In [None]:
# Remove the columns listed in meta_fields; `data_pd` itself is left untouched.
dropped_pd = data_pd.drop(columns=meta_fields)
dropped_pd.head()

Build pipelines for categorical and numeric data.

Then use an imputer to fill in MISSING values.

Then add the player's SHARE of the team's kills/deaths/damage etc.

(Then add the player's damage/kills/deaths etc. PER MIN.)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from custom_transformer import TeamShareAdder

In [None]:
# Split the remaining columns by membership in non_numeric_fields.
# Column order within each group follows dropped_pd.columns.
cat_fields = list(filter(lambda name: name in non_numeric_fields, dropped_pd.columns))
numeric_fields = list(filter(lambda name: name not in non_numeric_fields, dropped_pd.columns))

In [None]:
def aggregation_config_builder(stats):
    """Pair each stat name with its 'stats_'- and 'team_'-prefixed column names.

    Args:
        stats: iterable of stat names (e.g. 'kills').

    Returns:
        A list of ``('stats_<stat>', 'team_<stat>')`` tuples, in input order.
    """
    pairs = []
    for stat in stats:
        player_column = f'stats_{stat}'
        team_column = f'team_{stat}'
        pairs.append((player_column, team_column))
    return pairs


# Stats that are handed to TeamShareAdder as (player column, team column) pairs.
aggregation_config = aggregation_config_builder([
    'totalDamageDealtToChampions',
    'kills',
    'deaths',
    'assists',
    'totalDamageTaken',
    'goldEarned',
])

In [None]:
# Numeric branch: median imputation, then TeamShareAdder, then standard scaling.
# NOTE(review): TeamShareAdder is given the numeric column names, the
# (player column, team column) pairs and the MongoDB settings; presumably it
# adds each player's share of the team totals — confirm in custom_transformer.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('team_share_adder', TeamShareAdder(numeric_fields, aggregation_config, mongo_config)),
    ('standard scaler', StandardScaler()),
])
# Imputer-only variant kept for reference:
# num_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median'))])

In [None]:
# Categorical branch: a single one-hot encoding step with default settings.
cat_pipeline = Pipeline(steps=[
    ('one_hot', OneHotEncoder()),
])

In [None]:
# Route the numeric columns through num_pipeline and 'timeline_lane' through
# cat_pipeline.
# NOTE(review): only 'timeline_lane' is encoded here; `cat_fields` computed
# earlier is not used. Columns not listed are presumably dropped by
# ColumnTransformer's default `remainder` setting — confirm this is intended.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_fields),
    ('cat', cat_pipeline, ['timeline_lane']),
])

In [None]:
# Fit every transformer on the batch and transform it in one step.
# (An assignment displays nothing, so no trailing semicolon is needed.)
data_post_pipeline = full_pipeline.fit_transform(dropped_pd)

In [None]:
# Wrap the transformed output in a DataFrame for inspection and saving.
# No column names are passed, so the columns get default integer labels.
# NOTE(review): if fit_transform returns a sparse matrix, it may need to be
# densified before being wrapped — confirm.
post_df = pd.DataFrame(data=data_post_pipeline)
post_df.head()

### save the result

In [None]:
# Sanity check: collect the number of fields of every row tuple and print the
# distinct values; a single value means all rows have the same width.
# itertuples() includes the index as the first field by default, so each
# length is the number of columns plus one.
# Rows are streamed instead of being materialised with list(...); `total`
# keeps the progress bar's length known. The unused enumerate index is dropped.
lengths = []
for game in tqdm.tqdm(post_df.itertuples(), total=len(post_df)):
    lengths.append(len(game))
print(set(lengths))


In [None]:
# Persist every processed game together with its target and player id.
# Fail fast if either sequence is shorter than the frame: the original loop
# would only raise an IndexError after part of the batch had been written.
if len(targets) < len(post_df) or len(player_ids) < len(post_df):
    raise ValueError(
        f'expected at least {len(post_df)} targets and player ids, '
        f'got {len(targets)} targets and {len(player_ids)} player ids'
    )

# NOTE(review): itertuples() yields the DataFrame index as the first field of
# each row tuple by default — confirm write_processed_game expects that.
# Rows are streamed instead of being materialised with list(...); `total`
# keeps the progress bar's length known.
for i, game in enumerate(tqdm.tqdm(post_df.itertuples(), total=len(post_df))):
    db_writer.write_processed_game(game, targets[i], player_ids[i])