In [None]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# that can point at real problems - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import *
from sklearn.metrics import *

In [None]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving all of
# it up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured identically on every visible GPU,
        # so apply it to all devices rather than only the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before the GPUs have been initialised;
        # report the problem and carry on with TensorFlow's default behaviour.
        print(e)

In [None]:
# Load the complete dataset; the first CSV column becomes the row index.
df = pd.read_csv('data_total.csv', index_col=0)

In [None]:
# Target column names: each metric name followed by an index 0-4
# ('speed0', 'stop0', 'timeloss0', 'travel0', 'wait0', 'speed1', ...).
ycols = [
    f'{metric}{idx}'
    for idx in range(5)
    for metric in ('speed', 'stop', 'timeloss', 'travel', 'wait')
]

# One pandas Index per metric keyword, holding every column of df whose name
# contains that keyword. The order here defines the order of the targets.
ycol = [
    df.columns[df.columns.str.contains(keyword)]
    for keyword in ('wait', 'timeloss', 'travel', 'speed', 'stop')
]

In [None]:
# 'p1' ... 'p4'
varcols = ['p' + str(k) for k in range(1, 5)]
# Every column of df whose name contains an underscore.
pcols = df.columns[df.columns.str.contains('_')]
# For each arm 1-4: the bare arm column followed by its 'l', 'r' and 's' columns
# ('1', '1l', '1r', '1s', '2', ...).
qcols = [
    f'{arm}{suffix}'
    for arm in range(1, 5)
    for suffix in ('', 'l', 'r', 's')
]

In [None]:
# Slice the three input feature groups out of the full frame.
var, p, q = df[varcols], df[pcols], df[qcols]
# One target Series per metric: the row-wise mean over that metric's columns.
y = [df[metric_cols].mean(axis=1) for metric_cols in ycol]

In [None]:
def to_perc(x):
    """Normalise every row of ``x`` so that its entries sum to 1.

    Parameters
    ----------
    x : pandas.DataFrame
        Numeric frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``x`` in which each value
        has been divided by the sum of its row. Rows whose sum is 0 are not
        special-cased, so they come out as NaN/inf.
    """
    # Vectorised row normalisation: divide along the index by the per-row sums
    # instead of invoking a Python lambda once per row.
    return x.div(x.sum(axis=1), axis=0)

def process_var(_var, max_len=200):
    """Convert the columns of ``_var`` into row shares plus a scaled row total.

    Parameters
    ----------
    _var : pandas.DataFrame
        Raw values; each row is normalised by its own sum via ``to_perc``.
    max_len : number, default 200
        Divisor applied to the row totals.

    Returns
    -------
    pandas.DataFrame
        The row-normalised columns of ``_var`` with an extra ``'total_len'``
        column holding ``row sum / max_len``.
    """
    shares = to_perc(_var)
    row_totals = _var.sum(axis=1)
    shares['total_len'] = row_totals / max_len
    return shares

def process_p(p, n_arms=4, max_v=1000):
    """Turn the per-arm columns of ``p`` into row shares plus a scaled arm total.

    For every arm ``i`` in ``1..n_arms`` the columns whose name starts with the
    character ``str(i)`` are treated as belonging to that arm. Their row sum,
    divided by ``max_v``, is stored in a new column ``'<i>_v'`` and the columns
    themselves are replaced by their share of that row sum.

    Parameters
    ----------
    p : pandas.DataFrame
        Raw values. Column names are presumably of the form ``'<arm>_<...>'``
        - confirm against the data schema. The frame is not modified.
    n_arms : int, default 4
        Number of arms. The arm is read from the first character of the column
        name only, so only single-digit arms can match.
    max_v : number, default 1000
        Divisor applied to each arm's row total.

    Returns
    -------
    pandas.DataFrame
        A processed copy of ``p`` with the added ``'<i>_v'`` columns appended.
    """
    # Work on a copy: the caller's frame (typically a slice of a larger frame)
    # must stay untouched so the function can be re-run on the same input.
    out = p.copy()
    for arm in range(1, n_arms + 1):
        # Match on the first character; names that do not start with this arm's
        # digit (including non-numeric names) are simply skipped.
        arm_cols = [c for c in p.columns if str(c)[:1] == str(arm)]
        arm_block = out[arm_cols]
        out[f'{arm}_v'] = arm_block.sum(axis=1) / max_v
        if arm_cols:
            out[arm_cols] = to_perc(arm_block)
    return out

def process_q(q_, n_arms=4, max_q=7):
    """Scale the per-arm columns of ``q_`` and append a one-hot encoding of the raw values.

    For every arm ``i`` in ``1..n_arms``:

    * each column whose name contains ``str(i)`` and is longer than one
      character (e.g. ``'1l'``, ``'1r'``, ``'1s'``) is divided by column
      ``str(i)``;
    * column ``str(i)`` itself is then divided by ``max_q``.

    Afterwards every raw column of ``q_`` is one-hot encoded over the values
    ``1..max_q`` and the encoded columns are appended.

    Parameters
    ----------
    q_ : pandas.DataFrame
        Raw integer-valued frame with string column names. It is not modified.
    n_arms : int, default 4
        Number of arms to scale.
    max_q : int, default 7
        Divisor for the arm columns and depth of the one-hot encoding
        (presumably the largest value a raw entry can take - confirm).

    Returns
    -------
    pandas.DataFrame
        The scaled columns followed by ``q_.shape[1] * max_q`` float32 one-hot
        columns labelled with integers starting at 0. Column ``j * max_q + k``
        is 1 when raw column ``j`` equals ``k + 1``; raw values outside
        ``1..max_q`` produce an all-zero group.
    """
    q = q_.copy()
    for arm in range(1, n_arms + 1):
        total_col = str(arm)
        part_cols = [c for c in q.columns if len(c) > 1 and total_col in c]
        if part_cols:
            # The parts must be scaled before the total itself is rescaled.
            q[part_cols] = q[part_cols].div(q[total_col], axis=0)
        q[total_col] = q[total_col] / max_q

    # One-hot encode the *raw* values: compare each entry with 1..max_q, which
    # leaves out-of-range entries as all-zero groups.
    raw = q_.values
    levels = np.arange(1, max_q + 1)
    one_hot = (raw[:, :, np.newaxis] == levels).astype(np.float32)
    one_hot = one_hot.reshape(raw.shape[0], raw.shape[1] * max_q)

    # Reuse the input index so the column-wise concat lines rows up by
    # position even when the frame does not carry a default 0..n-1 index.
    q_ohe = pd.DataFrame(one_hot, index=q.index)
    return pd.concat([q, q_ohe], axis=1)

def preprocess_all(p, q, var):
    """Run the three preprocessing steps and return their results together.

    Parameters
    ----------
    p, q, var : pandas.DataFrame
        Raw inputs for ``process_p``, ``process_q`` and ``process_var``
        respectively; each is processed with that function's defaults.

    Returns
    -------
    tuple
        ``(processed_p, processed_q, processed_var)``.
    """
    # Tuple elements are evaluated left to right: p, then q, then var.
    return process_p(p), process_q(q), process_var(var)
    
def postprocess_var(var, max_len=200):
    """Map a processed ``var`` row back to absolute integer values.

    Parameters
    ----------
    var : sequence of numbers
        Five entries: four shares followed by the scaled total (the layout
        produced by ``process_var`` for four input columns).
    max_len : number, default 200
        Scale that was used to normalise the total; it must match the
        ``max_len`` passed to ``process_var``.

    Returns
    -------
    list of int
        ``share * total`` for each of the first four entries, truncated with
        ``int()``.
    """
    total = var[4] * max_len
    # NOTE(review): int() truncates toward zero, so a product such as
    # 28.999999999999996 becomes 28. round() may be what is intended - confirm.
    return [int(share * total) for share in var[:4]]

In [None]:
%%time
# Run the whole preprocessing pipeline.
# NOTE: p, q and var are rebound to the processed frames here, so executing
# this cell a second time without re-running the cell that slices them from df
# feeds already-processed data back into the pipeline.
p, q, var = preprocess_all(p, q, var)

In [None]:
# Persist the processed frames (row index included) as CSV files.
# NOTE(review): the 'preprocessed/' directory is not created here, so it
# presumably has to exist before this cell runs - confirm.
p.to_csv('preprocessed/preprocessed_p.csv')
q.to_csv('preprocessed/preprocessed_q.csv')
var.to_csv('preprocessed/preprocessed_var.csv')

In [None]:
# Display the processed p frame.
p

In [None]:
# Display the processed q frame (scaled columns followed by the one-hot columns).
q

In [None]:
# Column labels of the processed p frame.
p.columns

In [None]:
# Column labels of the processed q frame.
q.columns

In [None]:
# Column labels of the processed var frame.
var.columns