# Preprocessing

One-hot encode data and save as numpy array for efficient future processing.

In [1]:
import pandas as pd
import numpy as np

In [2]:
def unnest(df, col):
    unnested = (df.apply(lambda x: pd.Series(x[col]), axis=1)
                .stack()
                .reset_index(level=1, drop=True))
    unnested.name = col
    return df.drop(col, axis=1).join(unnested)

def to_bag_of_cards(df):
    df['ind'] = np.arange(df.shape[0]) + 1
    df_orig = df.copy()
    df['deck'] = df['deck'].apply(lambda d: d.split(';'))
    df = unnest(df, 'deck')
    df['value'] = 1
    df_bag = df.pivot(index='ind', columns='deck', values='value')
    df_bag[df_bag.isna()] = 0
    df_bag = df_bag.astype('int')
    return pd.concat([df_orig.set_index('ind'), df_bag], axis=1)

In [3]:
%%time
# train and valid are pre-processed in the same way as in reference implementation of example submission notebook:
train = pd.read_csv('./data/trainingData.csv')
valid = pd.read_csv('./data/validationData.csv')
train = to_bag_of_cards(train)
train_X = train.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1)
tran_y = train['winRate']
valid = to_bag_of_cards(valid)
valid_X = valid.drop(['deck', 'nofGames', 'nOfPlayers', 'winRate'], axis=1)
valid_y = valid['winRate']

Wall time: 54.4 s


In [4]:
np.save(open('./data/train_X.npy', 'wb'), train_X)
np.save(open('./data/train_y.npy', 'wb'), tran_y)
np.save(open('./data/valid_X.npy', 'wb'), valid_X)
np.save(open('./data/valid_y.npy', 'wb'), valid_y)