<a href="https://colab.research.google.com/github/tavishcode/fyp/blob/master/clean_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from google.colab import drive
from datetime import datetime
# Mount Google Drive at /content/drive; later cells read and write files under 'drive/My Drive/'.
drive.mount('/content/drive')

In [0]:
# Load the raw table from Drive and replace every missing value with 0.
df = pd.read_csv('drive/My Drive/train_1.csv').fillna(0)

Processing Categorical Features

In [0]:
# Each 'Page' value ends in three '_'-separated tokens; split only once, from
# the right, and reuse the result for all three features. Everything left of
# those tokens is the page title, which may itself contain underscores.
page_parts = df['Page'].str.rsplit('_', n=3)
df['Agent'] = page_parts.str[-1].astype('category')
df['Access'] = page_parts.str[-2].astype('category')
df['Type'] = page_parts.str[-3].astype('category')
# The title (page_parts.str[0]) is deliberately not extracted: it is ignored
# for now because it is too large to encode.
df = df.drop(columns='Page')

In [0]:
# Sanity check: show the distinct values parsed into the 'Agent' column.
agent_values = df['Agent'].unique()
agent_values

In [0]:
# Work on a random 5% of the series. The seed is fixed so that a fresh
# "Restart & Run All" selects the same rows and reproduces the saved arrays.
sampled_df = df.sample(frac=0.05, random_state=42)

In [0]:
# One-hot encoding of the categorical features (Agent, Access, Type); Title is not encoded because it is too large

In [0]:
# Three back-to-back, equally long windows over the time axis:
# train, then validation, then test.
window = 100
train_start, train_end = 0, window
val_start, val_end = train_end, train_end + window
test_start, test_end = val_end, val_end + window

In [0]:
# Raw 2-D array view of the sample: visit columns first, the three
# categorical columns last.
dataset_arr = sampled_df.to_numpy()

In [0]:
# The last three columns are the categorical features; everything before
# them is the per-day visit counts.
numerical_dataset = dataset_arr[:, :-3]

# One-hot encode the three categorical columns. ColumnTransformer drops the
# columns it is not told to transform, so the result contains only the
# encoded categories (one row per series).
ct = ColumnTransformer(
    [('oh_enc', OneHotEncoder(sparse=False), [-1, -2, -3])]
)
categorical_dataset = ct.fit_transform(dataset_arr)

In [0]:
# Cut the visit-count matrix into the three consecutive time windows and
# convert each slice to floating point.
train, val, test = (
    numerical_dataset[:, start:end].astype('float')
    for start, end in (
        (train_start, train_end),
        (val_start, val_end),
        (test_start, test_end),
    )
)

In [0]:
# Names of the one-hot output columns produced by the fitted ColumnTransformer.
# NOTE(review): newer scikit-learn releases expose this as
# get_feature_names_out() instead - confirm against the installed version.
feature_names = ct.get_feature_names()
feature_names

['oh_enc__x0_commons.wikimedia.org',
 'oh_enc__x0_de.wikipedia.org',
 'oh_enc__x0_en.wikipedia.org',
 'oh_enc__x0_es.wikipedia.org',
 'oh_enc__x0_fr.wikipedia.org',
 'oh_enc__x0_ja.wikipedia.org',
 'oh_enc__x0_ru.wikipedia.org',
 'oh_enc__x0_www.mediawiki.org',
 'oh_enc__x0_zh.wikipedia.org',
 'oh_enc__x1_all-access',
 'oh_enc__x1_desktop',
 'oh_enc__x1_mobile-web',
 'oh_enc__x2_all-agents',
 'oh_enc__x2_spider']

Processing Date Features

In [0]:
# Every column except the last three (categorical) ones is named after a
# 'YYYY-MM-DD' date.
dates = sampled_df.columns.values[:-3]
parsed_dates = [datetime.strptime(date, '%Y-%m-%d') for date in dates]
# One row per date: [month, weekday, is_weekend]; weekday() is 0 for Monday,
# so 5 and 6 are Saturday and Sunday.
date_features = [
    [day.month, day.weekday(), int(day.weekday() >= 5)]
    for day in parsed_dates
]

In [0]:
# One Hot Encoding of date features

In [0]:
# Fit the encoder on the date features of every date column, so all observed
# month / weekday / weekend values get their own one-hot column.
date_enc = OneHotEncoder(sparse=False)
date_feature_matrix = np.asarray(date_features)
enc_date_features = date_enc.fit_transform(date_feature_matrix)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [0]:
# (number of dates, number of one-hot date columns)
np.shape(enc_date_features)

(550, 21)

Feature Normalization

In [0]:
# Learn the min-max scaling from the training window only, then apply that
# same scaling to all three splits.
# NOTE(review): MinMaxScaler scales each column independently and the columns
# here are timesteps, so every day is scaled across series rather than each
# series across time - confirm this is intended.
# NOTE(review): this cell overwrites train/val/test, so running it twice
# rescales already-scaled data.
scaler = MinMaxScaler().fit(train)
train, val, test = (scaler.transform(split) for split in (train, val, test))

Processing Statistical Features


In [0]:
# Per-series summary statistics over the training window (axis 1 = timesteps).
means, medians, stds = (
    statistic(train, axis=1) for statistic in (np.mean, np.median, np.std)
)

Constructing 3D datasets (series, timesteps, features)


In [0]:
def build_feature_set(visits, date_feats, cat_feats, series_stats):
    """Assemble one split into a (series, timesteps, features) array.

    Layout along the last axis: the visit value, then the one-hot date
    features of that timestep, then the one-hot categorical features of the
    series, then the statistical features of the series.

    Parameters
    ----------
    visits : array of shape (n_series, n_steps)
        Visit values of this split.
    date_feats : array of shape (n_steps, n_date)
        Date features for exactly the timesteps covered by `visits`.
    cat_feats : array of shape (n_series, n_cat)
        Categorical features, constant over time for each series.
    series_stats : array of shape (n_series, n_stats)
        Statistical features, constant over time for each series.

    Returns
    -------
    Float array of shape (n_series, n_steps, 1 + n_date + n_cat + n_stats).
    """
    n_series, n_steps = visits.shape
    n_date = date_feats.shape[1]
    n_cat = cat_feats.shape[1]
    n_stats = series_stats.shape[1]

    features = np.zeros((n_series, n_steps, 1 + n_date + n_cat + n_stats))
    features[:, :, 0] = visits
    # Same date features for every series: broadcast over the series axis.
    features[:, :, 1:1 + n_date] = date_feats[np.newaxis, :, :]
    # Per-series constants: broadcast over the timestep axis.
    features[:, :, 1 + n_date:1 + n_date + n_cat] = cat_feats[:, np.newaxis, :]
    features[:, :, 1 + n_date + n_cat:] = series_stats[:, np.newaxis, :]
    return features


# One row per series: [mean, median, std].
series_stats = np.column_stack((means, medians, stds))

# visit + date features + categorical features + statistical features.
# Derived from the arrays rather than hard-coded, so a sample that happens to
# miss a category does not break the construction.
num_features = (1 + enc_date_features.shape[1]
                + categorical_dataset.shape[1] + series_stats.shape[1])

# Each split gets the date features of its OWN time window.
train_set = build_feature_set(train, enc_date_features[train_start:train_end],
                              categorical_dataset, series_stats)
val_set = build_feature_set(val, enc_date_features[val_start:val_end],
                            categorical_dataset, series_stats)
test_set = build_feature_set(test, enc_date_features[test_start:test_end],
                             categorical_dataset, series_stats)

print(train_set.shape)
print(val_set.shape)
print(test_set.shape)

(7253, 100, 39)
(7253, 100, 39)
(7253, 100, 39)


In [0]:
# Persist the three feature arrays to Google Drive.
np.save('drive/My Drive/train_set2.npy', train_set)
np.save('drive/My Drive/val_set2.npy', val_set)
# test_set2.npy must hold the test split, not a second copy of the training set.
np.save('drive/My Drive/test_set2.npy', test_set)