In [77]:
from helpers import *
import numpy as np 

In [78]:
# Read the raw training set from disk; load_csv comes from the helpers module.
trainset_path = 'dataset/trainset.csv'
dataset = load_csv(trainset_path)

### extract predictions, discard IDs and transform to ndarray

In [79]:
# Pull the target column out of the loaded dataset.
y = dataset["Prediction"]

# Show how many labels there are, then the labels themselves (one per line).
print(len(y), y, sep="\n")



225000
[0 0 0 ... 0 1 0]


In [80]:
# Turn the loaded records into a plain 2D ndarray, then drop the first two
# columns (id and prediction) so only the feature columns remain.
# Going through a Python list is not efficient, but it still runs in < 1s.
rows = dataset.tolist()
data = np.array(rows)[:, 2:]

# Inspect the resulting shape and the first sample.
print(data.shape)
print(data[0])

(225000, 30)
[-9.9900e+02  9.0100e+01  4.4395e+01  1.7740e+00 -9.9900e+02 -9.9900e+02
 -9.9900e+02  1.3770e+00  1.7740e+00  6.4763e+01  1.3700e+00 -1.3950e+00
 -9.9900e+02  2.7323e+01 -5.7400e-01  1.4570e+00  3.7440e+01 -1.5850e+00
  5.2100e-01  5.6246e+01 -2.2370e+00  9.3023e+01  0.0000e+00 -9.9900e+02
 -9.9900e+02 -9.9900e+02 -9.9900e+02 -9.9900e+02 -9.9900e+02  0.0000e+00]


### Compute the per-feature means of the valid entries and replace missing values with them

In [81]:
# Wrap the feature matrix in a masked array so that the -999.0 placeholders
# (missing values) are ignored by subsequent statistics.
# https://numpy.org/doc/stable/reference/maskedarray.generic.html#constructing-masked-arrays
mask = (data == -999.0)
masked_data = np.ma.array(data, mask=mask)
masked_data

masked_array(
  data=[[--, 90.1, 44.395, ..., --, --, 0.0],
        [28.478, 52.063, 21.773, ..., --, --, 50.076],
        [74.538, 119.834, 66.795, ..., -0.449, 0.57, 71.123],
        ...,
        [62.328, 42.916, 44.607, ..., --, --, 51.348],
        [133.204, 17.066, 54.913, ..., -0.128, -2.185, 205.491],
        [--, 72.435, 59.817, ..., --, --, 0.0]],
  mask=[[ True, False, False, ...,  True,  True, False],
        [False, False, False, ...,  True,  True, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ...,  True,  True, False],
        [False, False, False, ..., False, False, False],
        [ True, False, False, ...,  True,  True, False]],
  fill_value=1e+20)

In [82]:
# Column-wise mean over the unmasked (valid) entries only.
means = masked_data.mean(axis=0)
# Display the plain ndarray holding the means.
means.data

array([ 1.21787311e+02,  4.91733179e+01,  8.11552019e+01,  5.80110215e+01,
        2.40016552e+00,  3.71544546e+02, -8.12847295e-01,  2.37200843e+00,
        1.89338249e+01,  1.58561722e+02,  1.43750899e+00, -1.26627596e-01,
        4.58146191e-01,  3.87136342e+01, -1.09434622e-02, -8.72947111e-03,
        4.66526434e+01, -2.01354978e-02,  4.26294311e-02,  4.17174179e+01,
       -1.17773333e-02,  2.09980447e+02,  9.80426667e-01,  8.48992350e+01,
       -5.90535272e-03, -1.12831502e-02,  5.77031093e+01, -1.11501667e-02,
       -3.14898920e-03,  7.31954449e+01])

In [83]:
# Locate every -999.0 placeholder and overwrite it with the mean of the
# column it belongs to.
missing_rows, missing_cols = np.where(data==-999.0)
indices = (missing_rows, missing_cols)
data[indices] = means[missing_cols]
# Sanity check: the column means after imputation should match the means of
# the valid entries, so the printed differences should be ~0.
print(data.mean(axis=0) - means)

[2.007993771258043e-11 0.0 0.0 0.0 6.780354055990756e-12
 9.218865670845844e-10 2.6509905381999488e-12 0.0 0.0 0.0 0.0 0.0
 -7.494560527732119e-13 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
 -4.597211500367848e-11 4.826000710167477e-15 4.133846043252731e-15
 -1.3740475424128817e-10 -1.906461100098511e-14 4.4395910558936436e-15 0.0]


### Normalize feature data

In [84]:
# Standardize every feature column to zero mean and unit standard deviation.
# The statistics are kept in named variables so they can be inspected or
# reused later instead of being recomputed.
feature_means = np.mean(data, axis = 0)
feature_stds = np.std(data, axis=0)
# A constant column has a standard deviation of 0; dividing by it would fill
# the column with NaN/inf. Dividing by 1 instead leaves such a column at 0.
feature_stds = np.where(feature_stds == 0, 1.0, feature_stds)
data = (data - feature_means) / feature_stds
# Sanity check: means should be ~0 and standard deviations should be 1.
print(np.mean(data, axis = 0))
print(np.std(data, axis = 0))
print(data.shape)

[-3.71896381e-13  3.13801390e-14 -6.73309704e-15 -2.73349225e-14
 -7.22150970e-12 -4.29195303e-12 -1.36881379e-12  6.25675817e-14
  7.25006376e-15  3.19039906e-15  1.11030617e-14  3.06481951e-15
  3.49239454e-12 -3.19880961e-14  2.90912973e-17  3.03140229e-18
 -2.48220723e-14 -1.71221062e-18  5.33482394e-16  1.47402030e-14
 -1.37381464e-17 -1.96827473e-14 -2.76003418e-16  9.70570721e-13
 -2.20644585e-15 -1.83531436e-15  7.96206825e-12  1.74248055e-14
 -2.73853080e-15 -1.15462262e-14]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
(225000, 30)


### add bias column


In [85]:
# Build a column of ones and put it in front of the features so that the
# first column of the matrix acts as the bias (intercept) term.
bias_vec = np.ones((len(data), 1))
print(data.shape)
data = np.hstack((bias_vec, data))
# The column count should have grown by one; also show the first sample.
print(data.shape)
print(data[0])

(225000, 30)
(225000, 31)
[ 1.00000000e+00 -3.80824516e-13  1.15995991e+00 -8.99007960e-01
 -8.81846506e-01 -7.22053251e-12 -4.29574198e-12 -1.37006313e-12
 -1.26937800e+00 -7.68311386e-01 -8.09933495e-01 -7.98411975e-02
 -1.06261497e+00  3.48733557e-12 -5.08070908e-01 -4.63807470e-01
  8.07001953e-01 -4.17125756e-01 -1.23725599e+00  2.63333103e-01
  4.40676722e-01 -1.22818043e+00 -9.23426737e-01 -1.00237670e+00
  9.77307453e-13 -3.48923968e-15 -2.93908572e-15  7.96425490e-12
  1.74008116e-14 -4.53349879e-15 -7.46144441e-01]


### Save the feature matrix (tx) and the label vector (y)

In [87]:
# Persist the preprocessed feature matrix and the labels as .npy files.
for out_path, out_array in (
    ('dataset/preprocessed_tx.npy', data),
    ('dataset/y.npy', y),
):
    np.save(out_path, out_array)