In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
from helpers import *
from Model import *

In [2]:
# Load the pre-pickled train/test arrays. The context manager closes the file
# handle, which the bare open() call used to leak.
# NOTE: pickle can execute arbitrary code while loading -- only open "dat.p"
# when it comes from a trusted source.
with open("dat.p", "rb") as data_file:
    [yb_t, input_data_t, ids_t, yb_test, input_data_test, ids_test] = pickle.load(data_file)

# clean_data is a project helper brought in by the star imports above; the
# NaN-based pattern extraction further down suggests it marks missing entries
# as NaN -- TODO confirm against the helper's source.
tx_tr_clean = clean_data(input_data_t)
tx_te_clean = clean_data(input_data_test)

print('Train set shape ', tx_tr_clean.shape)
print('Test set shape ', tx_te_clean.shape)

Train set shape  (250000, 30)
Test set shape  (568238, 30)


In [3]:
# Glue the targets (and, for the test set, the ids) onto their rows as trailing
# columns so they travel with the rows through the grouping done below.
# Assuming yb_t, yb_test and ids_test are 1-D, the layouts become
#   train: [features..., y]      test: [features..., y, id]
# NOTE(review): both names are rebound in place, so running this cell twice
# appends the extra columns twice.
tx_te_clean = np.c_[tx_te_clean, yb_test, ids_test]
tx_tr_clean = np.c_[tx_tr_clean, yb_t]

Extract the distinct patterns of missing data

In [None]:
def extract_patern(tx):
    """Return the NaN mask of ``tx`` and its distinct per-row missing-value patterns.

    Parameters
    ----------
    tx : np.ndarray
        2-D numeric array (rows are samples, columns are variables).

    Returns
    -------
    nan_mask : np.ndarray of bool
        Same shape as ``tx``; True where the entry is NaN.
    type_of_missing : np.ndarray of bool
        One row per distinct row of ``nan_mask`` (shape
        ``(n_patterns, n_columns)``), sorted lexicographically with
        False < True.
    """
    nan_mask = np.isnan(tx)
    # np.unique(axis=0) deduplicates whole rows in compiled code and returns
    # them in a fixed sorted order. The previous implementation went through a
    # set of strings, whose iteration order changes between interpreter
    # sessions, so pattern indices were not reproducible.
    type_of_missing = np.unique(nan_mask, axis=0)
    return nan_mask, type_of_missing

In [None]:
%%time
# Compute the NaN mask of the training matrix and the distinct per-row
# missing-value patterns, then show the patterns (one boolean row each).
nan_mask_tr, type_of_missing = extract_patern(tx_tr_clean)
print(type_of_missing)

Group the rows by their missing-data pattern

In [None]:

def sort_by_pattern(tx, nan_mask, type_of_missing):
    """Group the rows of ``tx`` by missing-value pattern.

    Parameters
    ----------
    tx : np.ndarray
        2-D data matrix.
    nan_mask : np.ndarray of bool
        Same shape as ``tx``; the per-entry missing mask of ``tx``.
    type_of_missing : np.ndarray of bool
        One pattern per row, with as many columns as ``nan_mask``.

    Returns
    -------
    list of list of np.ndarray
        ``result[k]`` holds, in their original order, the rows of ``tx`` whose
        mask equals ``type_of_missing[k]``. A row goes to the first pattern it
        matches; rows matching no pattern are left out.
    """
    patterns = np.asarray(type_of_missing)
    # Patterns of a different width can never equal a mask row; keep the
    # historical outcome (every group empty) instead of failing in broadcasting.
    if patterns.ndim != 2 or nan_mask.ndim != 2 or patterns.shape[1] != nan_mask.shape[1]:
        return [[] for _ in range(len(patterns))]

    # One vectorised comparison per pattern replaces the per-row Python loop
    # and the `lst = lst + [row]` growth, which recopied the list for every row.
    unassigned = np.ones(nan_mask.shape[0], dtype=bool)
    tx_sorted = []
    for pattern in patterns:
        hits = unassigned & (nan_mask == pattern).all(axis=1)
        tx_sorted.append(list(tx[hits]))
        # Rows already claimed must not be handed to a later duplicate pattern.
        unassigned &= ~hits
    return tx_sorted
        

In [None]:
%%time
# Bucket the training rows by missing-value pattern: tx_tr_sorted[k] holds the
# rows whose NaN mask equals type_of_missing[k].
tx_tr_sorted = sort_by_pattern(tx_tr_clean, nan_mask_tr, type_of_missing)

In [4]:

# Reload a cached (tx_tr_sorted, type_of_missing) pair; this replaces whatever
# the cells above computed for those two names.
# NOTE(review): no cell in this notebook writes tx_sorted.dat, so a fresh
# top-to-bottom run needs the file to exist already.
# NOTE: pickle can execute arbitrary code while loading -- trusted files only.
with open('tx_sorted.dat', 'rb') as cache_file:
    tx_tr_sorted, type_of_missing = pickle.load(cache_file)

Concatenate data that can be trained together

In [5]:
# For each pattern, list the indices of every pattern that has no missing
# value in any column this pattern has present, i.e. whose missing columns are
# a subset of this pattern's. Rows of those patterns can be pooled with it.
valid_data = [
    [idx for idx, candidate in enumerate(type_of_missing) if not candidate[~pattern].any()]
    for pattern in type_of_missing
]

print(valid_data)

[[0], [0, 1], [0, 2], [0, 2, 3], [0, 1, 2, 3, 4, 5], [0, 1, 2, 5]]


In [6]:
# One training matrix per pattern: stack the rows of every compatible pattern
# (in the order listed in valid_data) into a single array.
tx_tr_sorted_grouped = [
    np.array([row for idx in compatible for row in tx_tr_sorted[idx]])
    for compatible in valid_data
]

Training the models

In [7]:
def build_poly(x, degree):
    """Stack the powers x**1, x**2, ..., x**degree column-wise.

    The constant (degree-0) column is built internally but stripped from the
    result, so ``degree`` columns are returned per input column; the output is
    always float.
    """
    # The float column of ones is kept as the first block so the result is
    # upcast to float and degree == 0 still yields an (n, 0) array; it is
    # sliced away on return.
    bias = np.ones((len(x), 1))
    powers = [np.power(x, exponent) for exponent in range(1, degree + 1)]
    return np.column_stack([bias] + powers)[:, 1:]

def augment_feat(input_tx_train, input_tx_test, degree):
    """Append polynomial expansions of the features to the train and test matrices.

    For every column index from 1 up to the number of training columns, the
    powers 1..degree of that column (as produced by ``build_poly``) are
    appended to the right of the original matrix. The same column indices are
    expanded in the test matrix.

    Parameters
    ----------
    input_tx_train, input_tx_test : np.ndarray
        2-D matrices with the same column layout.
    degree : int
        Highest power passed to ``build_poly``.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The augmented train and test matrices.

    Notes
    -----
    NOTE(review): column 0 is never expanded (the loop starts at 1) -- fine if
    it is a bias column, an off-by-one otherwise; confirm with the callers.
    ``build_poly`` includes the power 1, so every expanded column also appears
    a second time unchanged.
    """
    # Read the width from the shape so a matrix with zero rows is accepted
    # (indexing row 0 raised IndexError on empty input).
    n_features = np.shape(input_tx_train)[1]
    train_blocks = [input_tx_train]
    test_blocks = [input_tx_test]
    for ind in range(1, n_features):
        train_blocks.append(build_poly(input_tx_train[:, ind], degree))
        test_blocks.append(build_poly(input_tx_test[:, ind], degree))
    # A single stack at the end allocates the result once; concatenating inside
    # the loop recopied the whole, growing matrix for every feature.
    return np.column_stack(train_blocks), np.column_stack(test_blocks)

def augment_feat_no_test(input_tx_train, degree):
    """Return ``[1 | X | polynomial expansions]`` for a single matrix.

    A leading column of ones is prepended, the original columns follow, and
    then, for every column index from 1 upward, the powers 1..degree of that
    column (as produced by ``build_poly``) are appended.

    Parameters
    ----------
    input_tx_train : np.ndarray
        2-D feature matrix.
    degree : int
        Highest power passed to ``build_poly``.

    Returns
    -------
    np.ndarray
        Float matrix with ``1 + n_features + (n_features - 1) * degree`` columns.

    Notes
    -----
    NOTE(review): column 0 is never expanded (the loop starts at 1) even though
    the bias column is only added here -- possibly an off-by-one; confirm.
    ``build_poly`` includes the power 1, so every expanded column also appears
    a second time unchanged.
    """
    # Read the width from the shape so a matrix with zero rows is accepted
    # (indexing row 0 raised IndexError on empty input).
    n_rows, n_features = np.shape(input_tx_train)[:2]
    blocks = [np.ones((n_rows, 1)), input_tx_train]
    for ind in range(1, n_features):
        blocks.append(build_poly(input_tx_train[:, ind], degree))
    # A single stack allocates the result once instead of recopying the growing
    # matrix for every feature, which keeps peak memory down on large groups.
    return np.column_stack(blocks)



In [8]:
from Model import logistic_regression_gradient

In [9]:
%%time
models = []
std_values = []
for tx in tx_tr_sorted_grouped:
    y = tx[:,-1]
    tx = tx[:,0:-1]
    tx = augment_feat_no_test(tx, 2)
    tx, mean_x, std_x = standardize(tx)
    model = logistic_regression_gradient()
    model.fit(y, tx)
    models.append(model)
    std_values.append([mean_x, std_x])



MemoryError: 

In [None]:
# Debug aid: shape of each training pattern group with its last column dropped.
# NOTE(review): this needs every element of tx_tr_sorted to be a 2-D array;
# sort_by_pattern returns plain lists -- confirm what tx_sorted.dat contains.
print([e[:,0:-1].shape for e in tx_tr_sorted])

# NOTE(review): tx_te_sorted is only created in the "Prediction phase" cell
# below, so this line fails on a fresh top-to-bottom run. It also drops three
# trailing columns although only y and id were appended to the test matrix.
print([e[:,0:-3].shape for e in tx_te_sorted])

# Spot-check a single test row.
print(tx_te_clean[1014])

Prediction phase

In [None]:
# Same grouping as for the training set, applied to the test matrix:
# tx_te_sorted[k] is a 2-D array with the test rows matching type_of_missing_te[k].
nan_mask_te, type_of_missing_te = extract_patern(tx_te_clean)
tx_te_sorted = list(map(np.array, sort_by_pattern(tx_te_clean, nan_mask_te, type_of_missing_te)))

In [None]:
# Predict every test group with the model trained for the same missing-data
# pattern. Fixes over the previous version of this cell:
#   * it iterated the *training* groups instead of the test groups;
#   * `ids.append(ids)` raised AttributeError (the target list is ids_all);
#   * it sliced three trailing columns although only y and id were appended;
#   * models were paired with patterns by position, but the order of the test
#     patterns is not guaranteed to match the order of the training patterns.
# NOTE(review): as in training, NaN columns are not removed here -- confirm
# that the model's predict copes with them.
y_preds = []
ids_all = []
# The models were fitted on every column but the last (the label) of the
# training groups, so that is how many feature columns a test row must supply.
n_features = tx_tr_sorted_grouped[0].shape[1] - 1
train_feature_patterns = np.asarray(type_of_missing)[:, :n_features]
for pattern_te, tx in zip(type_of_missing_te, tx_te_sorted):
    # Look the model up by pattern content rather than by list position.
    matches = np.flatnonzero((train_feature_patterns == pattern_te[:n_features]).all(axis=1))
    if matches.size == 0:
        raise ValueError('no model was trained for test missing-data pattern %s'
                         % (pattern_te[:n_features],))
    k = matches[0]
    mean_x, std_x = std_values[k]
    ids = tx[:, -1]  # the id is the last column of a test row
    tx = augment_feat_no_test(tx[:, :n_features], 2)
    tx = (tx - mean_x) / std_x
    y_preds.append(models[k].predict(tx))
    ids_all.append(ids)

In [None]:
print(len(y_preds))
# Join the per-pattern results into flat vectors. np.concatenate accepts groups
# of different lengths. The previous reduce(np.vstack, ...) called vstack with
# two positional arguments, relied on a `reduce` name that is not a Python 3
# builtin, and read `ids` (one group's ids) instead of the ids_all accumulator.
y_preds_combined = np.concatenate([np.ravel(pred) for pred in y_preds])
ids_te_combined = np.concatenate([np.ravel(group_ids) for group_ids in ids_all])
print(type(y_preds_combined), len(y_preds_combined))

In [None]:
# Map each prediction to a class label: -1 when strictly negative, +1 otherwise.
# NOTE(review): a cut at 0 presumes signed scores; if the model's predict
# returns probabilities the threshold should be 0.5 -- confirm.
y_pred_bin = np.where(np.ravel(y_preds_combined) < 0, -1, 1)

In [None]:
# Write the (id, prediction) pairs to the submission file via the project helper.
# NOTE(review): the ids travelled through a float matrix, so they are floats
# here -- confirm that create_csv_submission converts them back to integers.
create_csv_submission(ids_te_combined, y_pred_bin, 'submultiridge.csv')