In [1]:
from implementations import  *
from helpers import *

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

## 1) Loading Data

In [2]:
# Load the dataset (full data, no sub-sampling) and check labels/rows agree.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(
    "data/dataset/", sub_sample=False
)
assert len(y_train) == len(x_train), "Number of labels and number of rows in the dataset should be equal"

# Read only the CSV header row to get the column names, dropping the leading
# Id column so the names match the number of features loaded.
column_names = np.genfromtxt(
    "data/dataset/x_train.csv", delimiter=",", dtype=str, max_rows=1
)[1:]
print(column_names[:10])

['_STATE' 'FMONTH' 'IDATE' 'IMONTH' 'IDAY' 'IYEAR' 'DISPCODE' 'SEQNO'
 '_PSU' 'CTELENUM']


## 2) Preprocessing 

In [3]:
# --- Handling NaN values: drop sparse columns --------------------------------
# A column is removed when more than one third of the training rows are NaN.
# NOTE: the "1/3 %" wording in the printed message means one third (~33%),
# not 0.33%.
max_nan_count = len(x_train) / 3
nan_counts = np.isnan(x_train).sum(axis=0)
columns_to_remove = np.flatnonzero(nan_counts > max_nan_count)

print(f"{len(columns_to_remove)} Columns have more than 1/3 % NaNs   ")

# The same column indices are dropped from the train matrix, the test matrix
# and the list of column names so that all three stay aligned.
data_cleaned = np.delete(x_train, columns_to_remove, axis=1)
x_test_cleaned = np.delete(x_test, columns_to_remove, axis=1)
columns_clean = np.delete(column_names, columns_to_remove)

# --- Handling NaN values: impute the remaining NaNs --------------------------
# NaNs are replaced by the column median: more robust to outliers and
# compatible with categorical features.
# All statistics are computed before imputation and ignore NaNs.
medians = np.nanmedian(data_cleaned, axis=0)
means = np.nanmean(data_cleaned, axis=0)  # we will use this later
stds = np.nanstd(data_cleaned, axis=0)  # we will use this later

# Locate every NaN once and overwrite it in place with its column's median.
nan_rows, nan_cols = np.nonzero(np.isnan(data_cleaned))
data_cleaned[nan_rows, nan_cols] = medians[nan_cols]
assert np.isnan(data_cleaned).sum() == 0, "There should be no NaNs in the dataset"

174 Columns have more than 1/3 % NaNs   


## 3) Feature Engineering

In [4]:
### Removing Highly Inter-correlated Features
# The helper returns the reduced matrix plus two indexers into the current
# columns_clean: the columns that are kept and the ones that are dropped.
# NOTE: data_cleaned and columns_clean are overwritten here, so re-running this
# cell operates on the already-reduced data.
data_cleaned, columns_to_keep, highly_correlated_cols = remove_highly_correlated_columns(
    data_cleaned, y_train, threshold=0.8
)

# Map the dropped columns back (by name) to their indices in the original
# column_names and merge them into the running collection of removed columns.
names_tobe_removed = columns_clean[highly_correlated_cols]
indices_to_remove = np.flatnonzero(np.isin(column_names, names_tobe_removed))
columns_to_remove = list(set(np.concatenate((columns_to_remove, indices_to_remove))))

columns_clean = columns_clean[columns_to_keep]


In [5]:
# Observe the correlation of features with the y label.
# np.corrcoef treats each row as one variable, so the feature matrix is
# transposed and y_train is appended as the last variable. The last row of the
# result (without its final, diagonal entry) is corr(feature, y) per feature.
corr = np.corrcoef(data_cleaned.T, y_train)
corr_to_y = corr[-1, :-1]

# Rank features by absolute correlation, strongest first.
# A zero-variance column produces a NaN correlation. argsort places NaNs last,
# so after the [::-1] reversal they would be ranked as the *most* correlated
# features. NaNs are therefore treated as 0 for ranking purposes only;
# corr_to_y itself is left untouched.
abs_corr_for_ranking = np.where(np.isnan(corr_to_y), 0.0, np.abs(corr_to_y))
sorted_indices = np.argsort(abs_corr_for_ranking)[::-1]
sorted_features = columns_clean[sorted_indices]
sorted_corr = corr_to_y[sorted_indices]

In [6]:
# Remove features with low correlation with the y label: only features whose
# absolute correlation is strictly greater than the threshold (0.05) are kept.
corr_threshold = 0.05
corr_mask = np.abs(corr_to_y) > corr_threshold

# Record the dropped features first, while columns_clean still lines up with
# corr_mask: translate their names to indices in the original column_names and
# merge them into the running collection of removed columns.
names_tobe_removed = columns_clean[~corr_mask]
indices_to_remove = np.flatnonzero(np.isin(column_names, names_tobe_removed))
columns_to_remove = list(set(np.concatenate((columns_to_remove, indices_to_remove))))

# Apply the same mask to the data and to the list of column names.
data_cleaned = data_cleaned[:, corr_mask]
columns_clean = columns_clean[corr_mask]

In [7]:
# Take the 40 features with the highest absolute correlation with the y label
# (sorted_features is already ordered that way) and split them into
# categorical and non-categorical features.
# A feature counts as categorical when its column in the raw x_train has at
# most 8 distinct values; any NaN present in the raw column is part of that
# count.

relevant_cat_features = []      # categorical features
relevant_non_cat_features = []  # non-categorical features

# Convert the name array to a list once instead of on every iteration.
all_column_names = column_names.tolist()

for feature in sorted_features[:40]:
    column_idx = all_column_names.index(feature)
    values = np.unique(x_train[:, column_idx])
    target = relevant_cat_features if len(values) <= 8 else relevant_non_cat_features
    target.append(feature)

In [8]:
# Take an independent copy of the cleaned matrix: data_cleaned is still needed
# afterwards (it is passed to concat_features in the following cell).
x_train1 = data_cleaned.copy()

In [9]:
# Expand the matrix by one-hot encoding the categorical features ...
x_train1 = one_hot_encoder(
    x_train1,
    columns_clean,
    relevant_cat_features,
)
# ... then concatenate the non-categorical features (taken from data_cleaned)
# with the categorical features.
x_train1 = concat_features(
    data_cleaned,
    x_train1,
    relevant_non_cat_features,
    columns_clean,
)

In [10]:
ratio = 0.8  # 80% of training data is used for training
seed = 1     # random seed
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(
    x_train1, y_train, test_size=1 - ratio, random_state=seed
)

# Resampling is applied to the training split only; x_test_split and
# y_test_split are left as returned by the split.
# SMOTE oversampling (other value noted by the author: 0.5)
x_train_split, y_train_split = SMOTE(
    x_train_split, y_train_split, k=5, ratio=0.6
)
# Undersampling (other value noted by the author: 0.85)
x_train_split, y_train_split = undersample_majority(
    x_train_split, y_train_split, ratio=0.6
)

# Standardize the training split; x_mean and x_std are kept because the same
# scaling is applied to the held-out and test data later on.
x_train_split, x_mean, x_std = standardize(x_train_split)

100%|██████████| 13881/13881 [02:09<00:00, 106.93it/s]


In [11]:
# Feature expansion.
# selected_interactions holds the (i, j) pairs chosen by feature_expansion;
# later cells iterate over them to build the same interaction terms for the
# held-out and test data.
x_train_split, selected_interactions = feature_expansion(
    x_train_split,
    y_train_split,
    desired_number_of_features=130,  # other value noted by the author: 180
)

## 4) Training ML models

In [12]:
# Setting up parameters

# Random seed for deterministic results
seed = 0
# Learning rate
gamma = 0.001
# Maximum number of iterations
max_iters = 20000
# Regularization parameter
_lambda = 0.01

# Prepping data for training
y, tx = build_model_data(y_train_split, x_train_split)

In [13]:
# Seed NumPy's global RNG so that the random weight initialisation below (and
# anything else drawing from np.random afterwards) is reproducible. `seed` is
# defined with the other training parameters but was never applied before.
np.random.seed(seed)

y[y == -1] = 0  # replace -1 with 0 (in place)

# Small random initial weights, one per column of tx.
w = np.random.normal(0, 0.1, tx.shape[1])

# Per-class weights: the relative frequency of each label (np.unique returns
# the counts in sorted-label order), with the second label's share doubled.
weights = np.unique(y, return_counts=True)[1] * [1, 2] / len(y)
w, loss = weighted_reg_logistic_regression(y, tx, _lambda, w, max_iters, gamma, weights)

# Alternative models (disabled):
#w,loss = reg_logistic_regression(y, tx, _lambda, w, max_iters, gamma)
#w,loss = ridge_regression(y, tx, _lambda)
#w,loss = least_squares(y, tx)
#w,loss = mean_squared_error_gd(y, tx, w, max_iters, gamma)

Current iteration=0, loss=0.8605029248246178 (with regularization)
Current iteration=100, loss=0.6083841700793691 (with regularization)
Current iteration=200, loss=0.5317004121196145 (with regularization)
Current iteration=300, loss=0.4934058114787415 (with regularization)
Current iteration=400, loss=0.4686650564892864 (with regularization)
Current iteration=500, loss=0.45063151803567686 (with regularization)
Current iteration=600, loss=0.43662251216024744 (with regularization)
Current iteration=700, loss=0.425314123465999 (with regularization)
Current iteration=800, loss=0.4159439810998501 (with regularization)
Current iteration=900, loss=0.40802657016091703 (with regularization)
Current iteration=1000, loss=0.40123191986114864 (with regularization)
Current iteration=1100, loss=0.3953258304770051 (with regularization)
Current iteration=1200, loss=0.3901368309763934 (with regularization)
Current iteration=1300, loss=0.3855362976905621 (with regularization)
Current iteration=1400, loss=

In [18]:
# Predictions on the training split (predict outputs values in -1,1).
# NOTE: predict is called with its default threshold here, whereas the
# submission cell passes threshold=0.62.
pred_train = predict(tx, w)

# Put the held-out split through the same steps as the training split:
# standardize with the training mean/std, append the selected interaction
# terms, then prepend the bias column.
tx_test = (x_test_split - x_mean) / x_std
selected_interaction_terms = np.column_stack(
    [tx_test[:, i] * tx_test[:, j] for i, j in selected_interactions]
)
tx_test = np.hstack((tx_test, selected_interaction_terms))
tx_test = np.hstack((np.ones((tx_test.shape[0], 1)), tx_test))

pred_test = predict(tx_test, w)

# Reconvert labels to -1,1 (in place) so they are comparable with the predictions.
for labels in (y_test_split, y_train_split):
    labels[labels == 0] = -1

# Print accuracy, recall, precision, and f1 score for train set
print("Train set:")
print_results(y_train_split, pred_train)

# Print accuracy, recall, precision, and f1 score for test set
print("\nTest set:")
print_results(y_test_split, pred_test)

Train set:
Accuracy:  0.7175685943436049
Recall:  0.6222444348389885
Precision:  0.8935485122395934
F1 score:  0.7336168044208748
Confusion Matrix: 
 [[19465  2744]
 [13983 23033]]

Test set:
Accuracy:  0.8492670587876756
Recall:  0.5967465753424658
Precision:  0.31618581019778624
F1 score:  0.41335547384651883
Confusion Matrix: 
 [[52249  7537]
 [ 2355  3485]]


## 5) Predicting labels for test data

In [17]:
# Preparing test data: work on a copy of x_test and clean it by removing the
# columns collected in columns_to_remove.
x_test1 = prepare_data(x_test.copy(), columns_to_remove)

# One-hot encode the categorical features, then concatenate the
# non-categorical features with the categorical features.
x_test_hot = one_hot_encoder(x_test1, columns_clean, relevant_cat_features)
x_test1 = concat_features(x_test1, x_test_hot, relevant_non_cat_features, columns_clean)

# Standardize the test data with the statistics of the training split.
x_test1 = (x_test1 - x_mean) / x_std

# Feature expansion on the test data: build the interaction terms selected
# during training and append them as extra columns.
selected_interaction_terms = np.column_stack(
    [x_test1[:, i] * x_test1[:, j] for i, j in selected_interactions]
)
x_test1 = np.hstack((x_test1, selected_interaction_terms))

# Add the bias term as the first column.
x_test1 = np.column_stack((np.ones(x_test1.shape[0]), x_test1))

# NOTE: the decision threshold is set explicitly to 0.62 for the submission.
pred_test = predict(x_test1, w, threshold=0.62)

create_csv_submission(test_ids, pred_test, "submission_ai_final.csv")

(109379, 321)
(109379, 236)
