# Utilities for data processing

## 1. Import data
We use pandas to read the .csv files and store the data in pandas DataFrames.

In [9]:
import pandas as pd
import numpy as np

In [10]:
def Import_Data(data_file, label_file):
    """Load the feature table and the label table from two CSV files.

    Both files are read with ``header=None``, so the first row of each file
    is treated as data and the columns are numbered 0..n-1.

    Args:
        data_file: CSV source for the features (anything ``pd.read_csv`` accepts).
        label_file: CSV source for the labels (anything ``pd.read_csv`` accepts).

    Returns:
        A ``(data, labels)`` tuple of pandas DataFrames.
    """
    # Features are read first, then labels, each without a header row.
    data, labels = (
        pd.read_csv(source, header=None) for source in (data_file, label_file)
    )
    # Report both shapes so a bad import is visible right away.
    print('Imported data ({}) and labels ({}).'.format(data.shape, labels.shape))
    return data, labels

## 2. Feature selection
The training data contains many poor features. For example, some columns have constant values and others are highly correlated. To get rid of the worst features, we can use the feature_selection module in Scikit-learn.

In [11]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

In [1]:
def Remove_Zero_Variance(data):
    """Drop the features that a default ``VarianceThreshold`` rejects.

    Args:
        data: Feature table to clean; it must expose a ``shape`` attribute.

    Returns:
        The result of ``VarianceThreshold().fit_transform(data)``.
    """
    # A default-constructed VarianceThreshold is fitted and applied in one step.
    clean_data = VarianceThreshold().fit_transform(data)
    print(
        'Zero variance features removed from data. '
        'Input shape: ({}). Output shape: ({})'.format(data.shape, clean_data.shape)
    )
    return clean_data

def Select_Features_From_Model(data, labels, model_name='LassoCV'):
    """Keep only the features chosen by a model-based selector.

    Args:
        data: Feature table to select from.
        labels: Target values used to fit the selector.
        model_name: Name of the estimator that drives the selection. Only
            ``'LassoCV'`` is currently supported.

    Returns:
        The output of ``SelectFromModel(LassoCV()).transform(data)`` after
        fitting on ``data`` and ``labels``.

    Raises:
        ValueError: If ``model_name`` is not a supported model. Previously an
            unknown name fell through and the function returned ``None``
            without any message, which only surfaced later as a confusing
            error in the caller.
    """
    if model_name != 'LassoCV':
        raise ValueError(
            "Unsupported model_name {!r}; only 'LassoCV' is available.".format(model_name)
        )

    model = SelectFromModel(LassoCV())
    model.fit(data, labels)
    selected_features = model.transform(data)
    debug_msg = str('Selected best features from input. Input shape: '+str(data.shape)+'. Output shape: '+str(selected_features.shape)+'.')
    print(debug_msg)
    return selected_features

## 3. Normalization
It is always good to normalize the data into some specific range. However, we need to be mindful of the methods of normalization and their effect on the distribution of data points in the given range. Below we have defined a function that utilizes multiple techniques for normalization.

Min-max normalization and Z-score normalization are very common ways of normalizing and standardizing data. They are, however, sensitive to outliers. Therefore, we also include tanh normalization, which is discussed in (Latha, Thangasamy, 2011) (https://research.ijcaonline.org/volume32/number10/pxc3875530.pdf).

In [13]:
def Normalize(data, type_string):
    """Normalize ``data`` with the requested technique.

    Supported values of ``type_string``:

    * ``'min-max'``: ``(x - min) / (max - min)``
    * ``'z-score'``: ``(x - mean) / std``
    * ``'tanh'``: ``0.5 * (tanh(0.01 * (x - mean) / std) + 1)``

    Any other value falls back to min-max scaling.

    The statistics come from ``data.min()``, ``data.max()``, ``data.mean()``
    and ``data.std()`` called without arguments. On a pandas DataFrame these
    are computed per column; on a plain numpy array they are computed over
    the whole array.

    Args:
        data: Numeric pandas object (or numpy array) to normalize.
        type_string: Name of the normalization method (a string).

    Returns:
        The normalized data, of the same kind as the input.
    """
    if type_string == 'z-score':
        # Fixed: ``std`` must be called; dividing by the bound method raised TypeError.
        z = (data - data.mean()) / data.std()
    elif type_string == 'tanh':
        # Fixed: both ``mean`` and ``std`` were referenced without being called.
        z = 0.5 * (np.tanh(0.01 * (data - data.mean()) / data.std()) + 1)
    else:
        # 'min-max' and the fallback for unrecognized names.
        # Fixed: subtract the minimum (not the mean) so this is true min-max scaling.
        lowest = data.min()
        z = (data - lowest) / (data.max() - lowest)
    debug_msg = str('Data normalized using ' + type_string + ' method.')
    print(debug_msg)
    return z

## 4. Splitting into train and test sets
We want to split the given training data into train and test sets, to be able to test performance locally, before submitting to Kaggle. For this we can use the model_selection library in Scikit-learn.

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
def Split_Data(data, labels, ratio=0.3, state=213):
    """Split the data and labels into train and test parts.

    Args:
        data: Feature table to split.
        labels: Labels matching ``data``.
        ratio: Passed to ``train_test_split`` as ``test_size`` (default 0.3).
        state: Passed to ``train_test_split`` as ``random_state`` (default 213).

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        data,
        labels,
        test_size=ratio,
        random_state=state,
    )
    return train_x, test_x, train_y, test_y