# Loading Data

In [8]:
import pandas as pd
# Load the merged raw datasets and the two pre-computed imputed feature tables.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
# NOTE(review): later cells slice X_imputed_v2 with .loc[:76] / .loc[77:], so it
# presumably holds the train rows first and the test rows after - confirm.
df_train = pd.read_pickle('train_data_merged.pkl') # merged raw train dataset (77 entries)
df_test = pd.read_pickle('test_data_merged.pkl') # merged raw test dataset (60 entries)
X_imputed_v1 = pd.read_pickle('imputation_data_v1.pkl') # Imputation with train dataset alone 
X_imputed_v2 = pd.read_pickle('imputation_data_v2.pkl') # Imputation with train and test dataset together 

# Imputation Selection

Under leave-one-out validation, Lasso fitted on the data imputed from the train dataset alone (v1) has a higher error (reported as "mse" below, about 2.52) than Lasso fitted on the data imputed from the train and test datasets together (v2, about 2.28).

Thus, imputation with all the data (train and test together) is the better choice.

In [50]:
def reverse(y, mean, std):
    """Undo a z-score standardisation.

    Maps a standardised value (or array of values) ``y`` back to the
    original scale given the ``mean`` and ``std`` that were used to
    standardise it.
    """
    rescaled = y * std
    return rescaled + mean

def mean_square_error(truth, y_pred):
    """Return the square root of the mean squared difference.

    The squared differences between ``truth`` and ``y_pred`` are summed,
    divided by ``len(truth)`` and square-rooted. Despite the function's
    name, the returned value is therefore a root-mean-square error.
    """
    residuals = truth - y_pred
    squared_total = np.sum(residuals ** 2)
    return np.sqrt(squared_total / len(truth))

from sklearn.model_selection import LeaveOneOut
from sklearn import preprocessing
import numpy as np
from sklearn import linear_model

def validate(X, y, method):
    """Estimate the leave-one-out error of a Lasso model (alpha=0.1).

    For every leave-one-out split the features are standardised with a
    scaler fitted on the training rows only, the target is transformed
    according to ``method``, a Lasso is fitted, and the prediction for the
    held-out row is mapped back to the original target scale before being
    scored with ``mean_square_error``.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature table, one row per sample.
    y : pandas.Series or array-like
        Target values. Rows of ``X`` and ``y`` are paired by position.
    method : str
        Target transformation used for fitting:
        ``'log'``    - fit on ``log(y)``, predictions are exponentiated;
        ``'normal'`` - fit on the z-scored target (mean/std of the training
                       fold), predictions are mapped back with ``reverse``;
        ``'None'``   - (the string) fit on the raw target.

    Returns
    -------
    float
        The per-split ``mean_square_error`` averaged over all splits.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strings.
    """
    # Fail early with a clear message; otherwise an unknown method would
    # leave y_train unbound and surface as an obscure UnboundLocalError.
    if method not in ('log', 'normal', 'None'):
        raise ValueError(
            "method must be 'log', 'normal' or 'None', got %r" % (method,)
        )

    # LeaveOneOut yields *positional* indices, so select rows by position on
    # plain arrays instead of label-based .loc (which only works when the
    # index happens to be 0..n-1).
    X_all = np.asarray(X)
    y_all = np.asarray(y)
    n_tryout = len(X_all)

    s = 0
    for train, test in LeaveOneOut().split(X_all):
        X_train = X_all[train]
        X_test = X_all[test]
        y_test = y_all[test]

        if method == 'log':
            y_train = np.log(y_all[train])
        elif method == 'normal':
            # Statistics come from the training fold only, so the held-out
            # target does not leak into the transformation.
            mean = np.mean(y_all[train])
            std = np.std(y_all[train])
            y_train = (y_all[train] - mean) / std
        else:  # method == 'None'
            y_train = y_all[train]

        scaler = preprocessing.StandardScaler().fit(X_train)  # normalization
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        clf = linear_model.Lasso(alpha=0.1)  # fit the lasso
        clf.fit(X_train, y_train)

        # Bring the prediction back to the original target scale before scoring.
        y_pred = clf.predict(X_test)
        if method == 'log':
            y_pred = np.exp(y_pred)
        elif method == 'normal':
            y_pred = reverse(y_pred, mean, std)
        s += mean_square_error(y_test, y_pred)

    return s / n_tryout


# Leave-one-out error of Lasso on the v1 imputation
score_v1 = validate(X_imputed_v1, df_train['severity_score'], method='normal')
print("The mse of Lasso with v1 imputation is %s" % score_v1)

# Leave-one-out error of Lasso on the v2 imputation (rows labelled up to 76 only)
score_v2 = validate(X_imputed_v2.loc[:76], df_train['severity_score'], method='normal')
print("The mse of Lasso with v2 imputation is %s" % score_v2)


The mse of Lasso with v1 imputation is 2.523213165799959
The mse of Lasso with v2 imputation is 2.2834710485317555


# Regression with 2 models

In [41]:
# Put the target next to the v2-imputed features (rows labelled up to 76) and
# flag the samples whose severity score is above 2.
df_cla = pd.concat([df_train[['severity_score']], X_imputed_v2.loc[:76]], axis=1)
df_cla['label'] = df_cla['severity_score'].gt(2).astype(int)

# One sub-table per label; the index is reset to 0..n-1 for each of them.
df1 = df_cla[df_cla['label'].eq(1)].reset_index(drop=True)
df2 = df_cla[df_cla['label'].eq(0)].reset_index(drop=True)

# Feature columns are everything between the first column (severity_score)
# and the last one (label).
score1 = validate(df1[df1.columns[1:-1]], df1['severity_score'], method='log')
score2 = validate(df2[df2.columns[1:-1]], df2['severity_score'], method='log')

# Combine the two group scores, weighting each by its group size.
n_high, n_low = len(df1), len(df2)
score = (n_high * score1 + n_low * score2) / len(df_cla)

print("The mse of TWO Lasso with v2 imputation is %s" % score)
print("Breakdown")
print("For ss > 2: %s" % score1)
print("For ss <= 2: %s" % score2)


The mse of TWO Lasso with v2 imputation is 1.2970142379825276
Breakdown
For ss > 2: 1.499125243015088
For ss <= 2: 0.6345392770424688


# Make Classification

The classification performance is poor. The best model (SVM) simply predicts that every sample has a severity score higher than 2.



In [30]:
# Target column followed by the v2-imputed features (rows labelled up to 76),
# plus a binary label: 1 when the severity score is above 2, otherwise 0.
df_cla = pd.concat([df_train[['severity_score']], X_imputed_v2.loc[:76]], axis=1)
df_cla['label'] = df_cla['severity_score'].gt(2).astype(int)

from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def classify(X, y):
    """Print the leave-one-out accuracy of five classifiers.

    For every leave-one-out split the features are standardised with a
    scaler fitted on the training rows only; a random forest, an SVM, a
    Gaussian naive Bayes, a decision tree and a linear discriminant
    analysis are then each fitted from scratch and scored on the held-out
    row. The scores are averaged over all splits and printed.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature table, one row per sample.
    y : pandas.Series or array-like
        Class labels. Rows of ``X`` and ``y`` are paired by position.

    Returns
    -------
    None
        The accuracies are only printed.
    """
    # LeaveOneOut yields *positional* indices, so select rows by position on
    # plain arrays instead of label-based .loc (which only works when the
    # index happens to be 0..n-1).
    X_all = np.asarray(X)
    y_all = np.asarray(y)
    n_tryout = len(X_all)

    # Display name -> factory producing a fresh, unfitted classifier.
    # Insertion order is the order in which results are printed.
    models = {
        "Random Forest": lambda: RandomForestClassifier(random_state=0),
        "SVM": lambda: svm.SVC(random_state=0),
        "Naive Bayes": GaussianNB,
        "Decision Trees": lambda: tree.DecisionTreeClassifier(random_state=0),
        "Linear Discriminant Analysis": LinearDiscriminantAnalysis,
    }
    totals = dict.fromkeys(models, 0)

    for train, test in LeaveOneOut().split(X_all):
        X_train = X_all[train]
        y_train = y_all[train]
        X_test = X_all[test]
        y_test = y_all[test]

        scaler = preprocessing.StandardScaler().fit(X_train)  # normalization
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        for name, make_model in models.items():
            totals[name] += make_model().fit(X_train, y_train).score(X_test, y_test)

    print("Accuracy")
    for name in models:
        print("%s %s" % (name, totals[name] / n_tryout))

# Features are every column between the first (severity_score) and the last (label).
feature_columns = df_cla.columns[1:-1]
classify(df_cla[feature_columns], df_cla['label'])
        
        
        

Accuracy
Random Forest 0.7012987012987013
SVM 0.7662337662337663
Naive Bayes 0.6493506493506493
Decision Trees 0.6233766233766234
Linear Discriminant Analysis 0.7532467532467533


# Make Prediction for test dataset

- Normalize both X and y.
- Impute with all data (train and test together).
- Use only one model.

In [62]:
def predict(X_train, y_train, X_test):
    """Fit a Lasso (alpha=0.1) on standardised data and predict ``X_test``.

    Both the features and the target are standardised using statistics of
    the training data only. Predictions are mapped back to the original
    target scale with ``reverse`` before being returned.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training target values.
    X_test : array-like
        Features to predict for.

    Returns
    -------
    The predictions for ``X_test`` on the original target scale.
    """
    train_features = np.array(X_train)
    test_features = np.array(X_test)

    # z-score the target with the training mean / standard deviation
    targets = np.array(y_train)
    mean = np.mean(targets)
    std = np.std(targets)
    targets_scaled = (targets - mean) / std

    # normalization: the scaler is fitted on the training features only
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_scaled = scaler.transform(train_features)
    test_scaled = scaler.transform(test_features)

    # fit the lasso, then undo the target scaling on its predictions
    clf = linear_model.Lasso(alpha=0.1)
    clf.fit(train_scaled, targets_scaled)
    return reverse(clf.predict(test_scaled), mean, std)
        
# Train on the rows labelled up to 76 of the v2 imputation and predict the
# rows labelled 77 onward.
train_part = X_imputed_v2.loc[:76]
test_part = X_imputed_v2.loc[77:]
y_predict = predict(train_part, df_train['severity_score'], test_part)

# Fill the severity_score column of the table read from prediction.csv and
# save the result under a new file name.
prediction = pd.read_csv('prediction.csv')
prediction['severity_score'] = list(y_predict)
prediction.to_csv('prediction_v0810.csv', index=False)