In [1]:
import math as math
import random as rd
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV, RFE
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
# Load the training set.
train_data = pd.read_csv("train.csv")

# Build the predictor frame: drop the `health` label, the `personid` / `uniqueid` / `year`
# columns, and six columns that contain no data ("x96", "x97", "x98", "x1021", "x1098", "x1099").
train_X = train_data.drop(["health", "personid", "uniqueid", "year", "x96", "x97", "x98", "x1021", "x1098", "x1099"], axis = 1)

# change all numerical values to float type
train_X = train_X.astype(float)

# Keep a column only when at least half of its rows are non-missing. The row count is
# read from the frame so the threshold stays correct if train.csv changes size.
threshold = math.floor(0.5 * len(train_X))
train_X_less_na = train_X.dropna(thresh=threshold, axis=1)
print(len(train_X_less_na.columns))
print(train_X.shape)

179
(17398, 1199)


In [3]:
# Collect the names of the columns treated as categorical: a column qualifies when it has
# fewer than 14 distinct values (pd.unique reports NaN as one of the distinct values).
category_vars = [
    col_name
    for col_name in train_X_less_na.columns
    if len(pd.unique(train_X_less_na[col_name])) < 14
]

print(len(category_vars))

138


In [4]:
# Impute missing values on a copy of the training predictors:
#   * categorical columns (names listed in category_vars) receive the column mode; when
#     several values tie for the mode, one of them is drawn at random for each missing cell;
#   * every other column receives the column mean.
# Each column is filled through a boolean mask instead of visiting every (row, column)
# cell with `.at`, which avoids a Python-level loop over the whole frame.

train_X_less_na_imp = train_X_less_na.copy()

for name in train_X_less_na_imp.columns:
    missing = train_X_less_na_imp[name].isna()
    if not missing.any():
        continue  # nothing to fill in this column
    if name in category_vars:
        modes = train_X_less_na_imp[name].mode().tolist()
        # One random draw per missing cell, assigned in row order.
        train_X_less_na_imp.loc[missing, name] = rd.choices(modes, k=int(missing.sum()))
    else:
        # mean() skips NaN, so it is the mean of the observed values.
        train_X_less_na_imp.loc[missing, name] = train_X_less_na_imp[name].mean()

# check if there is still any NAs: print the name of every column that still has one
for c in train_X_less_na_imp.columns[train_X_less_na_imp.isna().any()]:
    print(c)

In [5]:
# Prepare the matrix fed to the One-Class SVMs: standardise (subtract mean, divide by
# standard deviation) every non-categorical column whose values leave the [0, 1] range,
# then replace the categorical columns with indicator columns.
train_X_scaled = train_X_less_na_imp.copy()

continuous_cols = [col for col in train_X_scaled if col not in category_vars]
for col in continuous_cols:
    values = train_X_scaled[col]
    needs_scaling = (max(values) > 1) or (min(values) < 0)
    if needs_scaling:
        train_X_scaled[col] = (values - values.mean()) / values.std()

# Indicator variables for the categorical columns; the first level of each is dropped.
train_X_scaled = pd.get_dummies(train_X_scaled, columns = category_vars, drop_first = True)

print(len(train_X_scaled.columns))

530


In [6]:
# Fit one One-Class SVM per health class (1 to 5) on that class's rows of train_X_scaled,
# then score those same rows. predict() marks outliers as -1 and inliers as 1.
_svm_by_class = {}
_rows_by_class = {}
_flags_by_class = {}
for health_class in (1, 2, 3, 4, 5):
    detector = OneClassSVM(gamma='scale', nu=0.01)
    class_rows = train_X_scaled[train_data.health == health_class]
    detector.fit(class_rows)
    _svm_by_class[health_class] = detector
    _rows_by_class[health_class] = class_rows
    _flags_by_class[health_class] = detector.predict(class_rows)

# Re-expose the per-class objects under their individual names.
one_class_svm_1, one_class_svm_2, one_class_svm_3, one_class_svm_4, one_class_svm_5 = (
    _svm_by_class[k] for k in (1, 2, 3, 4, 5)
)
train_X_scaled_1, train_X_scaled_2, train_X_scaled_3, train_X_scaled_4, train_X_scaled_5 = (
    _rows_by_class[k] for k in (1, 2, 3, 4, 5)
)
outliers_1, outliers_2, outliers_3, outliers_4, outliers_5 = (
    _flags_by_class[k] for k in (1, 2, 3, 4, 5)
)

In [7]:
# remove outliers from train_X_less_na_imp
#
# Step 1: put the per-class One-Class SVM flags (outliers_1 .. outliers_5) back into the
# original row order. Rows whose health value is not 1-5 keep the initial flag 0 and are
# therefore dropped together with the outliers (flag -1).
outlier_flags = np.zeros(len(train_data.health), dtype=int)
for health_class, class_flags in (
    (1, outliers_1),
    (2, outliers_2),
    (3, outliers_3),
    (4, outliers_4),
    (5, outliers_5),
):
    # Boolean-mask assignment writes class_flags onto the class's rows in row order,
    # the same order in which those rows were handed to predict().
    outlier_flags[(train_data.health == health_class).to_numpy()] = class_flags
outliers = outlier_flags.tolist()

# Step 2: keep only the inliers (flag == 1) in each training frame and renumber the rows.
# NOTE: this adds an 'outliers' column to train_X_scaled; re-fitting the One-Class SVMs
# on train_X_scaled after this point would treat that column as a feature.
train_X_scaled['outliers'] = outliers
is_inlier = train_X_scaled.outliers == 1
train_X_less_na_imp = train_X_less_na_imp[is_inlier].reset_index(drop = True)
train_no_outlier = train_data[is_inlier].reset_index(drop = True)
train_X_no_outlier = train_X[is_inlier].reset_index(drop = True)
print(train_X_less_na_imp.shape)

(17093, 179)


In [8]:
warnings.filterwarnings('ignore')
# Removing highly correlated features

# Absolute Pearson correlation between every pair of imputed training columns.
corr_matrix = train_X_less_na_imp.corr(method='pearson').abs()

# Keep only the strict upper triangle so each pair is examined once and the diagonal is
# ignored. The builtin `bool` is used as the dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when its correlation with any earlier column exceeds 0.95.
to_drop = upper.columns[(upper > 0.95).any(axis=0)].tolist()

# Drop features
train_X_less_na_imp_corr = train_X_less_na_imp.drop(to_drop, axis=1)
print(train_X_less_na_imp_corr.shape)

(17093, 170)


In [9]:
# Recursive feature elimination driven by random-forest feature importances.
# NOTE(review): n_features_to_select=200 is larger than the 170 columns that reach this
# cell, and the next cell reports 170 selected features, so nothing is eliminated here.
# An RFECV variant (step=2, cv=5, scoring="neg_log_loss") was tried as an alternative.
rf_estimator = RandomForestClassifier(
    n_estimators=10,
    oob_score=True,
    criterion="entropy",
    min_samples_split = 5,
)
selector_rf = RFE(rf_estimator, n_features_to_select=200, step=2)
selector_rf = selector_rf.fit(train_X_less_na_imp_corr, train_no_outlier.health)

In [10]:
# Names of the columns the RFE selector kept, followed by a printed count and listing.
selected_mask = selector_rf.get_support()
features_rf = train_X_less_na_imp_corr.columns[selected_mask]
print(len(features_rf))
for feature_name in features_rf:
    print(feature_name)

170
x1
x2
x3
x4
x7
x14
x17
x19
x162
x163
x164
x195
x223
x224
x225
x226
x227
x228
x229
x230
x452
x472
x477
x544
x545
x546
x547
x548
x595
x596
x597
x613
x614
x615
x616
x617
x630
x631
x632
x634
x635
x638
x639
x640
x641
x642
x643
x644
x645
x646
x647
x648
x649
x650
x651
x652
x655
x657
x659
x681
x715
x718
x723
x725
x726
x728
x729
x730
x754
x758
x759
x766
x767
x768
x769
x770
x771
x772
x773
x774
x775
x776
x777
x778
x779
x782
x784
x893
x896
x897
x898
x901
x902
x906
x907
x908
x909
x910
x911
x912
x920
x921
x923
x929
x930
x931
x934
x935
x939
x940
x941
x942
x943
x944
x945
x953
x954
x956
x961
x963
x964
x965
x966
x967
x968
x970
x1032
x1033
x1035
x1036
x1039
x1106
x1108
x1110
x1132
x1133
x1140
x1141
x1143
x1144
x1145
x1146
x1147
x1150
x1152
x1153
x1157
x1158
x1159
x1160
x1161
x1162
x1164
x1165
x1166
x1167
x1175
x1176
x1177
x1178
x1179
x1181
x1182
x1183
x1184
x1185
x1201
x1202
x1203
x1204


In [11]:
# Read the test set, cast every column to float, and keep only the selected features.
test_data = pd.read_csv("test.csv").astype(float)

test_data_clean = test_data[features_rf]

In [12]:
# impute test data
#
# Missing test values are filled with statistics taken from the training rows
# (train_X_no_outlier) rather than from the test set itself, so that train and test pass
# through the same transformation and a test column that is entirely missing can still be
# filled:
#   * categorical columns (names in category_vars): the training mode, with a random
#     pick per missing cell when several values tie;
#   * other columns: the training mean.

test_data_clean_imp = test_data_clean.copy()

for name in test_data_clean_imp.columns:
    missing = test_data_clean_imp[name].isna()
    if not missing.any():
        continue  # nothing to fill in this column
    train_column = train_X_no_outlier[name]
    if name in category_vars:
        modes = train_column.mode().tolist()
        test_data_clean_imp.loc[missing, name] = rd.choices(modes, k=int(missing.sum()))
    else:
        # mean() skips NaN, so this is the mean of the observed training values.
        test_data_clean_imp.loc[missing, name] = train_column.mean()

In [13]:
# re-impute train X data after removing outliers
#
# Work on an explicit copy of the selected columns: assigning into the bare selection
# train_X_no_outlier[features_rf] is a chained assignment that pandas may apply to a
# temporary object.
train_X_no_outlier_final = train_X_no_outlier[features_rf].copy()

# Categorical columns (names in category_vars) get the column mode, with a random pick per
# missing cell when several values tie; all other columns get the column mean.
for name in train_X_no_outlier_final.columns:
    missing = train_X_no_outlier_final[name].isna()
    if not missing.any():
        continue  # nothing to fill in this column
    if name in category_vars:
        modes = train_X_no_outlier_final[name].mode().tolist()
        train_X_no_outlier_final.loc[missing, name] = rd.choices(modes, k=int(missing.sum()))
    else:
        # mean() skips NaN, so it is the mean of the observed values.
        train_X_no_outlier_final.loc[missing, name] = train_X_no_outlier_final[name].mean()

# check if there is still any NAs: print the name of every column that still has one
for c in train_X_no_outlier_final.columns[train_X_no_outlier_final.isna().any()]:
    print(c)

In [14]:
# scale continuous data
#
# Each non-categorical column whose TRAINING values leave the [0, 1] range is standardised
# with the training mean and standard deviation, and the identical shift and scale are
# applied to the test column. Taking both the decision and the statistics from the
# training data keeps a feature on the same scale in both frames; deciding and scaling
# separately per frame could standardise a column in one frame but not in the other.

for c in train_X_no_outlier_final.columns:
    if c in category_vars:
        continue
    train_values = train_X_no_outlier_final[c]
    if (train_values.max() > 1) or (train_values.min() < 0):
        # Capture the statistics before the training column is overwritten.
        center = train_values.mean()
        spread = train_values.std()
        train_X_no_outlier_final[c] = (train_values - center) / spread
        test_data_clean_imp[c] = (test_data_clean_imp[c] - center) / spread

In [15]:
# Replace the categorical columns of the training predictors with indicator columns
# (first level of each variable dropped).
train_X_log_cat_columns = [
    col for col in train_X_no_outlier_final.columns if col in category_vars
]
train_X_no_outlier_final = pd.get_dummies(
    train_X_no_outlier_final,
    columns = train_X_log_cat_columns,
    drop_first = True,
)

# Same encoding for the test predictors.
test_log_cat_columns = [
    col for col in test_data_clean_imp.columns if col in category_vars
]
test_data_clean_imp = pd.get_dummies(
    test_data_clean_imp,
    columns = test_log_cat_columns,
    drop_first = True,
)

In [16]:
# Make the test frame carry exactly the training frame's columns, in the same order.
train_columns = train_X_no_outlier_final.columns

# Indicator columns that exist only in the test frame are removed.
test_only = [col for col in test_data_clean_imp.columns if col not in train_columns]
test_data_clean_imp = test_data_clean_imp.drop(test_only, axis = 1)

# Indicator columns that exist only in the training frame are added as all-zero columns.
n_test_rows = len(test_data_clean_imp)
for col in train_columns:
    if col not in test_data_clean_imp.columns:
        test_data_clean_imp[col] = [0] * n_test_rows

## correct order
test_data_clean_imp = test_data_clean_imp[train_columns]

In [None]:
# Grid of hyper-parameters explored by cross-validation: L2 penalty (alpha), mini-batch
# size, and hidden-layer layout.
parameters = {'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],
              'batch_size': [32, 64, 128, 256, 500],
              'hidden_layer_sizes': [[9,9,], [20,10],[47,47,],[87,47,], [9,5,], [20,],
                                     [9,],[87,],[100,]]}

# Training inputs for the search. X2 and Y were not defined anywhere in the notebook, so
# this cell raised NameError on a fresh kernel. They are bound to the prepared training
# frame (the frame whose columns test_data_clean_imp was aligned to) and to the health
# labels of the rows that survived outlier removal.
X2 = train_X_no_outlier_final
Y = train_no_outlier.health

# NOTE(review): scikit-learn documents `learning_rate` as only used with solver='sgd';
# no solver is passed here, so confirm the intended solver.
nn_2 = MLPClassifier(max_iter = 500, learning_rate_init = 0.1, learning_rate = "adaptive")
clf_nn_2 = GridSearchCV(nn_2, param_grid = parameters, scoring = "neg_log_loss")
nn_model_2 = clf_nn_2.fit(X2, Y) # the final Neural Network model

In [None]:
# Best mean cross-validated score (negative log loss) found by the grid search, followed
# by the estimator refit with the winning hyper-parameters.
best_cv_score = nn_model_2.best_score_
print(best_cv_score)
nn_model_2.best_estimator_

In [None]:
# Class probabilities predicted by the tuned neural network for the test rows, one column
# per class ("p1" .. "p5"), with the integer uniqueid placed in the first column.
class_probabilities = nn_model_2.predict_proba(test_data_clean_imp)
probability_columns = ["p1", "p2", "p3", "p4", "p5"]
predict_nn = pd.DataFrame(class_probabilities, columns = probability_columns)
test_ids = test_data.uniqueid.astype(int)
predict_nn.insert(loc = 0, column = 'uniqueid', value = test_ids)
# To export a submission file: predict_nn.to_csv(<path>, index = False, header = True)