In [18]:
import pandas as pd 
import numpy as np
import seaborn as sns

%matplotlib inline
import matplotlib as mpl 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_curve
from numpy import sqrt
from numpy import argmax

In [19]:
# Load the project dataset and discard its first two columns, which this
# notebook regards as redundant features.
raw_csv_path = "../Data/project_data.csv"
df = pd.read_csv(raw_csv_path).iloc[:, 2:]

In [20]:
# Cast the columns that are handled as categorical features to `object` dtype.
# The `result` column is not converted.
categorical_columns = [
    "gender", "race", "marital_status", "education_level_adults",
    "language", "trouble_sleeping_history",
    "SMQ020", "SMQ040", "SMQ670",
    "WHQ030", "WHQ040", "WHQ070",
]
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype(object)

In [21]:
# Hold out 30% of the rows as a test set; the split is seeded and stratified
# on the `result` column.
train_set, test_set = train_test_split(
    df,
    test_size=0.3,
    stratify=df["result"],
    random_state=0,
)

In [22]:
# Separate the predictor columns from the `result` target in each partition.
X_train, y_train = train_set.drop(columns="result"), train_set["result"].copy()
X_test, y_test = test_set.drop(columns="result"), test_set["result"].copy()

In [23]:
# Inspect which distinct label values occur in the training target.
y_train.unique()

array([0, 1])

# Data Preprocessing

In [24]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [25]:
# Numeric features: fill missing values with the column median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill missing values with the most frequent value, then
# one-hot encode.
# handle_unknown="ignore" makes the encoder emit an all-zero indicator block for
# a category that was absent from the data it was fitted on. With the default
# ("error"), a rare category missing from a training fold would make transform
# raise on the validation fold or on the test set.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [26]:
# Columns routed through the numeric pipeline (order is kept as listed).
num_attribs = [
    "age", "family_PIR", "sleep_hours", "drinks_per_occasion",
    "SMD030", "SMD641", "SMD650", "SMD630",
    "WHD010", "WHD020", "WHD050", "WHD110", "WHD120", "WHD140",
    "WHQ150",
]

# Columns routed through the categorical pipeline (order is kept as listed).
cat_attribs = [
    "gender", "race", "marital_status", "education_level_adults",
    "language", "trouble_sleeping_history",
    "SMQ020", "SMQ040", "SMQ670",
    "WHQ030", "WHQ040", "WHQ070",
]

# Apply each pipeline to its own column group and concatenate the results.
col_transform = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ]
)

# Evaluate Several Machine Learning Models

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [28]:
# Cross-validation scheme reused by every evaluation below:
# 5 stratified folds repeated 3 times (15 splits), with a fixed seed.
k_folds = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=0,
)

In [29]:
# Baseline k-NN: preprocessing followed by a default KNeighborsClassifier,
# scored with the shared cross-validation scheme.
knn_model = Pipeline(steps=[
    ("transformer", col_transform),
    ("classifier", KNeighborsClassifier()),
])
knn_scores = cross_val_score(knn_model, X_train, y_train, cv=k_folds)

# Mean and standard deviation of the per-split scores.
knn_mean, knn_std = knn_scores.mean(), knn_scores.std()
print(knn_mean, knn_std)

0.8447106457336079 0.0029068806969251666


In [33]:
# Baseline random forest: preprocessing followed by a seeded
# RandomForestClassifier, scored with the shared cross-validation scheme.
rf_model = Pipeline(steps=[
    ("transformer", col_transform),
    ("classifier", RandomForestClassifier(random_state=0)),
])
rf_scores = cross_val_score(rf_model, X_train, y_train, cv=k_folds)

# Mean and standard deviation of the per-split scores.
rf_mean, rf_std = rf_scores.mean(), rf_scores.std()
print(rf_mean, rf_std)

0.8546759185727834 0.0019324358605260579


## Feature Selection

In [34]:
from sklearn.feature_selection import SelectKBest, f_classif

In [52]:
# k-NN on the 5 features that score highest under the univariate f_classif test.
# The final step is named "classifier" (it was misspelled "classfier") so that
# "classifier__<param>" keys resolve on this pipeline as they do on knn_model.
knn_k_best = Pipeline([
    ("transformer", col_transform),
    ("selector", SelectKBest(f_classif, k=5)),
    ("classifier", KNeighborsClassifier()),
])

knn_k_best_scores = cross_val_score(knn_k_best, X_train, y_train, cv=k_folds)
print(knn_k_best_scores.mean(), knn_k_best_scores.std())

0.8361486727972386 0.004862116801432051


In [53]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC


# k-NN on the features retained by an L1-penalised linear SVM (SelectFromModel).
# The final step is named "classifier" (it was misspelled "classfier") so that
# "classifier__<param>" keys resolve on this pipeline as they do on knn_model.
knn_SFM = Pipeline([
    ("transformer", col_transform),
    ("selector", SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))),
    ("classifier", KNeighborsClassifier()),
])

knn_SFM_scores = cross_val_score(knn_SFM, X_train, y_train, cv=k_folds)
print(knn_SFM_scores.mean(), knn_SFM_scores.std())



0.843487404143394 0.0030671064084524476


In [50]:
# Random forest on the 5 features that score highest under the univariate
# f_classif test.
# The final step is named "classifier" (it was misspelled "classfier") so that
# "classifier__<param>" keys resolve on this pipeline as they do on rf_model.
rf_k_best = Pipeline([
    ("transformer", col_transform),
    ("selector", SelectKBest(f_classif, k=5)),
    ("classifier", RandomForestClassifier(random_state=0)),
])

rf_k_best_scores = cross_val_score(rf_k_best, X_train, y_train, cv=k_folds)
print(rf_k_best_scores.mean(), rf_k_best_scores.std())

0.8361126514293807 0.003610829509343398


In [51]:
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC


# Random forest on the features retained by an L1-penalised linear SVM
# (SelectFromModel).
# The final step is named "classifier" (it was misspelled "classfier") so that
# "classifier__<param>" keys resolve on this pipeline as they do on rf_model.
rf_SFM = Pipeline([
    ("transformer", col_transform),
    ("selector", SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))),
    ("classifier", RandomForestClassifier(random_state=0)),
])

rf_SFM_scores = cross_val_score(rf_SFM, X_train, y_train, cv=k_folds)
print(rf_SFM_scores.mean(), rf_SFM_scores.std())



0.8546579928105756 0.0014800983917987796


# Hyperparameter Tuning

In [54]:
from sklearn.model_selection import GridSearchCV 
# Exhaustive search over the k-NN pipeline: n_neighbors 1..9 crossed with both
# weighting schemes (18 candidates), scored by accuracy on the shared CV scheme.
knn_param_grid = {
    "classifier__n_neighbors": np.arange(1, 10),
    "classifier__weights": ["uniform", "distance"],
}
knn_grid_cv = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_param_grid,
    scoring="accuracy",
    cv=k_folds,
    n_jobs=-1,
)
knn_grid_cv.fit(X_train, y_train)

GridSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=0),
             estimator=Pipeline(steps=[('transformer',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['age',
                                                                          'family_PIR',
                                                                          'sleep_hours',
                                                                          'drinks_per_oc

In [55]:
# Parameter combination that the k-NN grid search selected.
knn_grid_cv.best_params_

{'classifier__n_neighbors': 8, 'classifier__weights': 'uniform'}

In [56]:
# Best cross-validated score of the k-NN grid search (scoring="accuracy").
knn_grid_cv.best_score_

0.8527512332696323

In [57]:
# Check how many rows/columns the preprocessing step produces for the training
# features. Note that this calls fit_transform on the col_transform object itself.
col_transform.fit_transform(X_train).shape

(18531, 55)

In [58]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Sampling distributions for the random-forest hyperparameters.
# scipy's randint(low, high) draws integers from the half-open range [low, high),
# so e.g. min_samples_leaf is sampled from {2, 3}.
rf_param_distributions = {
    "classifier__n_estimators": randint(50, 200),
    "classifier__max_features": randint(3, 9),
    "classifier__max_depth": randint(5, 20),
    "classifier__min_samples_leaf": randint(2, 4),
}

# 10 sampled candidates x 15 CV splits = 150 pipeline fits (plus the final refit).
# n_jobs=-1 runs them on all available cores, matching the k-NN grid search; the
# sampled candidates and their scores stay reproducible because the search, the
# CV splitter and the forest are all seeded.
rf_random_search = RandomizedSearchCV(
    rf_model,
    rf_param_distributions,
    n_iter=10,
    cv=k_folds,
    scoring="accuracy",
    return_train_score=True,
    random_state=0,
    n_jobs=-1,
)

In [59]:
# Run the randomized search on the training split.
rf_random_search.fit(X_train, y_train)

RandomizedSearchCV(cv=RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=0),
                   estimator=Pipeline(steps=[('transformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('imputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('scaler',
                                                                                                StandardScaler())]),
                                                                               ['age',
                                                                                'family_PIR',
                                                                                'sleep_hours',
                            

In [60]:
# Parameter combination that the random-forest randomized search selected.
rf_random_search.best_params_

{'classifier__max_depth': 17,
 'classifier__max_features': 8,
 'classifier__min_samples_leaf': 2,
 'classifier__n_estimators': 117}

In [61]:
# Best cross-validated score of the randomized search (scoring="accuracy").
rf_random_search.best_score_

0.8552515907900793

## Ensemble Learning

In [62]:
# Recap: tuned k-NN parameters that feed the ensemble built below.
knn_grid_cv.best_params_

{'classifier__n_neighbors': 8, 'classifier__weights': 'uniform'}

In [63]:
# Recap: tuned random-forest parameters that feed the ensemble built below.
rf_random_search.best_params_

{'classifier__max_depth': 17,
 'classifier__max_features': 8,
 'classifier__min_samples_leaf': 2,
 'classifier__n_estimators': 117}

In [64]:
from sklearn.ensemble import VotingClassifier
# Pull the tuned hyperparameters out of the two fitted searches so they can be
# passed to fresh estimators.
knn_best = knn_grid_cv.best_params_
rf_best = rf_random_search.best_params_

# k-NN
best_n_neighbors = knn_best['classifier__n_neighbors']
best_weights = knn_best['classifier__weights']

# Random forest
best_max_depth = rf_best['classifier__max_depth']
best_max_features = rf_best['classifier__max_features']
best_min_samples_leaf = rf_best['classifier__min_samples_leaf']
best_n_estimators = rf_best['classifier__n_estimators']

In [65]:
# Tuned k-NN pipeline (preprocessing + classifier with the selected parameters).
tuned_knn_pipeline = Pipeline([
    ("transformer", col_transform),
    ("classifier", KNeighborsClassifier(
        n_neighbors=best_n_neighbors,
        weights=best_weights,
    )),
])

# Tuned random-forest pipeline (preprocessing + seeded forest with the selected
# parameters).
tuned_rf_pipeline = Pipeline([
    ("transformer", col_transform),
    ("classifier", RandomForestClassifier(
        random_state=0,
        max_depth=best_max_depth,
        max_features=best_max_features,
        min_samples_leaf=best_min_samples_leaf,
        n_estimators=best_n_estimators,
    )),
])

# Soft-voting ensemble of the two tuned pipelines.
voting_model = VotingClassifier(
    estimators=[("knn", tuned_knn_pipeline), ("rf", tuned_rf_pipeline)],
    voting="soft",
)

In [66]:
# Score the ensemble with the same cross-validation scheme as the single models.
voting_scores = cross_val_score(voting_model, X_train, y_train, cv=k_folds)

# Mean and standard deviation of the per-split scores.
voting_mean, voting_std = voting_scores.mean(), voting_scores.std()
print(voting_mean, voting_std)

0.8538664495776308 0.0014246715740469733


In [67]:
# Fit the ensemble on the whole training split so it can be used for prediction.
voting_model.fit(X_train, y_train)

VotingClassifier(estimators=[('knn',
                              Pipeline(steps=[('transformer',
                                               ColumnTransformer(transformers=[('num',
                                                                                Pipeline(steps=[('imputer',
                                                                                                 SimpleImputer(strategy='median')),
                                                                                                ('scaler',
                                                                                                 StandardScaler())]),
                                                                                ['age',
                                                                                 'family_PIR',
                                                                                 'sleep_hours',
                                                                      

# Making Predictions

In [68]:
from sklearn.metrics import accuracy_score
def testModel(model, name=""):
    """Print the accuracy of a fitted model on the held-out test split.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    name : str, optional
        Label printed in front of the score.

    Notes
    -----
    Reads the module-level ``X_test`` and ``y_test``; prints the result and
    returns ``None``.
    """
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print(name, "- Test accuracy:", test_accuracy)

In [70]:
# Fit the two baseline pipelines on the whole training split before they are
# scored on the test split (in the cells shown above they were only handed to
# cross_val_score and to the search objects, never fitted directly).
knn_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

Pipeline(steps=[('transformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'family_PIR',
                                                   'sleep_hours',
                                                   'drinks_per_occasion',
                                                   'SMD030', 'SMD641', 'SMD650',
                                                   'SMD630', 'WHD010', 'WHD020',
                                                   'WHD050', 'WHD110', 'WHD120',
                                                   'WHD140', 'WHQ150']),
                                      

In [71]:
# Report test accuracy for every fitted model, in a fixed order.
models_to_report = [
    (knn_model, "k-NN"),
    (rf_model, "RF"),
    (knn_grid_cv, "k-NN-tuned"),
    (rf_random_search, "RF-tuned"),
    (voting_model, "Ensemble"),
]
for fitted_model, label in models_to_report:
    testModel(fitted_model, label)

k-NN - Test accuracy: 0.8465122135482246
RF - Test accuracy: 0.8560815915386553
k-NN-tuned - Test accuracy: 0.8552002014605893
RF-tuned - Test accuracy: 0.8564593301435407
Ensemble - Test accuracy: 0.8565852430118358
