 <font color = 'red' > <font size = '5' >Modeling

In [1]:
# basic python tools for numbers and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# statistics tools
import scipy.stats as stats

# scale and one hot encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# machine learning tools
from sklearn import preprocessing 

# from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier

# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import linear_model

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB # Gaussian Naive Bays
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# other tools
from functools import reduce
from imblearn.over_sampling import SMOTE
import itertools
import copy

# Clear warnings
import warnings
warnings.filterwarnings('ignore') # Filter out warnings

# show plots inline
%matplotlib inline

## Split train & test 

In [2]:
def normal_split_train_test(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)
#     print("Before OverSampling, counts of positive mental diagnosis in y_train:", sum(y_train==1))
#     print("Before OverSampling, counts of negative mental diagnosis in y_train:", sum(y_train==2))
    return X_train, X_test, y_train, y_test
    
def smote_split_train_test(X,y):
    X_train, X_test, y_train, y_test = normal_split_train_test(X,y)
    
    print("Before OverSampling, counts of positive depression in y_train:",sum(y_train==1))
    print("Before OverSampling, counts of negative depression in y_train:",sum(y_train==2))

    sm = SMOTE(random_state=2)
    X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())
    
    print("After OverSampling, counts of positive depression in y_train_res:",sum(y_train_res==1))
    print("After OverSampling, counts of negative depression in y_train_res:",sum(y_train_res==2))
    
    return X_train_res, X_test, y_train_res, y_test


## Machine learning models

In [3]:
def compute_accuracy(predict, actual):
        if len(predict) != len(actual):
            return 0
        correctness = 0
        for i in range(len(predict)):
            if predict[i] == actual[i]: 
                correctness += 1
            i += 1
        accuracy = correctness / len(predict)
        return accuracy
    
def main(df,model,SMOTE):
    y = df['mental diagnosis'].values
    X = df.drop(['mental diagnosis'],axis = 1).values
    
    if SMOTE == False:
        X_train, X_test, y_train, y_test = normal_split_train_test(X,y)
    else:
        X_train, X_test, y_train, y_test = smote_split_train_test(X,y)
    
    if model == 'logistic regression': 
        algo = LogisticRegression(C=9,dual=False)
    elif model == 'knn':
        algo = KNeighborsClassifier(n_neighbors = 3) 
    elif model == 'SVC':
        algo = SVC()
    elif model == 'random forest':
        algo = RandomForestClassifier(n_estimators=500) 
    elif model == 'linear SVC':
        algo = LinearSVC(C=1000,dual=False)
    else:
        print('This model is not included yet. Add this model now!')
    
    algo.fit(X_train,y_train)
    train_predictions = algo.predict(X_train)
    test_predictions = algo.predict(X_test)    
    
    print()
    print('The training set accuracy is', compute_accuracy(train_predictions, y_train)*100,'%')
    print('The testing set accuracy is', compute_accuracy(test_predictions, y_test)*100,'%')

## Fit data before SMOTE

In [4]:
df = pd.read_csv('part 2 dataframe.csv')
df.head()

Unnamed: 0,mental diagnosis,age,race,family income,education,marital status,male,female,freq:12 drinks/yr?,alco freq year,...,sm 100 cig?,sm yrs,nicotine content,work type,weekly work hrs,activity level,monthly activity freq,activity duration,BMI,weight goal outcome
0,2,1.342887,3.0,3,0,1.0,1,0,1.0,-0.383705,...,2.0,2.49589,-0.289257,1.0,1.154879,1.0,-0.377654,0.280776,-0.016783,2
1,2,-1.06876,1.0,1,0,1.0,0,1,1.0,1.458122,...,1.0,-0.571269,0.219321,4.0,-1.029785,1.0,-0.543703,-0.54156,-0.080694,1
2,1,1.515148,4.0,0,0,1.0,0,1,1.0,0.007709,...,1.0,-0.745823,1.338191,4.0,-0.969726,1.0,1.061435,-0.54156,-0.082984,2
3,2,1.342887,1.0,1,1,3.0,1,0,2.0,-0.579113,...,2.0,-0.172289,0.473609,1.0,1.305028,2.0,0.231191,-0.678616,-0.034844,1
4,2,-0.379718,4.0,0,1,3.0,1,0,1.0,0.732915,...,2.0,-0.396716,1.389049,1.0,-1.322575,1.0,6.042899,-0.747144,0.045181,2


In [5]:
main(df,'logistic regression',False)


The training set accuracy is 92.02702702702703 %
The testing set accuracy is 89.1891891891892 %


In [6]:
main(df,'knn',False)


The training set accuracy is 92.97297297297298 %
The testing set accuracy is 90.27027027027027 %


In [7]:
main(df,'SVC',False)


The training set accuracy is 91.48648648648648 %
The testing set accuracy is 91.8918918918919 %


In [8]:
main(df,'random forest',False)


The training set accuracy is 100.0 %
The testing set accuracy is 90.81081081081082 %


In [9]:
main(df,'linear SVC',False)


The training set accuracy is 91.35135135135135 %
The testing set accuracy is 91.35135135135135 %


## Fit data after SMOTE

In [10]:
main(df,'logistic regression',True)

Before OverSampling, counts of positive depression in y_train: 60
Before OverSampling, counts of negative depression in y_train: 680
After OverSampling, counts of positive depression in y_train_res: 680
After OverSampling, counts of negative depression in y_train_res: 680

The training set accuracy is 75.88235294117646 %
The testing set accuracy is 68.64864864864865 %


In [11]:
main(df,'knn',True)

Before OverSampling, counts of positive depression in y_train: 66
Before OverSampling, counts of negative depression in y_train: 674
After OverSampling, counts of positive depression in y_train_res: 674
After OverSampling, counts of negative depression in y_train_res: 674

The training set accuracy is 91.17210682492582 %
The testing set accuracy is 67.02702702702703 %


In [12]:
main(df,'SVC',True)

Before OverSampling, counts of positive depression in y_train: 65
Before OverSampling, counts of negative depression in y_train: 675
After OverSampling, counts of positive depression in y_train_res: 675
After OverSampling, counts of negative depression in y_train_res: 675

The training set accuracy is 93.85185185185185 %
The testing set accuracy is 79.45945945945945 %


In [13]:
main(df,'random forest',True)

Before OverSampling, counts of positive depression in y_train: 65
Before OverSampling, counts of negative depression in y_train: 675
After OverSampling, counts of positive depression in y_train_res: 675
After OverSampling, counts of negative depression in y_train_res: 675

The training set accuracy is 100.0 %
The testing set accuracy is 91.8918918918919 %


In [14]:
main(df,'linear SVC',True)

Before OverSampling, counts of positive depression in y_train: 61
Before OverSampling, counts of negative depression in y_train: 679
After OverSampling, counts of positive depression in y_train_res: 679
After OverSampling, counts of negative depression in y_train_res: 679

The training set accuracy is 70.83946980854198 %
The testing set accuracy is 65.4054054054054 %
