In [1]:
# automatically show plots inside the notebook
%matplotlib inline  

# reload all modules before executing code
%load_ext autoreload
%autoreload 2  

In [2]:
# you will use this notebook as a basis to walk us through what you did

In [3]:
import pandas as pd
import numpy as np
features = pd.read_csv('../data/features.csv')
features.head()

Unnamed: 0,Patient ID,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,...,Feature677,Feature678,Feature679,Feature680,Feature681,Feature682,Feature683,Feature684,Feature685,Feature686
0,1,743,3594,81,23154,887,491,368,1322,14624,...,0.05195,0.057096,4.30944,0.827737,0.81342,1.118799,0.635217,1.143215,1.450378,0.790279
1,2,249,1523,373,60306,1347,1016,201,1586,45525,...,0.023671,0.026299,33.007541,1.157097,0.712491,1.149333,0.709755,1.126794,1.171539,0.96027
2,4,150,815,88,54361,1558,452,105,1758,47862,...,0.03036,0.032339,55.780435,0.936845,0.621701,0.688862,0.635621,0.835548,0.89458,0.871643
3,5,376,2095,276,58681,1307,814,241,1527,45351,...,0.024164,0.026028,23.155095,1.188455,0.575252,1.237643,0.533353,1.076273,1.224851,1.133792
4,7,78,1946,109,34454,1577,415,513,1785,26612,...,0.050253,0.051808,12.761692,1.477104,0.71441,1.062945,0.616509,0.979109,1.204625,0.88828


In [295]:
labels = pd.read_csv('../data/labels.csv')
labels.head()

Unnamed: 0,Patient ID,Sickness
0,1,0
1,2,0
2,4,1
3,5,0
4,7,0


In [296]:
data = features.merge(labels, on = 'Patient ID')
target = 'Sickness'
data.head()

Unnamed: 0,Patient ID,Feature0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,...,Feature678,Feature679,Feature680,Feature681,Feature682,Feature683,Feature684,Feature685,Feature686,Sickness
0,1,743,3594,81,23154,887,491,368,1322,14624,...,0.057096,4.30944,0.827737,0.81342,1.118799,0.635217,1.143215,1.450378,0.790279,0
1,2,249,1523,373,60306,1347,1016,201,1586,45525,...,0.026299,33.007541,1.157097,0.712491,1.149333,0.709755,1.126794,1.171539,0.96027,0
2,4,150,815,88,54361,1558,452,105,1758,47862,...,0.032339,55.780435,0.936845,0.621701,0.688862,0.635621,0.835548,0.89458,0.871643,1
3,5,376,2095,276,58681,1307,814,241,1527,45351,...,0.026028,23.155095,1.188455,0.575252,1.237643,0.533353,1.076273,1.224851,1.133792,0
4,7,78,1946,109,34454,1577,415,513,1785,26612,...,0.051808,12.761692,1.477104,0.71441,1.062945,0.616509,0.979109,1.204625,0.88828,0


In [297]:
data.shape

(288, 689)

### Checkpoint

Number of data points : 288
Number of features: 685 (not including patient ID and sickness)

If I run a classifier on the data with these dimensions, I am most definitely going to overfit my model. 

An important preprocessing step I always make sure to do is to scale my data. 

```for every column x in my data:
    mean = mean(x)
    std = std(x)
    for every i in x:
        i = (x - mean)/std```

This above method will standardize every column to make their range and units equal. 

In [298]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
pid = data['Patient ID']
labels = data[target]
data.drop(['Patient ID', 'Sickness'], axis = 1, inplace = True)
col_names = data.columns
scaled_data = scaler.fit_transform(data)
scaled_data = pd.DataFrame(scaled_data, columns = col_names)
scaled_data['Patient ID'] = pid
scaled_data['Sickness'] = labels

In [299]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
train, test = train_test_split(scaled_data, test_size = 0.15, random_state = 30)

### Majority Class Classifier

Let's run a classifier which classifies all datapoints according to the majority class. 

In [269]:
from sklearn.metrics import accuracy_score
clas = []
clas.append(len(train[train[target] == 0]))
clas.append(len(train[train[target] == 1]))
clas = np.array(clas)
print('number of data points with sickness = true: ' + str(len(sick)))
print('number of data points with sickness = false: ' + str(len(no_sick)))
preds = [np.argmax(clas)]*len(test)
print('majority class accuracy percentage = ' + str(accuracy_score(preds, test[target])))

number of data points with sickness = true: 69
number of data points with sickness = false: 219
majority class accuracy percentage = 0.75


### Checkpoint
Majority class classifier has an accuracy level of 75%. 
Need to beat this. 

If I use a any classifier on this dataset, it is going to overfit this data since there are many more features than datapoints. 
My aim is to reduce the dimensions of the feature space of this dataset. 
I am going to go about doing this in three ways and compare their outputs: 
1. Early stopping for decision trees using max_depth
2. Logistic regression with Lasso Penalty to remove features which do not affect our outcome
3. Using Principal Component Analysis to reduce the number spaces to the ones which affect our output the most

In [300]:
def get_acc(clf, train, test):
    clf = clf.fit(train.drop(target, axis = 1), train[target])
    preds = clf.predict(test.drop(target, axis = 1))
    return accuracy_score(preds, test[target]), clf

### Decision Tree with build in GridSerch for selecting the best parameters

In [303]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'max_depth': list(range(1, 25))}
tree_clf = DecisionTreeClassifier()
best_clf = grid_search.GridSearchCV(tree_clf, parameters)
acc, clf = get_acc(best_clf, train, test)
print('test accuracy is :' + str(acc))

test accuracy is :0.7727272727272727


### Decision Tree with self implemented search for best parameter

In [271]:
from sklearn.tree import DecisionTreeClassifier

# params: get all max_depths of trees that I want to try
max_depths = list(range(1, 25))

iters = 50

best_acc = 0
best_depth = max(max_depths)
for max_depth in max_depths:
    mean_acc = 0
    for i in range(iters):
        train_small, validation = train_test_split(train, test_size = 0.1)
        curr_acc, _ = get_acc(DecisionTreeClassifier(max_depth = max_depth), train_small, validation)
        mean_acc += curr_acc
    curr_acc = mean_acc/(1.0*iters)
    if curr_acc > best_acc:
        best_acc = curr_acc
        best_depth = max_depth


# now lets test our best Decision Tree Classifier
acc, best_clf = get_acc(DecisionTreeClassifier(max_depth = best_depth), train, test)
print('test accuracy is :' + str(acc))

test accuracy is :0.7272727272727273



Problems with Decision Tree

1. The problem with decision trees is that it chooses the best feature to split on by checking its accuracy after split on the single feature. Hence, feature combination is not possible. 
2. Decision Trees are not stable as a small alterations in training data will form structurally different dicision trees. 
3. Results can often be improved by using Boosting algorithms or Random Forests. 

### Logistic Regression with Lasso Penalty

The parameter of logistic regression which I shall be tuning here is the 'C' parameter. This is the inverse of the regularization strenght. 
I shall be using l1 penalty or Lasso penalization for feature selection. 
The more I reduce C, the quicker the logistic regression should starting zeroing out features. 

### Lasso Logistic Regression using built in GridSearch

In [307]:
from sklearn.linear_model import LogisticRegression
parameters = {'C': [10**(-i) for i in range(5)]}
log_clf = LogisticRegression(penalty = 'l1')
best_clf = grid_search.GridSearchCV(log_clf, parameters)
acc, clf = get_acc(best_clf, train, test)
print('test accuracy is :' + str(acc))

test accuracy is :0.8181818181818182


### Lasso Logistic Regression using self implemented search for best parameter

In [308]:
from sklearn.linear_model import LogisticRegression
iters = 50
lambdas = []
for i in range(6):
    lambdas.append(10**(-i))

best_acc = 0
best_l = None
for l in lambdas:
    mean_acc = 0
    for i in range(iters):
        train_small, validation = train_test_split(train, test_size = 0.1)
        curr_acc, clf = get_acc(LogisticRegression(C=l, penalty='l1'), train_small, validation)
        mean_acc += curr_acc
    curr_acc = mean_acc/(1.0*iters)
    if curr_acc > best_acc:
        best_acc = curr_acc
        best_l = l

acc, best_clf = get_acc(LogisticRegression(C=best_l, penalty='l1'), train, test)
print('test accuracy is :' + str(acc))

test accuracy is :0.8181818181818182


### Random Forest

Let's try and better our performance of decision trees by using Random Forests. 
The advantages of using Random Forests are listed below: 
1. Even if you increase the number of trees used in the algorithm (by increasing n_estimators), it is hard to overfit Random Forests
2. It can handle thousands of input variables without having to delete any features
3. It gives estimates of what variables are important in the classification.

### Random Forests parameter selection using GridSearch

In [310]:
from sklearn.ensemble import RandomForestClassifier
parameters = {'n_estimators': [100, 250, 500, 750, 1000]}
rf_clf = RandomForestClassifier()
best_clf = grid_search.GridSearchCV(rf_clf, parameters)
acc, clf = get_acc(best_clf, train, test)
print('test accuracy is :' + str(acc))

test accuracy is :0.8409090909090909


### Random Forest self implemented parameter selection

In [316]:
from sklearn.ensemble import RandomForestClassifier
def best_RF_Accuracy(train, test, n_est = [100, 250, 500, 750, 1000], iters = 25):
    best_acc = 0
    best_n = None
    for n in n_est:
        mean_acc = 0
        for i in range(iters):
            train_small, validation = train_test_split(train, test_size = 0.1)
            curr_acc, clf = get_acc(RandomForestClassifier(n_estimators = n), train_small, validation)
            mean_acc += curr_acc
        curr_acc = mean_acc/(1.0*iters)
        if curr_acc > best_acc:
            best_acc = curr_acc
            best_n = n

    acc, best_clf = get_acc(RandomForestClassifier(n_estimators = best_n), train, test)
    return acc

In [311]:
print('test accuracy is :' + str(best_RF_Accuracy(train, test)))

test accuracy is :0.9318181818181818


In [341]:
print(best_n)

750


### Principal Component Analysis

Principal Component Analysis not only helps us find relations between our features but also reduces 
the dimensionality of our data to the desired output. 

In [314]:
from sklearn.decomposition import PCA
def create_pca_data(n_dim):
    pca = PCA(n_components=n_dim)
    pca.fit(scaled_data)
    pca_train, pca_test = train_test_split(scaled_data, test_size = 0.15, random_state = 30)
    return pca_train, pca_test

### Analysis of PCA

Here we see that lower dimensions produce better results for Random Forests running on our reduced dimension data. 

In [317]:
for i in range(1,10):
    pca_train, pca_test = create_pca_data(i)
    print(best_RF_Accuracy(train, test))

0.9090909090909091
0.9090909090909091
0.8636363636363636
0.8636363636363636
0.8863636363636364
0.8863636363636364
0.8409090909090909
0.8409090909090909
0.8636363636363636


## Result: The best model was a Random Forest without dimensionality reduction, with n_estimators around 750 

Accuracy level was 93.1%