In [17]:
# Imports for data-preprocessing
import pandas as pd
import numpy as np
from __future__ import print_function

# Import feature selection/libraries
from sklearn import metrics as mt
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit


# Import models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Viz libraries
import matplotlib.pyplot as plt
from string import ascii_letters
import seaborn as sns
%matplotlib inline

In [18]:
# Import data 
df = pd.read_csv("D://DATAMINING//master.csv")

In [19]:
bins = [17, 34, 49, 64, 90]

group_names = ['17-34', '35-49', '50-64', '65+']

age_groups = pd.cut(df.age, bins, labels=group_names)

df['age_groups'] = age_groups.tolist()

In [20]:
if 'fnlwgt' in df:
    del df['fnlwgt']
if 'workclass' in df:
        del df['workclass']
if 'education' in df:
        del df['education']
if 'education_num' in df:
        del df['education_num']
if 'marital_status' in df:
        del df['marital_status']
if 'occupation' in df:
        del df['occupation']
if 'native_country' in df:
        del df['native_country'] 
if 'age' in df:
           del df['age'] 
del df['Unnamed: 0']

In [21]:
df.head(3)

Unnamed: 0,relationship,race,sex,capital_gain,capital_loss,hours_per_week,income_binary,condensed_education,continent,condensed_marital,condensed_workclass,age_groups
0,Not-in-family,White,Male,2174,0,40,<=50K,Bachelors,United States,Never,Government,35-49
1,Husband,White,Male,0,0,13,<=50K,Bachelors,United States,Married,Self-Employed,50-64
2,Not-in-family,White,Male,0,0,40,<=50K,High School Graduate,United States,Divorced/Separated,Private,35-49


In [22]:
#Split the data into traning (80%) and test set (20%)
# We are using stratified cross validation here because the majority of the
#    individuals in the variable race are white

if 'age_groups' in df:
    y = (df['age_groups']) #get values we need 
    del df['age_groups']        #get rid of the class label
    X = df.values               #use everything else to predict 
    
X = pd.get_dummies(df).values

scl = StandardScaler()
#X = scl.fit_transform(X)

# Split the data into 20% Test and 80% Train
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.20, random_state=111)
sss.get_n_splits(X, y) #retreving the splits

10

In [23]:
# Create a for loop that grabs the values for each fold for traing and test sets
y = y.fillna("")

for train_index, test_index in sss.split(X,y.tolist()):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    y.fillna("")

TRAIN: [ 8492   773 13118 ..., 35980 33511 35588] TEST: [19692 27047 14102 ...,  5491 27134 25133]
TRAIN: [35754 44603 33516 ...,  9966 39949 27930] TEST: [ 6576 30339 39221 ..., 26307 20282 14216]
TRAIN: [ 9692 36813  2848 ..., 16610 48026 24110] TEST: [17088 44904 33742 ..., 39410 34564 24713]
TRAIN: [18717 17405 28960 ..., 17899 20181 28803] TEST: [29770  5237 39991 ..., 19858 25079 39382]
TRAIN: [31784 43606 33743 ..., 43070 11410 40058] TEST: [ 5991  5420 37845 ..., 23131 38821 38455]
TRAIN: [10324 22662 46232 ..., 13002  5850 47428] TEST: [35217 41009 45992 ..., 19937 37685 43961]
TRAIN: [10245 34540 19934 ...,  4642 48455  1175] TEST: [46966 10993 27579 ..., 31110 48759 43327]
TRAIN: [39885  4332  3592 ..., 44157 11660 46607] TEST: [31259 23034  5609 ..., 20253 24091 12492]
TRAIN: [17705 32836 34332 ..., 30752 31903 12175] TEST: [24823 12082 42233 ..., 14850 38281 46484]
TRAIN: [ 9054 18693 13929 ...,  8118 28501 10876] TEST: [37717 40166 28516 ...,  4861 47589  9591]


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics as mt

# criterion : 'gini' > 'entropy'
# max_features : 0 -> 42 (default is fine)
# bootstrap : keep True
# max_depth : not material

RFC = RandomForestClassifier()

iteration = 1

for train_indices, test_indices in sss.split(X,y.tolist()): 
    X_train = X[train_indices]  # train indices for X
    y_train = y[train_indices]  # train indices for y
    
    X_test = X[test_indices]    # test indices for X
    y_test = y[test_indices]    # test indices for y
    
    # train the reusable KNN classifier on the training data
    RFC.fit(X_train,y_train)  # train object
    y_hat = RFC.predict(X_test) # get test set precitions
    
    print(mt.accuracy_score(y_hat,y_test))

0.551847681441
0.543760876241
0.556454089467
0.54703654417
0.549390930494
0.543556146996
0.541406489917
0.544784522469
0.549698024363
0.548264919644


In [26]:
# Classification with RandomForest

# No parameter adjustments
from sklearn.metrics import classification_report
RF = RandomForestClassifier(random_state=111)

# Now we want to iterate through and grab the prediction
iter_num=0
# The indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y.astype('str')):
    X_train = X[train_indices]  # train indices for X
    y_train = y[train_indices]  # train indices for y
    
    X_test = X[test_indices]    # test indices for X
    y_test = y[test_indices]    # test indices for y
    
 # Train the reusable RF classifier on the training data
RF.fit(X_train,y_train)  # train object
y_hat = RF.predict(X_test) # get test set precitions

# Accuracy for the iterations of training/testing
accuracy_RF = mt.accuracy_score(y_test,y_hat)
print('RF accuracy =', accuracy_RF)

# Metric report 
metrics_RF = classification_report(y_test,y_hat)
print('RF metric report')
print(metrics_RF)

RF accuracy = 0.548776742758
RF metric report
             precision    recall  f1-score   support

                  0.54      0.55      0.55       119
      17-34       0.68      0.66      0.67      4083
      35-49       0.49      0.60      0.54      3432
      50-64       0.34      0.23      0.28      1718
        65+       0.46      0.31      0.37       417

avg / total       0.54      0.55      0.54      9769



In [27]:
# Classification with RandomForest

# No parameter adjustments

RF = RandomForestClassifier(random_state=111)

# Now we want to iterate through and grab the prediction, just like we did in the KNN above
iter_num=0
# the indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  # train indices for X
    y_train = y[train_indices]  # train indices for y
    
    X_test = X[test_indices]    # test indices for X
    y_test = y[test_indices]    # test indices for y
    
    # train the reusable KNN classifier on the training data
    RF.fit(X_train,y_train)  # train object
    y_hat = RF.predict(X_test) # get test set precitions
    
    # Accuracy for the iterations of training/testing
    accuracy_RF = mt.accuracy_score(y_test,y_hat)    # obtain accuracies for each iteration 
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF accuracy =', accuracy_RF)

    # Metric report 
    metrics_RF = classification_report(y_test,y_hat)
    print('RF metric report')
    print(metrics_RF)
    iter_num+=1

----Iteration 0  ----
RF accuracy = 0.551950046064
RF metric report
             precision    recall  f1-score   support

                  0.53      0.58      0.56       119
      17-34       0.69      0.65      0.67      4083
      35-49       0.49      0.63      0.55      3432
      50-64       0.34      0.22      0.27      1718
        65+       0.48      0.32      0.38       417

avg / total       0.55      0.55      0.54      9769

----Iteration 1  ----
RF accuracy = 0.54591053332
RF metric report
             precision    recall  f1-score   support

                  0.49      0.46      0.47       119
      17-34       0.68      0.64      0.66      4083
      35-49       0.49      0.61      0.54      3432
      50-64       0.35      0.24      0.28      1718
        65+       0.46      0.37      0.41       417

avg / total       0.54      0.55      0.54      9769

----Iteration 2  ----
RF accuracy = 0.56024158051
RF metric report
             precision    recall  f1-score   suppo

In [28]:
# Create an object that holds the second RF model with parameter adjustments

RF2= RandomForestClassifier(max_depth=10, random_state=111)

# Now we want to iterate through and grab the prediction, just like we did in the RF above
# Now we want to iterate through and grab the prediction, just like we did in the KNN above
iter_num=0
# the indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  # train indices for X
    y_train = y[train_indices]  # train indices for y
    
    X_test = X[test_indices]    # test indices for X
    y_test = y[test_indices]    # test indices for y
    
    # train the reusable KNN classifier on the training data
    RF2.fit(X_train,y_train)  # train object
    y_hat = RF2.predict(X_test) # get test set precitions
    
    # Accuracy for the iterations of training/testing
    accuracy_RF2 = mt.accuracy_score(y_test,y_hat)   # obtain accuracies for each iteration 
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF2 accuracy =', accuracy_RF2)

    # Metric report 
    metrics_RF2 = classification_report(y_test,y_hat)
    print('RF2 metric report')
    print(metrics_RF2)
    iter_num+=1

----Iteration 0  ----
RF2 accuracy = 0.586958747057
RF2 metric report
             precision    recall  f1-score   support

                  0.64      0.39      0.49       119
      17-34       0.76      0.65      0.70      4083
      35-49       0.49      0.79      0.60      3432
      50-64       0.48      0.11      0.18      1718
        65+       0.57      0.26      0.36       417

avg / total       0.60      0.59      0.56      9769

----Iteration 1  ----
RF2 accuracy = 0.586651653189
RF2 metric report
             precision    recall  f1-score   support

                  0.60      0.24      0.35       119
      17-34       0.75      0.65      0.70      4083
      35-49       0.49      0.79      0.61      3432
      50-64       0.47      0.11      0.17      1718
        65+       0.60      0.35      0.45       417

avg / total       0.60      0.59      0.56      9769

----Iteration 2  ----
RF2 accuracy = 0.584911454601
RF2 metric report
             precision    recall  f1-score

In [29]:
# Get the mean accuracy

# Load the accuracies
accuracies_RF2 = cross_val_score(RF2, X, y=y, cv=sss) # this also can help with parallelism

# Print out the mean 
mean_RF2 = np.mean(accuracies_RF2)
print("The mean accuracy for this model is ", mean_RF2)

The mean accuracy for this model is  0.581994062852


In [30]:
# Create an object that holds the third RF model with parameter adjustments

RF3 = RandomForestClassifier(max_depth=20, random_state=111)

# Now we want to iterate through and grab the prediction, just like we did in the RF2 above
iter_num=0
# The indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  #train indices for X
    y_train = y[train_indices]  #train indices for y
    
    X_test = X[test_indices]    #test indices for X
    y_test = y[test_indices]    #test indices for y
    
    # Train the reusable KNN classifier on the training data
    RF3.fit(X_train,y_train)  # train object
    y_hat = RF3.predict(X_test) #get the test set predictions 
    
    # Accuracy for the iterations of training/testing
    accuracy_RF3= mt.accuracy_score(y_test,y_hat)    # obtain accuracies for each iteration 
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF3 accuracy =', accuracy_RF3)

    # Metric report 
    metrics_RF3 = classification_report(y_test,y_hat)
    print('RF3 metric report')
    print(metrics_RF3)
    iter_num+=1

----Iteration 0  ----
RF3 accuracy = 0.571092230525
RF3 metric report
             precision    recall  f1-score   support

                  0.57      0.63      0.60       119
      17-34       0.72      0.65      0.68      4083
      35-49       0.50      0.69      0.58      3432
      50-64       0.37      0.21      0.27      1718
        65+       0.51      0.33      0.40       417

avg / total       0.57      0.57      0.56      9769

----Iteration 1  ----
RF3 accuracy = 0.572320605999
RF3 metric report
             precision    recall  f1-score   support

                  0.52      0.50      0.51       119
      17-34       0.72      0.65      0.68      4083
      35-49       0.50      0.69      0.58      3432
      50-64       0.39      0.22      0.28      1718
        65+       0.50      0.38      0.43       417

avg / total       0.57      0.57      0.56      9769

----Iteration 2  ----
RF3 accuracy = 0.576927014024
RF3 metric report
             precision    recall  f1-score

In [31]:
# Get the mean accuracy

# Load the accuracies
accuracies_RF3 = cross_val_score(RF3, X, y=y, cv=sss) # this also can help with parallelism

# Print out the mean 
mean_RF3 = np.mean(accuracies_RF3)
print("The mean accuracy for this model is ", mean_RF3)

The mean accuracy for this model is  0.567263793633


In [32]:
# Create an object that holds the third RF model with parameter adjustments

RF4 = RandomForestClassifier(max_depth=100, random_state=111)

# Now we want to iterate through and grab the prediction, just like we did in the RF3 above
iter_num=0
# The indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  #train indices for X
    y_train = y[train_indices]  #train indices for y
    
    X_test = X[test_indices]    #test indices for X
    y_test = y[test_indices]    #test indices for y
    
    # Train the reusable KNN classifier on the training data
    RF4.fit(X_train,y_train)  # train object
    y_hat = RF4.predict(X_test) #get the test set predictions 
    
    # Accuracy for the iterations of training/testing
    accuracy_RF4= mt.accuracy_score(y_test,y_hat)    # obtain accuracies for each iteration 
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF4 accuracy =', accuracy_RF4)

    # Metric report 
    metrics_RF4 = classification_report(y_test,y_hat)
    print('RF4 metric report')
    print(metrics_RF4)
    iter_num+=1

----Iteration 0  ----
RF4 accuracy = 0.551950046064
RF4 metric report
             precision    recall  f1-score   support

                  0.53      0.58      0.56       119
      17-34       0.69      0.65      0.67      4083
      35-49       0.49      0.63      0.55      3432
      50-64       0.34      0.22      0.27      1718
        65+       0.48      0.32      0.38       417

avg / total       0.55      0.55      0.54      9769

----Iteration 1  ----
RF4 accuracy = 0.54591053332
RF4 metric report
             precision    recall  f1-score   support

                  0.49      0.46      0.47       119
      17-34       0.68      0.64      0.66      4083
      35-49       0.49      0.61      0.54      3432
      50-64       0.35      0.24      0.28      1718
        65+       0.46      0.37      0.41       417

avg / total       0.54      0.55      0.54      9769

----Iteration 2  ----
RF4 accuracy = 0.56024158051
RF4 metric report
             precision    recall  f1-score  

In [33]:
# Get the mean accuracy

# Load the accuracies
accuracies_RF4 = cross_val_score(RF4, X, y=y, cv=sss) # this also can help with parallelism

# Print out the mean 
mean_RF4 = np.mean(accuracies_RF4)
print("The mean accuracy for this model is ", mean_RF4)

The mean accuracy for this model is  0.547507421435


In [34]:
# Create an object that holds the third RF model with parameter adjustments

RF5 = RandomForestClassifier(max_depth=50, random_state=111)

# Now we want to iterate through and grab the prediction, just like we did in the RF4 above
iter_num=0
# The indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  #train indices for X
    y_train = y[train_indices]  #train indices for y
    
    X_test = X[test_indices]    #test indices for X
    y_test = y[test_indices]    #test indices for y
    
    # Train the reusable KNN classifier on the training data
    RF5.fit(X_train,y_train)  # train object
    y_hat = RF5.predict(X_test) #get the test set predictions 
    
    # Accuracy for the iterations of training/testing
    accuracy_RF5= mt.accuracy_score(y_test,y_hat)    # obtain accuracies for each iteration 
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF5 accuracy =', accuracy_RF5)

    # Metric report 
    metrics_RF5 = classification_report(y_test,y_hat)
    print('RF5 metric report')
    print(metrics_RF5)
    iter_num+=1

----Iteration 0  ----
RF5 accuracy = 0.552052410687
RF5 metric report
             precision    recall  f1-score   support

                  0.54      0.58      0.56       119
      17-34       0.69      0.65      0.67      4083
      35-49       0.49      0.63      0.55      3432
      50-64       0.35      0.22      0.27      1718
        65+       0.49      0.32      0.38       417

avg / total       0.55      0.55      0.54      9769

----Iteration 1  ----
RF5 accuracy = 0.54591053332
RF5 metric report
             precision    recall  f1-score   support

                  0.49      0.46      0.47       119
      17-34       0.68      0.64      0.66      4083
      35-49       0.49      0.61      0.54      3432
      50-64       0.35      0.24      0.28      1718
        65+       0.46      0.37      0.41       417

avg / total       0.54      0.55      0.54      9769

----Iteration 2  ----
RF5 accuracy = 0.56024158051
RF5 metric report
             precision    recall  f1-score  

In [35]:
# Get the mean accuracy

# Load the accuracies
accuracies_RF5 = cross_val_score(RF5, X, y=y, cv=sss) # this also can help with parallelism

# Print out the mean 
mean_RF5 = np.mean(accuracies_RF5)
print("The mean accuracy for this model is ", mean_RF5)

The mean accuracy for this model is  0.547517657897


In [36]:
# Create an object that holds the third RF model with parameter adjustments

RF6 = RandomForestClassifier(max_depth=25, random_state=111)

# Now we want to iterate through and grab the prediction, just like we did in the RF4 above
iter_num=0
# The indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  #train indices for X
    y_train = y[train_indices]  #train indices for y
    
    X_test = X[test_indices]    #test indices for X
    y_test = y[test_indices]    #test indices for y
    
    # Train the reusable KNN classifier on the training data
    RF6.fit(X_train,y_train)  # train object
    y_hat = RF6.predict(X_test) #get the test set predictions 
    
    # Accuracy for the iterations of training/testing
    accuracy_RF6= mt.accuracy_score(y_test,y_hat)    # obtain accuracies for each iteration 
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF6 accuracy =', accuracy_RF6)

    # Metric report 
    metrics_RF6 = classification_report(y_test,y_hat)
    print('RF6 metric report')
    print(metrics_RF6)
    iter_num+=1

----Iteration 0  ----
RF6 accuracy = 0.55952502815
RF6 metric report
             precision    recall  f1-score   support

                  0.55      0.62      0.58       119
      17-34       0.70      0.65      0.68      4083
      35-49       0.49      0.64      0.56      3432
      50-64       0.36      0.23      0.28      1718
        65+       0.47      0.33      0.39       417

avg / total       0.56      0.56      0.55      9769

----Iteration 1  ----
RF6 accuracy = 0.55399733852
RF6 metric report
             precision    recall  f1-score   support

                  0.48      0.43      0.45       119
      17-34       0.69      0.65      0.67      4083
      35-49       0.49      0.63      0.55      3432
      50-64       0.35      0.22      0.27      1718
        65+       0.48      0.36      0.41       417

avg / total       0.55      0.55      0.55      9769

----Iteration 2  ----
RF6 accuracy = 0.561777049852
RF6 metric report
             precision    recall  f1-score  

In [37]:
# Get the mean accuracy

# Load the accuracies
accuracies_RF6 = cross_val_score(RF6, X, y=y, cv=sss) # this also can help with parallelism

# Print out the mean 
mean_RF6 = np.mean(accuracies_RF6)
print("The mean accuracy for this model is ", mean_RF6)

The mean accuracy for this model is  0.554068993756


In [43]:

# Accuracies for comparision for each RF model 

# RF model 1
print("1. The mean accuracy for RF model 1 is ", accuracy_RF)
print('------------------------------------------------------------------------')

# RF model 2
print("2. The mean accuracy for RF model 2 is ", accuracy_RF2)
print('------------------------------------------------------------------------')

# RF model 3
print("3. The mean accuracy for RF model 3 is ", accuracy_RF3)
print('------------------------------------------------------------------------')

# RF model 4
print("4. The mean accuracy for RF model 4 is ", accuracy_RF4)
print('------------------------------------------------------------------------')

# RF model 5
print("5. The mean accuracy for RF model 5 is ", accuracy_RF5)
print('------------------------------------------------------------------------')

# RF model 6
print("6. The mean accuracy for RF model 6 is ",accuracy_RF6)
print('------------------------------------------------------------------------')

1. The mean accuracy for RF model 1 is  0.548776742758
------------------------------------------------------------------------
2. The mean accuracy for RF model 2 is  0.578155389497
------------------------------------------------------------------------
3. The mean accuracy for RF model 3 is  0.563414883816
------------------------------------------------------------------------
4. The mean accuracy for RF model 4 is  0.548776742758
------------------------------------------------------------------------
5. The mean accuracy for RF model 5 is  0.548776742758
------------------------------------------------------------------------
6. The mean accuracy for RF model 6 is  0.553383150783
------------------------------------------------------------------------


In [44]:
# Create an object that holds the second RF model with parameter adjustments

RF2= RandomForestClassifier(max_depth=10, random_state=111)

# Now we want to iterate through and grab the prediction, just like we did in the RF above
iter_num=0
# the indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  # train indices for X
    y_train = y[train_indices]  # train indices for y
    
    X_test = X[test_indices]    # test indices for X
    y_test = y[test_indices]    # test indices for y
    
    # train the reusable KNN classifier on the training data
    RF2.fit(X_train,y_train)  # train object
    y_hat = RF2.predict(X_test) # get test set precitions
    
    # Accuracy for the iterations of training/testing
    accuracy_RF2 = mt.accuracy_score(y_test,y_hat)   # obtain accuracies for each iteration 
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF2 accuracy =', accuracy_RF2)

    # Metric report 
    metrics_RF2 = classification_report(y_test,y_hat)
    print('RF2 metric report')
    print(metrics_RF2)
    iter_num+=1

----Iteration 0  ----
RF2 accuracy = 0.586958747057
RF2 metric report
             precision    recall  f1-score   support

                  0.64      0.39      0.49       119
      17-34       0.76      0.65      0.70      4083
      35-49       0.49      0.79      0.60      3432
      50-64       0.48      0.11      0.18      1718
        65+       0.57      0.26      0.36       417

avg / total       0.60      0.59      0.56      9769

----Iteration 1  ----
RF2 accuracy = 0.586651653189
RF2 metric report
             precision    recall  f1-score   support

                  0.60      0.24      0.35       119
      17-34       0.75      0.65      0.70      4083
      35-49       0.49      0.79      0.61      3432
      50-64       0.47      0.11      0.17      1718
        65+       0.60      0.35      0.45       417

avg / total       0.60      0.59      0.56      9769

----Iteration 2  ----
RF2 accuracy = 0.584911454601
RF2 metric report
             precision    recall  f1-score

In [45]:
# Create an object that holds the third RF model with parameter adjustments
# The warm_start parameter reuses the solution of the previous model called and adds more estimators

RF7 = RandomForestClassifier(max_depth=10, random_state=111, n_estimators=150, warm_start=True)

# Now we want to iterate through and grab the prediction, just like we did in the RF2 above
iter_num=0
# The indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  #train indices for X
    y_train = y[train_indices]  #train indices for y
    
    X_test = X[test_indices]    #test indices for X
    y_test = y[test_indices]    #test indices for y
    
    # Train the reusable RF classifier on the training data
    RF7.fit(X_train,y_train)  # train object
    y_hat = RF7.predict(X_test) #get the test set predictions 
    
    # Accuracy for the iterations of training/testing
    accuracy_RF7= mt.accuracy_score(y_test,y_hat)
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF7 accuracy =', accuracy_RF7)

    # Metric report 
    metrics_RF7 = classification_report(y_test,y_hat)
    print('RF7 metric report')
    print(metrics_RF7)
    iter_num+=1

----Iteration 0  ----
RF7 accuracy = 0.585628006961
RF7 metric report
             precision    recall  f1-score   support

                  0.67      0.02      0.03       119
      17-34       0.74      0.66      0.70      4083
      35-49       0.49      0.79      0.60      3432
      50-64       0.47      0.11      0.18      1718
        65+       0.62      0.28      0.39       417

avg / total       0.60      0.59      0.55      9769



  warn("Warm-start fitting without increasing n_estimators does not "


----Iteration 1  ----
RF7 accuracy = 0.602108711229
RF7 metric report
             precision    recall  f1-score   support

                  1.00      0.03      0.07       119
      17-34       0.75      0.67      0.71      4083
      35-49       0.50      0.81      0.62      3432
      50-64       0.57      0.14      0.22      1718
        65+       0.71      0.36      0.48       417

avg / total       0.63      0.60      0.57      9769

----Iteration 2  ----
RF7 accuracy = 0.602313440475
RF7 metric report
             precision    recall  f1-score   support

                  1.00      0.05      0.10       119
      17-34       0.75      0.67      0.71      4083
      35-49       0.50      0.81      0.62      3432
      50-64       0.59      0.13      0.22      1718
        65+       0.74      0.35      0.47       417

avg / total       0.64      0.60      0.57      9769

----Iteration 3  ----
RF7 accuracy = 0.595045552257
RF7 metric report
             precision    recall  f1-score

In [46]:
# Get the mean accuracy

# Load the accuracies
accuracies_RF7 = cross_val_score(RF7, X, y=y, cv=sss) # this also can help with parallelism

# Print out the mean 
mean_RF7 = np.mean(accuracies_RF7)
print("The mean accuracy for this model is ", mean_RF7)

The mean accuracy for this model is  0.581912171154


In [47]:
# Create an object that holds the third RF model with parameter adjustments
# The warm_start parameter reuses the solution of the previous model called and adds more estimators

RF8 = RandomForestClassifier(max_depth=10, random_state=111, n_estimators=200, warm_start=True)

# Now we want to iterate through and grab the prediction, just like we did in the RF2 above
iter_num=0
# The indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  #train indices for X
    y_train = y[train_indices]  #train indices for y
    
    X_test = X[test_indices]    #test indices for X
    y_test = y[test_indices]    #test indices for y
    
    # Train the reusable KNN classifier on the training data
    RF8.fit(X_train,y_train)  # train object
    y_hat = RF8.predict(X_test) #get the test set predictions 
    
    # Accuracy for the iterations of training/testing
    accuracy_RF8= mt.accuracy_score(y_test,y_hat)
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF8 accuracy =', accuracy_RF8)

    # Metric report 
    metrics_RF8 = classification_report(y_test,y_hat)
    print('RF8 metric report')
    print(metrics_RF8)
    iter_num+=1

----Iteration 0  ----
RF8 accuracy = 0.585320913092
RF8 metric report
             precision    recall  f1-score   support

                  0.50      0.01      0.02       119
      17-34       0.74      0.66      0.70      4083
      35-49       0.49      0.79      0.60      3432
      50-64       0.46      0.11      0.18      1718
        65+       0.61      0.28      0.38       417

avg / total       0.60      0.59      0.55      9769



  warn("Warm-start fitting without increasing n_estimators does not "


----Iteration 1  ----
RF8 accuracy = 0.603029992834
RF8 metric report
             precision    recall  f1-score   support

                  1.00      0.02      0.03       119
      17-34       0.75      0.67      0.71      4083
      35-49       0.50      0.81      0.62      3432
      50-64       0.57      0.14      0.22      1718
        65+       0.71      0.36      0.48       417

avg / total       0.63      0.60      0.57      9769

----Iteration 2  ----
RF8 accuracy = 0.602211075852
RF8 metric report
             precision    recall  f1-score   support

                  1.00      0.03      0.07       119
      17-34       0.75      0.67      0.71      4083
      35-49       0.50      0.81      0.62      3432
      50-64       0.58      0.13      0.21      1718
        65+       0.74      0.35      0.47       417

avg / total       0.64      0.60      0.57      9769

----Iteration 3  ----
RF8 accuracy = 0.596069198485
RF8 metric report
             precision    recall  f1-score

  'precision', 'predicted', average, warn_for)


In [48]:
# Get the mean accuracy

# Load the accuracies
accuracies_RF8 = cross_val_score(RF8, X, y=y, cv=sss) # this also can help with parallelism

# Print out the mean 
mean_RF8 = np.mean(accuracies_RF8)
print("The mean accuracy for this model is ", mean_RF8)

The mean accuracy for this model is  0.58207595455


In [60]:
# Create an object that holds the third RF model with parameter adjustments
# The warm_start parameter reuses the solution of the previous model called and adds more estimators

RF9 = RandomForestClassifier(max_depth=20, random_state=111, n_estimators=225, warm_start=True)

# Now we want to iterate through and grab the prediction, just like we did in the RF2 above
iter_num=0
# The indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  #train indices for X
    y_train = y[train_indices]  #train indices for y
    
    X_test = X[test_indices]    #test indices for X
    y_test = y[test_indices]    #test indices for y
    
    # Train the reusable KNN classifier on the training data
    RF9.fit(X_train,y_train)  # train object
    y_hat = RF9.predict(X_test) #get the test set predictions 
    
    # Accuracy for the iterations of training/testing
    accuracy_RF9= mt.accuracy_score(y_test,y_hat)
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF9 accuracy =', accuracy_RF9)

    # Metric report 
    metrics_RF9 = classification_report(y_test,y_hat)
    print('RF9 metric report')
    print(metrics_RF9)
    iter_num+=1

----Iteration 0  ----
RF9 accuracy = 0.574674992323
RF9 metric report
             precision    recall  f1-score   support

                  0.57      0.54      0.55       119
      17-34       0.72      0.66      0.69      4083
      35-49       0.50      0.70      0.58      3432
      50-64       0.39      0.19      0.25      1718
        65+       0.53      0.35      0.42       417

avg / total       0.57      0.57      0.56      9769



  warn("Warm-start fitting without increasing n_estimators does not "


----Iteration 1  ----
RF9 accuracy = 0.686354795783
RF9 metric report
             precision    recall  f1-score   support

                  0.66      0.56      0.61       119
      17-34       0.79      0.73      0.76      4083
      35-49       0.59      0.81      0.68      3432
      50-64       0.71      0.37      0.49      1718
        65+       0.85      0.63      0.72       417

avg / total       0.71      0.69      0.68      9769

----Iteration 2  ----
RF9 accuracy = 0.68369331559
RF9 metric report
             precision    recall  f1-score   support

                  0.65      0.56      0.60       119
      17-34       0.79      0.73      0.76      4083
      35-49       0.58      0.81      0.68      3432
      50-64       0.73      0.36      0.48      1718
        65+       0.84      0.62      0.71       417

avg / total       0.71      0.68      0.68      9769

----Iteration 3  ----
RF9 accuracy = 0.679086907565
RF9 metric report
             precision    recall  f1-score 

In [61]:
# Get the mean accuracy

# Load the accuracies
accuracies_RF9 = cross_val_score(RF9, X, y=y, cv=sss) # this also can help with parallelism

# Print out the mean 
mean_RF9 = np.mean(accuracies_RF9)
print("The mean accuracy for this model is ", mean_RF9)

The mean accuracy for this model is  0.571010338827


In [53]:
# Create an object that holds the third RF model with parameter adjustments
# The warm_start parameter reuses the solution of the previous model called and adds more estimators

RF10 = RandomForestClassifier(max_depth=30, random_state=111, n_estimators=400, warm_start=True)

# Now we want to iterate through and grab the prediction, just like we did in the RF2 above
iter_num=0
# The indices are the rows used for training and testing in each iteration
for train_indices, test_indices in sss.split(X,y): 
    X_train = X[train_indices]  #train indices for X
    y_train = y[train_indices]  #train indices for y
    
    X_test = X[test_indices]    #test indices for X
    y_test = y[test_indices]    #test indices for y
    
    # Train the reusable RF classifier on the training data
    RF10.fit(X_train,y_train)  # train object
    y_hat = RF10.predict(X_test) #get the test set predictions 
    
    # Accuracy for the iterations of training/testing
    accuracy_RF10= mt.accuracy_score(y_test,y_hat)
    print("----Iteration",iter_num," ----")          # print out each numbered interation 
    print('RF10 accuracy =', accuracy_RF10)

    # Metric report 
    metrics_RF10 = classification_report(y_test,y_hat)
    print('RF10 metric report')
    print(metrics_RF10)
    iter_num+=1

----Iteration 0  ----
RF10 accuracy = 0.557784829563
RF10 metric report
             precision    recall  f1-score   support

                  0.54      0.55      0.55       119
      17-34       0.70      0.65      0.67      4083
      35-49       0.49      0.65      0.56      3432
      50-64       0.36      0.21      0.27      1718
        65+       0.50      0.35      0.41       417

avg / total       0.56      0.56      0.55      9769



  warn("Warm-start fitting without increasing n_estimators does not "


----Iteration 1  ----
RF10 accuracy = 0.697000716552
RF10 metric report
             precision    recall  f1-score   support

                  0.67      0.60      0.63       119
      17-34       0.80      0.74      0.77      4083
      35-49       0.60      0.79      0.69      3432
      50-64       0.69      0.42      0.52      1718
        65+       0.81      0.65      0.72       417

avg / total       0.71      0.70      0.69      9769

----Iteration 2  ----
RF10 accuracy = 0.699252738254
RF10 metric report
             precision    recall  f1-score   support

                  0.66      0.58      0.62       119
      17-34       0.80      0.75      0.77      4083
      35-49       0.60      0.80      0.69      3432
      50-64       0.70      0.42      0.52      1718
        65+       0.83      0.65      0.73       417

avg / total       0.71      0.70      0.69      9769

----Iteration 3  ----
RF10 accuracy = 0.693417954755
RF10 metric report
             precision    recall  f1

In [56]:
# Get the mean accuracy

# Load the accuracies
accuracies_RF10 = cross_val_score(RF10, X, y=y, cv=sss) # this also can help with parallelism

# Print out the mean 
mean_RF10 = np.mean(accuracies_RF10)
print("The mean accuracy for this model is ", mean_RF10)

The mean accuracy for this model is  0.554150885454


In [73]:
# Accuracies for comparision for each RF model 

# RF model 1
print("1. The mean accuracy for RF model 1 is ", accuracy_RF)
print('------------------------------------------------------------------------')

# RF model 2
print("2. The mean accuracy for RF model 2 is ", accuracy_RF2)
print('------------------------------------------------------------------------')

# RF model 3
print("3. The mean accuracy for RF model 3 is ", accuracy_RF3)
print('------------------------------------------------------------------------')

# RF model 4
print("4. The mean accuracy for RF model 4 is ", accuracy_RF4)
print('------------------------------------------------------------------------')

# RF model 5
print("5. The mean accuracy for RF model 5 is ", accuracy_RF5)
print('------------------------------------------------------------------------')

# RF model 6
print("6. The mean accuracy for RF model 6 is ", accuracy_RF6)
print('------------------------------------------------------------------------')

# RF model 7
print("7. The mean accuracy for RF model 7 is ", accuracy_RF7)
print('------------------------------------------------------------------------')

# RF model 8
print("8. The mean accuracy for RF model 8 is ", accuracy_RF8)
print('------------------------------------------------------------------------')
# RF model 9
print("9. The mean accuracy for RF model 9 is ", accuracy_RF9)
print('------------------------------------------------------------------------')
# RF model 10
print("10. The mean accuracy for RF model 10 is ", accuracy_RF10)
print('------------------------------------------------------------------------')


1. The mean accuracy for RF model 1 is  0.548776742758
------------------------------------------------------------------------
2. The mean accuracy for RF model 2 is  0.578155389497
------------------------------------------------------------------------
3. The mean accuracy for RF model 3 is  0.563414883816
------------------------------------------------------------------------
4. The mean accuracy for RF model 4 is  0.548776742758
------------------------------------------------------------------------
5. The mean accuracy for RF model 5 is  0.548776742758
------------------------------------------------------------------------
6. The mean accuracy for RF model 6 is  0.553383150783
------------------------------------------------------------------------
7. The mean accuracy for RF model 7 is  0.599242501791
------------------------------------------------------------------------
8. The mean accuracy for RF model 8 is  0.599344866414
-------------------------------------------------

The Random Forest model with the highest predicted accuracy is model 10. Random Forest model 10 has a max tree depth 
of 30 and 400 estimators, which represents the number of trees in the forest. Model 10 has the highest mean 
predicted accuracy of 69.49% 