## Feature Importance and RFE for college enrollment
#### (Using the logistic regression and random forest models trained in Step 2)

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics

In [None]:
# Load the feature matrix, the college-enrollment labels, and the per-sample weights.
X = np.loadtxt('dat1_collen.csv', delimiter=',')
collen = np.loadtxt('collen.csv', delimiter=',')
weights = np.loadtxt('weights4_collen.csv', delimiter=',')

# Rebuild the 5-fold stratified train/test split (fixed seed) so the folds
# are meant to match the ones used in Step 2.
skf = StratifiedKFold(n_splits=5, random_state=666, shuffle=True)
fold_pairs = list(skf.split(X, collen))
train_indices = [fold_train for fold_train, _ in fold_pairs]
test_indices = [fold_test for _, fold_test in fold_pairs]

# Per-fold aliases used by the feature-importance cells.
train1, train2, train3, train4, train5 = train_indices
test1, test2, test3, test4, test5 = test_indices

#### Feature Importance -- logistic regression

In [None]:
# Feature importance from logistic regression.
# One model is refit per CV training fold with that fold's regularisation
# strength C (the same values the RFE cell lists as optimal_Regs); the
# standardised coefficients are then averaged across the five folds.
lr_Cs = [10, 1000, 1, .002, .01]
lr_trains = [train1, train2, train3, train4, train5]

# Per-feature standard deviation of X. Multiplying each raw coefficient by it
# gives the standardised coefficient, using the same scaling as the RFE cell
# (np.std(X_rfe, 0) * coefs). The original cell referenced coefs1_std..coefs5_std
# without ever defining them, which raised NameError on a fresh kernel.
feature_std = np.std(X, 0)

lr_models = []
for fold_train, fold_C in zip(lr_trains, lr_Cs):
    fold_lr = LogisticRegression(random_state=666, C=fold_C)
    fold_lr.fit(X[fold_train], collen[fold_train], sample_weight=weights[fold_train])
    lr_models.append(fold_lr)

# Keep the per-fold names so other cells can still refer to them.
lr1, lr2, lr3, lr4, lr5 = lr_models
coefs1, coefs2, coefs3, coefs4, coefs5 = [fold_lr.coef_ for fold_lr in lr_models]
coefs1_std, coefs2_std, coefs3_std, coefs4_std, coefs5_std = [
    feature_std * fold_coefs for fold_coefs in (coefs1, coefs2, coefs3, coefs4, coefs5)
]

#calculate the average
coefs_collen = np.concatenate(([coefs1_std], [coefs2_std], [coefs3_std], [coefs4_std], [coefs5_std]), axis=0)
print(coefs_collen)
coefs_collen_avg = np.mean(coefs_collen, axis=0)
coefs_collen_avg

#### Feature Importance -- random forests

In [None]:
# Feature importance from random forests.
# One forest is refit per CV training fold with that fold's
# (n_estimators, max_depth) pair; the importances are averaged across folds.
rf_settings = [(400, 7), (800, 10), (900, 10), (400, 11), (700, 20)]
rf_trains = (train1, train2, train3, train4, train5)

rf_models = []
for fold_train, (n_trees, depth) in zip(rf_trains, rf_settings):
    forest = RandomForestClassifier(random_state=666, n_estimators=n_trees, max_depth=depth)
    forest.fit(X[fold_train], collen[fold_train], sample_weight=weights[fold_train])
    rf_models.append(forest)

# Per-fold names, in fold order CV1..CV5.
rf1, rf2, rf3, rf4, rf5 = rf_models
imps1, imps2, imps3, imps4, imps5 = [forest.feature_importances_ for forest in rf_models]

# Stack the five importance vectors row-wise and average over folds.
imps_collen = np.vstack([imps1, imps2, imps3, imps4, imps5])
imps_collen_avg = imps_collen.mean(axis=0)
imps_collen_avg

#### Recursive feature elimination -- logistic regression

In [None]:
# Names of the 53 predictors. The RFE cells look names up by column position
# in X, so this order is assumed to match the columns of X
# -- TODO confirm against dat1_collen.csv.
feature_names = [
    "Mother", "Father", "TwoBioParent", "HHsize", "SibNum", "BirthOrder",    # 0-5
    "momeduc", "dadeduc", "momjob", "dadjob", "faminc", "PAassistance",      # 6-11
    "welfare", "PAecohard", "minvolve", "dinvolve", "PAPTA", "mexp",         # 12-17
    "dexp", "mactiv", "dactiv", "control", "mspv", "dspv",                   # 18-23
    "mrel", "drel", "famsup", "dinner", "PAclosure", "mnativity",            # 24-29
    "dnativity", "PAage", "PAhealth", "PAsmoke", "malcohol", "dalcohol",     # 30-35
    "mobese", "dobese", "mdisable", "ddisable", "PArelig", "HHsmoke",        # 36-41
    "HHdrug", "fammed", "EnglishHome", "biosex", "YAge", "Latino",           # 42-47
    "AA", "Native", "Asian", "other_race", "nativity",                       # 48-52
]

In [None]:
# Recursive feature elimination (RFE) with logistic regression.
# Each round: fit one model per CV fold (with that fold's C), record the mean
# weighted test accuracy, rank the features by the absolute value of their
# fold-averaged standardised coefficient, and drop the lowest-ranked feature.
# The rounds repeat until no feature is left.
optimal_Regs = [10, 1000, 1, 0.002, 0.01]

# Column positions of X are mapped back to names below, so the two must line up.
assert len(feature_names) == X.shape[1], (
    "feature_names must have one entry per column of X; "
    "re-run the cell that defines feature_names if it has been modified"
)

# Eliminate from a copy. Popping from feature_names itself would leave it
# empty for any later cell that reads it (e.g. the random-forest RFE) and
# would make this cell do nothing when it is re-run.
remaining_names = list(feature_names)
X_rfe = X  # np.delete below returns a new array, so X itself is never modified
accuracies = []         # mean CV accuracy measured before each elimination
feature_eliminate = []  # feature names in the order they were removed
while len(remaining_names) > 0:  # stop once all features are eliminated
    accuracy = []
    coefs_all = []
    # Column std of the current feature set; it is the same for every fold,
    # so compute it once per round instead of once per fold.
    feature_std = np.std(X_rfe, 0)
    for i in range(5):  # 5-fold cross-validation for this round
        train = train_indices[i]
        test = test_indices[i]
        lr = LogisticRegression(random_state = 666, C = optimal_Regs[i])
        lr.fit(X_rfe[train], collen[train], sample_weight = weights[train])
        acc = lr.score(X_rfe[test], collen[test], sample_weight=weights[test])
        accuracy.append(acc)
        coefs = lr.coef_
        coefs_std = feature_std*coefs  # standardised coefficients for this fold
        coefs_all.append(coefs_std)
    accuracy_avg = np.mean(accuracy)
    print(accuracy_avg)
    accuracies.append(accuracy_avg)
    coefs_all_arr = np.concatenate((coefs_all), axis=0)
    coefs_all_avg = np.mean(coefs_all_arr, axis = 0)
    print(coefs_all_avg)
    coefs_all_avg_abs = np.absolute(coefs_all_avg)
    least_imp_ind = np.argmin(coefs_all_avg_abs)  # smallest |coefficient| = least important
    print(remaining_names[least_imp_ind], least_imp_ind)
    feature_eliminate.append(remaining_names.pop(least_imp_ind))
    X_rfe = np.delete(X_rfe, least_imp_ind, 1)  # drop the matching column
    print(X_rfe.shape)

In [None]:
print(feature_eliminate)
# Feature names in the order they were eliminated (first entry = removed first).

In [None]:
#plot the RFE curve
eli = range(53)
plt.rcParams["font.family"] = "Times New Roman"
plt.rcParams.update({'font.size': 16})
plt.plot(eli, accuracies)
plt.ylabel('Accuracy')
plt.xlabel('Number of features eliminated')
plt.tight_layout()
plt.savefig('collen_lr_rfe', dpi=1000)
plt.show()

#### Recursive feature elimination -- random forests

In [None]:
# Recursive feature elimination (RFE) with random forests.
# Each round: fit one forest per CV fold (with that fold's n_estimators and
# max_depth), record the mean weighted test accuracy, average the feature
# importances over the folds, and drop the feature with the lowest average.
# The rounds repeat until no feature is left.

# Column positions of X are mapped back to names below, so the two must line up.
assert len(feature_names) == X.shape[1], (
    "feature_names must have one entry per column of X; "
    "re-run the cell that defines feature_names if it has been modified"
)

# Eliminate from a copy so the shared feature_names list is not consumed and
# this cell gives the same result when it is re-run.
remaining_names = list(feature_names)
X_rfe = X  # np.delete below returns a new array, so X itself is never modified
accuracies = []         # mean CV accuracy measured before each elimination
feature_eliminate = []  # feature names in the order they were removed
ntree_rfe = [400, 800, 900, 400, 700]
depths_rfe = [7, 10, 10, 11, 20]
while len(remaining_names) > 0:  # stop once all features are eliminated
    accuracy = []
    imps_all = []
    for i in range(5):  # 5-fold cross-validation for this round
        train = train_indices[i]
        test = test_indices[i]
        rf = RandomForestClassifier(random_state = 666, n_estimators=ntree_rfe[i], max_depth = depths_rfe[i])
        rf.fit(X_rfe[train], collen[train], sample_weight = weights[train])
        acc = rf.score(X_rfe[test], collen[test], sample_weight=weights[test])
        accuracy.append(acc)
        imps = rf.feature_importances_
        imps_all.append(imps)
        print(i, acc)
    accuracy_avg = np.mean(accuracy)
    print('accuracy', accuracy_avg)
    accuracies.append(accuracy_avg)
    imps_all_avg = np.mean(imps_all, axis = 0)
    print('imps_all_avg', imps_all_avg)
    least_imp_ind = np.argmin(imps_all_avg)  # lowest average importance
    print(remaining_names[least_imp_ind], least_imp_ind)
    feature_eliminate.append(remaining_names.pop(least_imp_ind))
    X_rfe = np.delete(X_rfe, least_imp_ind, 1)  # drop the matching column
    print(X_rfe.shape)

In [None]:
print(feature_eliminate)
# Feature names in the order they were eliminated (first entry = removed first).

In [None]:
#plot
plt.rcParams["font.family"] = "Times New Roman"
plt.rcParams.update({'font.size': 16})
eli = range(53)
plt.plot(eli, accuracies)
plt.ylabel('Accuracy')
plt.xlabel('Number of features eliminated')
plt.tight_layout()
plt.savefig('collen_rf_rfe', dpi=1000)
plt.show()