### Partial dependence plots for a random forest model predicting college enrollment

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
#from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.externals.six import StringIO  
from sklearn.tree import export_graphviz

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics

from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
%matplotlib inline
from matplotlib.ticker import FormatStrFormatter

In [None]:
# Setup: feature labels, input data, cross-validation folds and the fitted forest.

# Labels later handed to skater as the feature names for X.
feature_names = [
    'Mother', 'Father', 'TwoBioParent', 'HHsize', 'SibNum', 'BirthOrder',
    'momeduc', 'dadeduc', 'momjob', 'dadjob', 'faminc', 'PAassistance',
    'welfare', 'PAecohard', 'minvolve', 'dinvolve', 'PAPTA', 'mexp', 'dexp',
    'mactiv', 'dactiv', 'control', 'mspv', 'dspv', 'mrel', 'drel', 'famsup',
    'dinner', 'PAclosure', 'mnativity', 'dnativity', 'PAage', 'PAhealth',
    'PAsmoke', 'malcohol', 'dalcohol', 'mobese', 'dobese', 'mdisable',
    'ddisable', 'PArelig', 'HHsmoke', 'HHdrug', 'fammed', 'EnglishHome',
    'biosex', 'YAge', 'Latino', 'AA', 'Native', 'Asian', 'other_race',
    'nativity',
]

# Predictors, outcome and per-row sample weights, each read from a
# comma-delimited text file.
X = np.loadtxt('dat1_collen.csv', delimiter=',')
collen = np.loadtxt('collen.csv', delimiter=',')
weights = np.loadtxt('weights4_collen.csv', delimiter=',')

# Seeded, shuffled 5-fold stratified split on the outcome.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=666)
skf.get_n_splits(X, collen)  # return value is not stored

# Collect the (train, test) index arrays of every fold.
folds = list(skf.split(X, collen))
train_indices = [fold[0] for fold in folds]
test_indices = [fold[1] for fold in folds]

# Fourth fold (index 3). Original author's note: using the model that has the
# highest prediction among the 5 models.
train4 = train_indices[3]
test4 = test_indices[3]

# 400-tree forest, depth capped at 11, fitted on the fold's training rows
# with the matching sample weights.
rf4 = RandomForestClassifier(n_estimators=400, max_depth=11, random_state=666)
rf4.fit(X[train4], collen[train4], sample_weight=weights[train4])

#### 2D PDP for nonlinear effects

In [None]:
# Single-feature partial dependence plot, using the fitted forest rf4.
plt.rcParams.update({"font.family": "Times New Roman", 'font.size': 18})

# Wrap the forest's probability output for skater; the fold's training rows
# serve as the example data.
pyint_model = InMemoryModel(
    rf4.predict_proba,
    examples=X[train4],
    target_names=['No College Enrollment', 'College Enrollment Probability'],
)

# The interpreter works on the held-out rows of the same fold.
interpreter = Interpretation(X[test4], feature_names=feature_names)

axes_list = interpreter.partial_dependence.plot_partial_dependence(
    ['mactiv'],  # here, plug in the selected variable you're interested in looking!
    pyint_model,
    grid_resolution=30,
    with_variance=False,
    figsize=(5, 5),
)
ax = axes_list[0][1]

# Optional label, e.g.:
#plt.xlabel('Mother-Adolescent Shared Activities')
plt.ylabel(' ')  # y label is set to a single space
# Optional fixed ticks / limits:
#plt.yticks([0.60, 0.62, 0.64, 0.66, 0.68])
#ax.set_ylim(0.63, 0.66)
ax.get_legend().remove()
plt.autoscale(False)

# Two decimals on the y ticks, whole numbers on the x ticks.
ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))
# Alternative x format and label placement:
#ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
#ax.xaxis.set_label_coords(0.5, 0.06)
#ax.yaxis.get_offset_text().set_visible(False)

# Uncomment to save the figure:
#plt.savefig('faminc_collen_c.png', dpi=300)
#plt.savefig('mattach.png', dpi=300)

#### 3D PDP for interaction effects

In [None]:
# Two-feature partial dependence plot for a pair of predictors.
# Relies on `interpreter`, `rf4`, `X` and `train4` created in earlier cells.
plt.rcParams.update({"font.family": "Times New Roman", 'font.size': 16})

model = InMemoryModel(
    rf4.predict_proba,
    examples=X[train4],
    target_names=['No College Enrollment', 'College Enrollment Probability'],
)

axes_list = interpreter.partial_dependence.plot_partial_dependence(
    [('momjob', 'dinner')],  # here, plug in the two selected variables you're interested in looking!
    model,
    grid_resolution=10,
)

# First returned item; it exposes an `.axes` sequence (presumably the
# matplotlib Figure -- confirm against skater's return value).
pdp_container = axes_list[0][0]
ax = pdp_container.axes[0]

# Optional axis labels / tweaks:
#plt.ylabel('Shared Dinner With Parents')
#plt.xlabel('Mother Occupational Prestige')
#ax.set_ylabel('Shared Dinner With Parents')
#ax.set_xlabel('Mother Occupational Prestige')
#ax.invert_yaxis()
#ax.get_legend().remove()
plt.autoscale(False)
#ax.set_ylim(0.50, 0.75)

# Same tick position for the x and y axes of the first panel.
for _axis in (ax.xaxis, ax.yaxis):
    _axis.set_ticks_position('top')

# One decimal on y, two decimals on z.
ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
#ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
ax.zaxis.set_major_formatter(FormatStrFormatter('%.2f'))
#plt.savefig('PAage.png', dpi=300)

# Second panel: two decimals on its x ticks.
ax2 = pdp_container.axes[1]
ax2.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
#plt.savefig('dexp_Ymomjob_re_collen.png', dpi=300)
