In [1]:
from sklearn.model_selection import cross_val_score
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.ensemble import ExtraTreesClassifier
>>> from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
from sklearn import tree
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
nurse_df = pd.read_csv('../output_data/model_data.csv')
nurse_df.head()

Unnamed: 0,Cum GPA,TEAS Count,MaxOfTestScore,failing grade count,Time to Test (days),Completion Time (days),PrecEducDesc,Grade,GradePass,HighSchool,Assoc,Bach,Masters
0,2.84,1.0,76.0,0,209.0,591.0,High School/GED/Some College,Pass,1.0,1.0,0.0,0.0,0.0
1,3.14,1.0,74.7,0,73.0,467.0,High School/GED/Some College,Pass,1.0,1.0,0.0,0.0,0.0
2,3.22,1.0,75.3,0,84.51,578.0,High School/GED/Some College,Pass,1.0,1.0,0.0,0.0,0.0
3,3.29,2.0,77.3,0,60.0,591.0,High School/GED/Some College,Fail,0.0,1.0,0.0,0.0,0.0
4,3.38,2.0,72.7,0,80.33,600.0,Associates,Pass,1.0,0.0,1.0,0.0,0.0


In [3]:
nurses_df = nurse_df.drop(['PrecEducDesc', 'Grade'], axis=1)
nurses_df.head()

Unnamed: 0,Cum GPA,TEAS Count,MaxOfTestScore,failing grade count,Time to Test (days),Completion Time (days),GradePass,HighSchool,Assoc,Bach,Masters
0,2.84,1.0,76.0,0,209.0,591.0,1.0,1.0,0.0,0.0,0.0
1,3.14,1.0,74.7,0,73.0,467.0,1.0,1.0,0.0,0.0,0.0
2,3.22,1.0,75.3,0,84.51,578.0,1.0,1.0,0.0,0.0,0.0
3,3.29,2.0,77.3,0,60.0,591.0,0.0,1.0,0.0,0.0,0.0
4,3.38,2.0,72.7,0,80.33,600.0,1.0,0.0,1.0,0.0,0.0


In [4]:
nurses_df_pass = nurses_df[nurses_df["GradePass"]== 1]
len(nurses_df_pass)

898

In [5]:
nurses_df_fail = nurses_df[nurses_df["GradePass"] != 1]
len(nurses_df_fail)

327

In [6]:
new_data = pd.concat([nurses_df_fail ,nurses_df_pass.sample(n=len(nurses_df_fail)) ],copy=True,ignore_index=True)
len(new_data)

654

In [7]:
target = new_data["GradePass"]
target_names = ["negative", "positive"]
target

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
649    1.0
650    1.0
651    1.0
652    1.0
653    1.0
Name: GradePass, Length: 654, dtype: float64

In [8]:
data = new_data.drop("GradePass", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,Cum GPA,TEAS Count,MaxOfTestScore,failing grade count,Time to Test (days),Completion Time (days),HighSchool,Assoc,Bach,Masters
0,3.29,2.0,77.3,0,60.0,591.0,1.0,0.0,0.0,0.0
1,2.78,2.0,74.0,0,369.0,465.0,1.0,0.0,0.0,0.0
2,2.79,3.0,74.0,0,116.0,591.0,1.0,0.0,0.0,0.0
3,3.27,2.0,68.7,1,124.58,578.0,0.0,1.0,0.0,0.0
4,2.76,2.0,73.3,0,158.0,460.0,1.0,0.0,0.0,0.0


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target)

In [10]:
rf = ExtraTreesClassifier(n_estimators=100, max_depth=4)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.6890243902439024

In [11]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.4631324556076205, 'Cum GPA'),
 (0.14422502612615185, 'TEAS Count'),
 (0.12324351316379133, 'Completion Time (days)'),
 (0.07981853191202615, 'MaxOfTestScore'),
 (0.07033204686857421, 'Time to Test (days)'),
 (0.05288630188197738, 'failing grade count'),
 (0.035637728499101545, 'Masters'),
 (0.0170462505026231, 'Bach'),
 (0.00970203149678448, 'HighSchool'),
 (0.003976113941349406, 'Assoc')]

In [12]:
result = new_data.values
pd.DataFrame(result)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,3.29,2.0,77.3,0.0,60.00,591.0,0.0,1.0,0.0,0.0,0.0
1,2.78,2.0,74.0,0.0,369.00,465.0,0.0,1.0,0.0,0.0,0.0
2,2.79,3.0,74.0,0.0,116.00,591.0,0.0,1.0,0.0,0.0,0.0
3,3.27,2.0,68.7,1.0,124.58,578.0,0.0,0.0,1.0,0.0,0.0
4,2.76,2.0,73.3,0.0,158.00,460.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
649,3.52,2.0,74.0,0.0,88.00,481.0,1.0,1.0,0.0,0.0,0.0
650,3.03,2.0,68.7,0.0,260.35,591.0,1.0,1.0,0.0,0.0,0.0
651,3.31,2.0,75.3,0.0,98.45,578.0,1.0,1.0,0.0,0.0,0.0
652,3.16,1.0,78.7,0.0,117.00,592.0,1.0,1.0,0.0,0.0,0.0


In [13]:
y_predict = rf.predict(X_test)
y_predict

array([1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 1., 1.,
       0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1.,
       1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 0., 0.,
       1., 0., 0., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0.,
       0., 0., 0., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0.,
       1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 1.])

In [14]:
y_test

649    1.0
515    1.0
52     0.0
270    0.0
580    1.0
      ... 
22     0.0
194    0.0
574    1.0
219    0.0
324    0.0
Name: GradePass, Length: 164, dtype: float64

In [15]:
 # Calculate classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         0.0       0.69      0.71      0.70        83
         1.0       0.69      0.67      0.68        81

    accuracy                           0.69       164
   macro avg       0.69      0.69      0.69       164
weighted avg       0.69      0.69      0.69       164



In [16]:
class_report = classification_report(y_test, y_predict, output_dict=True)
class_report_df = pd.DataFrame(class_report).transpose()
#class_report_df.to_csv('Outputs/extreme_trees_report.csv')
class_report_df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.686047,0.710843,0.698225,83.0
1.0,0.692308,0.666667,0.679245,81.0
accuracy,0.689024,0.689024,0.689024,0.689024
macro avg,0.689177,0.688755,0.688735,164.0
weighted avg,0.689139,0.689024,0.688851,164.0


In [17]:
gpa_range_df = new_data[:21].copy()
new_data[:21]

Unnamed: 0,Cum GPA,TEAS Count,MaxOfTestScore,failing grade count,Time to Test (days),Completion Time (days),GradePass,HighSchool,Assoc,Bach,Masters
0,3.29,2.0,77.3,0,60.0,591.0,0.0,1.0,0.0,0.0,0.0
1,2.78,2.0,74.0,0,369.0,465.0,0.0,1.0,0.0,0.0,0.0
2,2.79,3.0,74.0,0,116.0,591.0,0.0,1.0,0.0,0.0,0.0
3,3.27,2.0,68.7,1,124.58,578.0,0.0,0.0,1.0,0.0,0.0
4,2.76,2.0,73.3,0,158.0,460.0,0.0,1.0,0.0,0.0,0.0
5,2.63,1.0,75.3,1,106.33,591.0,0.0,1.0,0.0,0.0,0.0
6,2.98,2.0,73.3,0,88.0,591.0,0.0,1.0,0.0,0.0,0.0
7,2.55,1.0,71.3,0,242.34,599.0,0.0,0.0,0.0,1.0,0.0
8,2.84,5.0,71.3,0,307.0,578.0,0.0,1.0,0.0,0.0,0.0
9,2.91,1.0,68.0,0,52.32,591.0,0.0,1.0,0.0,0.0,0.0


In [18]:
nurse_df_modes = nurse_df.mode()
nurse_df_means = pd.DataFrame(nurse_df.mean())
nurse_df_means

Unnamed: 0,0
Cum GPA,3.173167
TEAS Count,1.410612
MaxOfTestScore,75.897143
failing grade count,0.143673
Time to Test (days),98.963657
Completion Time (days),531.066939
GradePass,0.733061
HighSchool,0.845714
Assoc,0.050612
Bach,0.096327


In [19]:
# Build test set of average students with varying GPAs
gpa_range_df.loc[:,'TEAS Count'] = nurse_df_modes.loc[0,'TEAS Count']
gpa_range_df.loc[:,'MaxOfTestScore'] = nurse_df_means.loc['MaxOfTestScore',0]
gpa_range_df.loc[:,'failing grade count'] = nurse_df_modes.loc[0,'failing grade count']
gpa_range_df.loc[:,'Time to Test (days)'] = nurse_df_means.loc['Time to Test (days)',0]
gpa_range_df.loc[:,'Completion Time (days)'] = nurse_df_means.loc['Completion Time (days)',0]
gpa_range_df.loc[:,'HighSchool'] = nurse_df_modes.loc[0,'HighSchool']
gpa_range_df.loc[:,'Assoc'] = nurse_df_modes.loc[0,'Assoc']
gpa_range_df.loc[:,'Bach'] = nurse_df_modes.loc[0,'Bach']
gpa_range_df.loc[:,'Masters'] = nurse_df_modes.loc[0,'Masters']
gpa_range_df

Unnamed: 0,Cum GPA,TEAS Count,MaxOfTestScore,failing grade count,Time to Test (days),Completion Time (days),GradePass,HighSchool,Assoc,Bach,Masters
0,3.29,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0
1,2.78,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0
2,2.79,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0
3,3.27,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0
4,2.76,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0
5,2.63,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0
6,2.98,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0
7,2.55,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0
8,2.84,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0
9,2.91,1.0,75.897143,0,98.963657,531.066939,0.0,1.0,0.0,0.0,0.0


In [20]:
# Set up gpa range
gpa_range = [2.0, 2.1, 2.2, 2.3, 2.4,
             2.5, 2.6, 2.7, 2.8, 2.9,
             3.0, 3.1, 3.2, 3.3, 3.4,
            3.5, 3.6, 3.7, 3.8, 3.9, 4.0]
gpa_range_df['Cum GPA'] = gpa_range
gpa_range_df.drop(['GradePass'], inplace=True, axis=1)
gpa_range_df

Unnamed: 0,Cum GPA,TEAS Count,MaxOfTestScore,failing grade count,Time to Test (days),Completion Time (days),HighSchool,Assoc,Bach,Masters
0,2.0,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0
1,2.1,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0
2,2.2,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0
3,2.3,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0
4,2.4,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0
5,2.5,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0
6,2.6,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0
7,2.7,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0
8,2.8,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0
9,2.9,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0


In [22]:
gpa_predict = rf.predict(gpa_range_df)
gpa_predict_prob = rf.predict_proba(gpa_range_df)
gpa_predict_prob0 = []
gpa_predict_prob1 = []
for i in range(len(gpa_predict_prob)):
    gpa_predict_prob0.append(gpa_predict_prob[i][0])
    gpa_predict_prob1.append(gpa_predict_prob[i][1])


[0.6260910622622956,
 0.6260910622622956,
 0.6260910622622956,
 0.6260910622622956,
 0.6213142765480099,
 0.6099025889964628,
 0.5972611435497371,
 0.5832949407215329,
 0.5659798996410119,
 0.5482432626434602,
 0.5064617939297217,
 0.474486653488267,
 0.44398600629621593,
 0.395548231733235,
 0.3789126081089861,
 0.3530453774019233,
 0.3364880034521328,
 0.3227610536252737,
 0.3031284786686195,
 0.2758383376976788,
 0.2518316182087856]

In [26]:
gpa_range_df['Prediction'] = gpa_predict
gpa_range_df['Prob0'] = gpa_predict_prob0
gpa_range_df['Prob1'] = gpa_predict_prob1
gpa_range_df

Unnamed: 0,Cum GPA,TEAS Count,MaxOfTestScore,failing grade count,Time to Test (days),Completion Time (days),HighSchool,Assoc,Bach,Masters,Prediction,Prob0,Prob1
0,2.0,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.626091,0.373909
1,2.1,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.626091,0.373909
2,2.2,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.626091,0.373909
3,2.3,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.626091,0.373909
4,2.4,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.621314,0.378686
5,2.5,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.609903,0.390097
6,2.6,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.597261,0.402739
7,2.7,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.583295,0.416705
8,2.8,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.56598,0.43402
9,2.9,1.0,75.897143,0,98.963657,531.066939,1.0,0.0,0.0,0.0,0.0,0.548243,0.451757


In [29]:
gpa_range_export = gpa_range_df[['Cum GPA', 'Prediction','Prob0','Prob1']]
gpa_range_export.to_csv('Outputs/extreme_trees_probas.csv')