# use students' enrollment characteristics and assignment performance to predict final exam failed rate

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
%matplotlib inline
import seaborn as sns
sns.set_style ('darkgrid')

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score

In [20]:
df=pd.read_csv('ACX5903_202002.csv')
df.head()

Unnamed: 0,Student_ID,Surname,Given_Name,Title,Unit_Location,Class,assignment1,assignment2,assignment3,final
0,31334733,ARORA,AYUSH,MR,CLAYTON,ON-CAMPUS,9.33,11.2,17.75,81.78
1,32028725,ARTAHSASTA,KEVIN RITTAR,MR,CLAYTON,ON-CAMPUS,7.21,12.0,17.88,71.34
2,31070272,BAI,XUE,MS,CLAYTON,ON-CAMPUS,9.95,14.0,17.5,70.2
3,30268281,CAO,YU,MR,CLAYTON,ON-CAMPUS,9.81,12.17,17.88,59.86
4,31011195,CAO,JINGRU,MS,CLAYTON,ON-CAMPUS,8.25,7.25,12.5,59.625


In [21]:
df.Title.replace('MISS','MS',inplace=True)

In [22]:
df['gender']=pd.get_dummies(df.Title).MR

In [23]:
df.head()

Unnamed: 0,Student_ID,Surname,Given_Name,Title,Unit_Location,Class,assignment1,assignment2,assignment3,final,gender
0,31334733,ARORA,AYUSH,MR,CLAYTON,ON-CAMPUS,9.33,11.2,17.75,81.78,1
1,32028725,ARTAHSASTA,KEVIN RITTAR,MR,CLAYTON,ON-CAMPUS,7.21,12.0,17.88,71.34,1
2,31070272,BAI,XUE,MS,CLAYTON,ON-CAMPUS,9.95,14.0,17.5,70.2,0
3,30268281,CAO,YU,MR,CLAYTON,ON-CAMPUS,9.81,12.17,17.88,59.86,1
4,31011195,CAO,JINGRU,MS,CLAYTON,ON-CAMPUS,8.25,7.25,12.5,59.625,0


In [24]:
df=pd.get_dummies(df,columns=['Title','Unit_Location','Class'])

In [25]:
df.head()

Unnamed: 0,Student_ID,Surname,Given_Name,assignment1,assignment2,assignment3,final,gender,Title_MR,Title_MRS,Title_MS,Unit_Location_CAULFIELD,Unit_Location_CLAYTON,Class_EVENING,Class_ON-CAMPUS
0,31334733,ARORA,AYUSH,9.33,11.2,17.75,81.78,1,1,0,0,0,1,0,1
1,32028725,ARTAHSASTA,KEVIN RITTAR,7.21,12.0,17.88,71.34,1,1,0,0,0,1,0,1
2,31070272,BAI,XUE,9.95,14.0,17.5,70.2,0,0,0,1,0,1,0,1
3,30268281,CAO,YU,9.81,12.17,17.88,59.86,1,1,0,0,0,1,0,1
4,31011195,CAO,JINGRU,8.25,7.25,12.5,59.625,0,0,0,1,0,1,0,1


In [26]:
df.dtypes

Student_ID                   int64
Surname                     object
Given_Name                  object
assignment1                float64
assignment2                float64
assignment3                float64
final                      float64
gender                       uint8
Title_MR                     uint8
Title_MRS                    uint8
Title_MS                     uint8
Unit_Location_CAULFIELD      uint8
Unit_Location_CLAYTON        uint8
Class_EVENING                uint8
Class_ON-CAMPUS              uint8
dtype: object

In [27]:
df.isnull().sum()

Student_ID                 0
Surname                    0
Given_Name                 0
assignment1                0
assignment2                0
assignment3                0
final                      0
gender                     0
Title_MR                   0
Title_MRS                  0
Title_MS                   0
Unit_Location_CAULFIELD    0
Unit_Location_CLAYTON      0
Class_EVENING              0
Class_ON-CAMPUS            0
dtype: int64

In [28]:
df['fail']=(df.final<45).astype(int)
df['d_hd']=(df.final>=70).astype(int)
df['hd']=(df.final>=80).astype(int)

In [29]:
df['a1_ratio']=df.assignment1/10
df['a2_ratio']=df.assignment2/15
df['a3_ratio']=df.assignment3/25
df['final_ratio']=df.final/50


In [30]:
df.head(40)

Unnamed: 0,Student_ID,Surname,Given_Name,assignment1,assignment2,assignment3,final,gender,Title_MR,Title_MRS,...,Unit_Location_CLAYTON,Class_EVENING,Class_ON-CAMPUS,fail,d_hd,hd,a1_ratio,a2_ratio,a3_ratio,final_ratio
0,31334733,ARORA,AYUSH,9.33,11.2,17.75,81.78,1,1,0,...,1,0,1,0,1,1,0.933,0.746667,0.71,1.6356
1,32028725,ARTAHSASTA,KEVIN RITTAR,7.21,12.0,17.88,71.34,1,1,0,...,1,0,1,0,1,0,0.721,0.8,0.7152,1.4268
2,31070272,BAI,XUE,9.95,14.0,17.5,70.2,0,0,0,...,1,0,1,0,1,0,0.995,0.933333,0.7,1.404
3,30268281,CAO,YU,9.81,12.17,17.88,59.86,1,1,0,...,1,0,1,0,0,0,0.981,0.811333,0.7152,1.1972
4,31011195,CAO,JINGRU,8.25,7.25,12.5,59.625,0,0,0,...,1,0,1,0,0,0,0.825,0.483333,0.5,1.1925
5,31265294,CHANDE,SHREYA,9.48,11.72,17.63,86.455,0,0,0,...,1,0,1,0,1,1,0.948,0.781333,0.7052,1.7291
6,29966981,CHEN,SIXU,9.36,11.0,11.88,61.615,1,1,0,...,1,0,1,0,0,0,0.936,0.733333,0.4752,1.2323
7,30155665,CHEN,LENING,10.0,11.4,17.75,68.4,1,1,0,...,1,0,1,0,0,0,1.0,0.76,0.71,1.368
8,30701023,CHEN,JINJUN,9.95,11.8,19.38,75.63,1,1,0,...,1,0,1,0,1,0,0.995,0.786667,0.7752,1.5126
9,30385288,CHENG,JINYI,9.68,11.7,22.13,66.01,0,0,0,...,1,0,1,0,0,0,0.968,0.78,0.8852,1.3202


In [31]:
df.drop(['assignment1','assignment2','assignment3','final','Student_ID','Surname','Title_MR','Title_MS','Class_EVENING','Given_Name','final_ratio'],axis=1,inplace=True)

In [32]:
df.head()

Unnamed: 0,gender,Title_MRS,Unit_Location_CAULFIELD,Unit_Location_CLAYTON,Class_ON-CAMPUS,fail,d_hd,hd,a1_ratio,a2_ratio,a3_ratio
0,1,0,0,1,1,0,1,1,0.933,0.746667,0.71
1,1,0,0,1,1,0,1,0,0.721,0.8,0.7152
2,0,0,0,1,1,0,1,0,0.995,0.933333,0.7
3,1,0,0,1,1,0,0,0,0.981,0.811333,0.7152
4,0,0,0,1,1,0,0,0,0.825,0.483333,0.5


In [33]:
df.to_csv('cohort_2020.csv',index=None)

In [34]:
df2=pd.read_csv('cohort_2020.csv')

In [35]:
df2.head()

Unnamed: 0,gender,Title_MRS,Unit_Location_CAULFIELD,Unit_Location_CLAYTON,Class_ON-CAMPUS,fail,d_hd,hd,a1_ratio,a2_ratio,a3_ratio
0,1,0,0,1,1,0,1,1,0.933,0.746667,0.71
1,1,0,0,1,1,0,1,0,0.721,0.8,0.7152
2,0,0,0,1,1,0,1,0,0.995,0.933333,0.7
3,1,0,0,1,1,0,0,0,0.981,0.811333,0.7152
4,0,0,0,1,1,0,0,0,0.825,0.483333,0.5


In [36]:
df2.drop(['hd','d_hd'], axis=1, inplace=True)

In [37]:
df2.head()

Unnamed: 0,gender,Title_MRS,Unit_Location_CAULFIELD,Unit_Location_CLAYTON,Class_ON-CAMPUS,fail,a1_ratio,a2_ratio,a3_ratio
0,1,0,0,1,1,0,0.933,0.746667,0.71
1,1,0,0,1,1,0,0.721,0.8,0.7152
2,0,0,0,1,1,0,0.995,0.933333,0.7
3,1,0,0,1,1,0,0.981,0.811333,0.7152
4,0,0,0,1,1,0,0.825,0.483333,0.5


In [49]:
y=df2.fail
X=df2.drop('fail',axis=1)

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, stratify=df2.fail, random_state=1111)

print(len(X_train),len(X_test), len(y_train), len(y_test))

233 101 233 101


In [50]:
X_train.head()

Unnamed: 0,gender,Title_MRS,Unit_Location_CAULFIELD,Unit_Location_CLAYTON,Class_ON-CAMPUS,a1_ratio,a2_ratio,a3_ratio
219,0,0,1,0,1,0.997,0.82,0.6452
165,1,0,1,0,1,0.979,0.88,0.7152
154,0,0,1,0,1,0.989,0.84,0.83
313,1,0,1,0,1,0.653,0.38,0.4
65,1,0,0,1,1,0.783,0.773333,0.74


In [51]:
y_train.head()

219    1
165    1
154    0
313    0
65     0
Name: fail, dtype: int64

In [52]:
pipelines={'l1':make_pipeline(StandardScaler(), LogisticRegression(penalty='l1', random_state=111)),
           'l2': make_pipeline(StandardScaler(),LogisticRegression(penalty='l2' , random_state=111)),
          'rf': make_pipeline(StandardScaler(), RandomForestClassifier(random_state=111)),
          'gb': make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=111))}

In [53]:
l1_hyperparameters={'logisticregression__C':[0.5,1,10,100,200]}
l2_hyperparameters={'logisticregression__C':[0.5,1,10,100,200]}
rf_hyperparameters={'randomforestclassifier__n_estimators':[1,2,5,10,20],
                   'randomforestclassifier__min_samples_leaf':[1,2,5,10,20]}
gb_hyperparameters={'gradientboostingclassifier__n_estimators':[1,2,5,10,20],
                   'gradientboostingclassifier__min_samples_leaf':[1,2,5,10,20]}

hyperparameters={'l1':l1_hyperparameters,
                 'l2':l2_hyperparameters,
                 'rf':rf_hyperparameters,
                 'gb':gb_hyperparameters}

In [54]:
fitted_models={}

for name, pipeline in pipelines.items():
    model=GridSearchCV(pipeline, hyperparameters[name],cv=10, n_jobs=-1)
    model.fit(X_train, y_train)
    fitted_models[name]=model
    
    print(name,'has been fitted')



l1 has been fitted




l2 has been fitted




rf has been fitted
gb has been fitted




In [55]:
for name, model in fitted_models.items():
    print(name, model.best_score_)

l1 0.9399141630901288
l2 0.9399141630901288
rf 0.9356223175965666
gb 0.9356223175965666


In [56]:
for name, model in fitted_models.items():
    pred=model.predict_proba(X_test)
    pred=[p[1] for p in pred]
    
    print(name, roc_auc_score(y_test, pred))

l1 0.46842105263157896
l2 0.46842105263157896
rf 0.6885964912280702
gb 0.4736842105263158


In [57]:
type(fitted_models['rf'].best_estimator_)

sklearn.pipeline.Pipeline

In [58]:
pipelines['rf']

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators='warn', n_jobs=None,
                                        oob_score=False, random_state=111,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [59]:
import pickle 

with open ('final_model.pkl','wb') as f:
    pickle.dump(fitted_models['rf'].best_estimator_, f)