In [9]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [10]:
stu_info = pd.read_csv('data/studentInfo.csv')

In [11]:
stu_info.head()

Unnamed: 0,code_module,code_presentation,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,AAA,2013J,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,AAA,2013J,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,AAA,2013J,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,AAA,2013J,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,AAA,2013J,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


# Preprocessing

In [12]:
stu_info.drop(['code_module', 'code_presentation'], axis=1, inplace=True)

In [13]:
stu_info.head()

Unnamed: 0,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,11391,M,East Anglian Region,HE Qualification,90-100%,55<=,0,240,N,Pass
1,28400,F,Scotland,HE Qualification,20-30%,35-55,0,60,N,Pass
2,30268,F,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,Y,Withdrawn
3,31604,F,South East Region,A Level or Equivalent,50-60%,35-55,0,60,N,Pass
4,32885,F,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,N,Pass


In [15]:
stu_info.final_result.replace(to_replace=dict(Pass=1, Distinction=1, Fail=0, Withdrawn=0), inplace=True)

In [24]:
stu_info['gender'] = (stu_info['gender']=='M').astype(int)

In [26]:
stu_info['disability'] = (stu_info['disability']=='Y').astype(int)

In [27]:
stu_info.head()

Unnamed: 0,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result
0,11391,1,East Anglian Region,HE Qualification,90-100%,55<=,0,240,0,1
1,28400,0,Scotland,HE Qualification,20-30%,35-55,0,60,0,1
2,30268,0,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,1,0
3,31604,0,South East Region,A Level or Equivalent,50-60%,35-55,0,60,0,1
4,32885,0,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,0,1


In [17]:
stu_info['region'].unique()

array(['East Anglian Region', 'Scotland', 'North Western Region',
       'South East Region', 'West Midlands Region', 'Wales',
       'North Region', 'South Region', 'Ireland', 'South West Region',
       'East Midlands Region', 'Yorkshire Region', 'London Region'],
      dtype=object)

In [18]:
stu_info['highest_education'].unique()

array(['HE Qualification', 'A Level or Equivalent', 'Lower Than A Level',
       'Post Graduate Qualification', 'No Formal quals'], dtype=object)

In [19]:
stu_info['imd_band'].unique()

array(['90-100%', '20-30%', '30-40%', '50-60%', '80-90%', '70-80%', nan,
       '60-70%', '40-50%', '10-20', '0-10%'], dtype=object)

In [20]:
stu_info['age_band'].unique()

array(['55<=', '35-55', '0-35'], dtype=object)

In [28]:
cat_vars = ['region','highest_education','imd_band','age_band']
for var in cat_vars:
    cat_list = 'var' + '_' + var
    cat_list = pd.get_dummies(stu_info[var], prefix=var)
    stu_info1 = stu_info.join(cat_list)
    stu_info = stu_info1

In [29]:
stu_info.head()

Unnamed: 0,id_student,gender,region,highest_education,imd_band,age_band,num_of_prev_attempts,studied_credits,disability,final_result,...,imd_band_30-40%,imd_band_40-50%,imd_band_50-60%,imd_band_60-70%,imd_band_70-80%,imd_band_80-90%,imd_band_90-100%,age_band_0-35,age_band_35-55,age_band_55<=
0,11391,1,East Anglian Region,HE Qualification,90-100%,55<=,0,240,0,1,...,0,0,0,0,0,0,1,0,0,1
1,28400,0,Scotland,HE Qualification,20-30%,35-55,0,60,0,1,...,0,0,0,0,0,0,0,0,1,0
2,30268,0,North Western Region,A Level or Equivalent,30-40%,35-55,0,60,1,0,...,1,0,0,0,0,0,0,0,1,0
3,31604,0,South East Region,A Level or Equivalent,50-60%,35-55,0,60,0,1,...,0,0,1,0,0,0,0,0,1,0
4,32885,0,West Midlands Region,Lower Than A Level,50-60%,0-35,0,60,0,1,...,0,0,1,0,0,0,0,1,0,0


In [31]:
info_vars = stu_info.columns.values.tolist()
to_keep = [i for i in info_vars if i not in cat_vars]

In [32]:
stuinfo_final = stu_info[to_keep]
stuinfo_final.columns.values

array(['id_student', 'gender', 'num_of_prev_attempts', 'studied_credits',
       'disability', 'final_result', 'region_East Anglian Region',
       'region_East Midlands Region', 'region_Ireland',
       'region_London Region', 'region_North Region',
       'region_North Western Region', 'region_Scotland',
       'region_South East Region', 'region_South Region',
       'region_South West Region', 'region_Wales',
       'region_West Midlands Region', 'region_Yorkshire Region',
       'highest_education_A Level or Equivalent',
       'highest_education_HE Qualification',
       'highest_education_Lower Than A Level',
       'highest_education_No Formal quals',
       'highest_education_Post Graduate Qualification', 'imd_band_0-10%',
       'imd_band_10-20', 'imd_band_20-30%', 'imd_band_30-40%',
       'imd_band_40-50%', 'imd_band_50-60%', 'imd_band_60-70%',
       'imd_band_70-80%', 'imd_band_80-90%', 'imd_band_90-100%',
       'age_band_0-35', 'age_band_35-55', 'age_band_55<='], dtype=

In [33]:
stuinfo_final.head()

Unnamed: 0,id_student,gender,num_of_prev_attempts,studied_credits,disability,final_result,region_East Anglian Region,region_East Midlands Region,region_Ireland,region_London Region,...,imd_band_30-40%,imd_band_40-50%,imd_band_50-60%,imd_band_60-70%,imd_band_70-80%,imd_band_80-90%,imd_band_90-100%,age_band_0-35,age_band_35-55,age_band_55<=
0,11391,1,0,240,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,28400,0,0,60,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,30268,0,0,60,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,31604,0,0,60,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,32885,0,0,60,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [34]:
stuinfo_final_vars = stuinfo_final.columns.values.tolist()
Y = ['final_result']
X = [i for i in stuinfo_final_vars if i not in Y ]

# Feature Selection

In [37]:
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [51]:
model = LogisticRegression()

rfe = RFE(model, 15)
rfe = rfe.fit(stuinfo_final[X], stuinfo_final[Y] )
print(rfe.support_)
print(rfe.ranking_)

  y = column_or_1d(y, warn=True)


[False False  True False False False False False False False False False
 False False False False False False  True  True False  True  True  True
  True  True  True  True  True  True  True  True  True False False False]
[22  6  1 21  2 13 16  5 12 14 11 20  8 10  9 18 17 15  1  1  7  1  1  1
  1  1  1  1  1  1  1  1  1 19  4  3]


In [52]:
select_cols = [col for i, col in enumerate(X) if rfe.support_[i] == True]

In [53]:
select_cols

['num_of_prev_attempts',
 'highest_education_A Level or Equivalent',
 'highest_education_HE Qualification',
 'highest_education_No Formal quals',
 'highest_education_Post Graduate Qualification',
 'imd_band_0-10%',
 'imd_band_10-20',
 'imd_band_20-30%',
 'imd_band_30-40%',
 'imd_band_40-50%',
 'imd_band_50-60%',
 'imd_band_60-70%',
 'imd_band_70-80%',
 'imd_band_80-90%',
 'imd_band_90-100%']

# Log Reg

In [54]:
Y = stuinfo_final['final_result']

In [55]:
X = stuinfo_final.drop(['final_result'], axis=1)

In [60]:
import statsmodels.api as sm
logit_model=sm.Logit(Y,X)
result=logit_model.fit()
print(result.summary())

         Current function value: 0.654991
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:           final_result   No. Observations:                32593
Model:                          Logit   Df Residuals:                    32559
Method:                           MLE   Df Model:                           33
Date:                Tue, 05 Nov 2019   Pseudo R-squ.:                 0.05291
Time:                        18:22:11   Log-Likelihood:                -21348.
converged:                      False   LL-Null:                       -22541.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                    coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------------
id_student                                     2.738e-08   2.15e-08      1

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


In [61]:
from sklearn import linear_model
clf = linear_model.LogisticRegression()
clf.fit(X, Y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
clf.score(X,Y)

0.5427238977694597