In [133]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [134]:
###Get data and define independent variables and dependent variable
data = pd.read_excel("data_set.xlsx")
regressor=["Gender","DISRATIO","LNINCOME",'EXPENSE',"Cantonese","BeentoHK","SCHOLAR","below2","Gaokao","SIBLINGS"]
predict="School"

#Only the first sample_size rows of the sheet are analysed
sample_size=234
SH_HK_Split=136
#Select the regressors plus the outcome column, in that order
tot_var=regressor.copy()
tot_var.append(predict)
datac=data[tot_var][0:sample_size]

#Convert to numpy: X holds the regressor columns (same order as `regressor`),
#Y the outcome. `columns=` replaces the positional axis argument
#(`drop([predict],1)`), which pandas 2.0 no longer accepts.
X=np.array(datac.drop(columns=[predict]))
Y=np.array(datac[predict])


In [135]:
###delete rows with NaN values
def Delete (X,Y,NaN):
    """Drop every observation that contains the missing-value marker.

    A row is removed from both arrays when any entry of that row in ``X``
    equals ``NaN`` or when the matching entry of ``Y`` equals ``NaN``.

    Parameters
    ----------
    X : 2-D array, one row per observation.
    Y : 1-D array aligned with the rows of ``X``.
    NaN : value that marks a missing entry. It can be a sentinel code such
        as -3 or a real floating-point NaN (``np.nan``).

    Returns
    -------
    tuple
        ``(X, Y)`` as numpy arrays with the flagged rows removed. The inputs
        themselves are not modified.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    # A real NaN never compares equal to anything (not even itself), so
    # `values == NaN` would silently match nothing; use pd.isna in that case.
    marker_is_nan = isinstance(NaN, float) and NaN != NaN

    def _is_missing(values):
        # Element-wise boolean array: True where `values` holds the marker.
        if marker_is_nan:
            return np.asarray(pd.isna(values))
        return np.asarray(values == NaN)

    # One vectorized mask instead of one np.delete call per dropped row.
    drop = _is_missing(X).any(axis=1) | _is_missing(Y)
    keep = ~drop
    return X[keep], Y[keep]
#Missing answers are coded as -3 in this data set; drop every row that has one
X, Y = Delete(X, Y, NaN=-3)

In [136]:
###Preprocessing
#Compute the variance inflation factor (VIF) of each independent variable to check for multicollinearity
def variance_inflation_factor(exog, exog_idx):
    """Return the variance inflation factor of one column of ``exog``.

    The column at position ``exog_idx`` is regressed by OLS on all the other
    columns of ``exog``; the result is ``1 / (1 - R^2)`` of that auxiliary fit.

    exog     -- 2-D numpy array, one column per variable
    exog_idx -- integer position of the column to evaluate
    """
    n_columns = exog.shape[1]
    # Boolean selector for every column except the one being evaluated
    is_other = np.arange(n_columns) != exog_idx
    target = exog[:, exog_idx]
    others = exog[:, is_other]
    aux_fit = OLS(target, others).fit()
    return 1. / (1. - aux_fit.rsquared)

#Design matrix with a constant column added by statsmodels
XC = sm.add_constant(X)
#Label for the constant column. Guarded so that re-running this cell does not
#insert a second 'Constant' and break every later `index=regressor`.
if 'Constant' not in regressor:
    regressor.insert(0,'Constant')
#One VIF per column of XC, labelled with the regressor names
print(pd.Series([variance_inflation_factor(XC, i)
                 for i in range(XC.shape[1])],
                index=regressor))


###Get Skewness
def Skew(var):
    """Print the skewness of column ``var`` over the first ``sample_size`` rows of ``data``."""
    # NOTE(review): this reads the raw `data` frame, not the cleaned X/Y, so
    # rows holding the -3 missing code are presumably included - confirm.
    column = data[[var]][0:sample_size]
    skewness = column.skew()
    print('Skeness of {}'.format(skewness))
#Report skewness for raw income, log income and DISRATIO
for column_name in ('INCOME', 'LNINCOME', 'DISRATIO'):
    Skew(column_name)

Constant     36.079277
Gender        1.146794
DISRATIO      1.389619
LNINCOME      1.408037
EXPENSE       1.172702
Cantonese     1.152869
BeentoHK      1.170524
SCHOLAR       1.108762
below2        1.288797
Gaokao        1.213168
SIBLINGS      1.055103
dtype: float64
Skeness of INCOME    5.784146
dtype: float64
Skeness of LNINCOME   -1.887287
dtype: float64
Skeness of DISRATIO   -5.006302
dtype: float64


In [137]:
###Logistic regression
def LogReg(X,Y):
    """Fit a logit model and print its summary plus an odds-ratio table.

    Parameters
    ----------
    X : design matrix passed to ``sm.Logit`` as the explanatory variables.
        The table rows are labelled with the module-level ``regressor`` list,
        so ``X`` must have exactly ``len(regressor)`` columns.
    Y : outcome passed to ``sm.Logit`` as the dependent variable.

    Returns
    -------
    The fitted statsmodels results object.
    """
    # Fit once. Fitting the same model a second time only repeated the
    # optimizer output and doubled the work.
    result=sm.Logit(Y,X).fit()

    ###Convert coefficients to odds ratios
    # np.asarray keeps the code working whether statsmodels returns numpy
    # arrays or pandas objects.
    odds=np.exp(np.asarray(result.params))

    ###Standard error of the odds ratio: sqrt(var(b) * exp(b)^2)
    coefvar=np.diag(np.asarray(result.cov_params()))
    sd=np.sqrt(coefvar*odds*odds)

    ###Confidence interval on the odds-ratio scale
    # conf_int() is called with its default level, i.e. the same 2.5% / 97.5%
    # bounds that the printed summary shows as [0.025, 0.975].
    conf=np.exp(np.asarray(result.conf_int()))
    lower=conf[:,0]
    upper=conf[:,1]

    ###Build the result table as a DataFrame for display
    df = pd.DataFrame(list(zip(odds, sd, lower, upper)),
                      columns =['Odds Ratio','   Standard Error', '[0.025','0.975]'],
                      index=regressor)
    print(result.summary2())
    print(df)
    return result



In [138]:
###Main regression on the full cleaned sample
print('\nMain Reg:\n')
result=LogReg(XC,Y)

###Re-run without the low Gaokao score participants
#Column 8 of X is "Gaokao" (its position in the regressor list before
#'Constant' was prepended); only rows with a value <= 2 are kept.
#NOTE(review): presumably codes above 2 denote low scores - confirm against the codebook.
keep_rows = X[:,8]<=2
XNEW = X[keep_rows]
YNEW = Y[keep_rows]
XCNEW = sm.add_constant(XNEW)

print('\nLow Gaokao Score Partcipants Dropped:\n')
LogReg(XCNEW,YNEW)


Main Reg:

Optimization terminated successfully.
         Current function value: 0.333049
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.333049
         Iterations 8
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.478     
Dependent Variable: y                AIC:              157.2178  
Date:               2020-07-31 14:29 BIC:              193.6631  
No. Observations:   203              Log-Likelihood:   -67.609   
Df Model:           10               LL-Null:          -129.44   
Df Residuals:       192              LLR p-value:      9.1068e-22
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     8.0000                                       
-------------------------------------------------------------------
           Coef.    Std.Err.      z      P>|z|     [0.025    0.975]
-------------------------------------------------------------------
const     -2

<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x2a836ea4f60>