In [68]:
# import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Load the two CSVs hosted in the project repository:
#   edu      - the full xAPI-Edu-Data file (the training frame is derived from it below)
#   edu_test - the separate test file passed to model1.predict later on
edu = pd.read_csv(
    'https://raw.githubusercontent.com/wchen952/STAT-303-2-Project/main/data/xAPI-Edu-Data.csv'
)
edu_test = pd.read_csv(
    "https://raw.githubusercontent.com/wchen952/STAT-303-2-Project/main/data/edu_test.csv"
)

In [69]:
# Work on an independent (deep) copy so the raw `edu` frame is never modified
edu_train = edu.copy(deep=True)

In [70]:
# clean data
#
# The training frame is rebuilt from the raw `edu` frame in one chain, so this
# cell is idempotent: re-running it always yields the same `edu_train` instead
# of failing on already-dropped columns or a misaligned boolean mask.

# columns not used in the analysis
# (nationality, place of birth, stageID, sectionID, semester, relation,
#  parentansweringsurvey, parentschoolsatisfaction)
unused_columns = [
    'NationalITy', 'PlaceofBirth', 'StageID', 'SectionID', 'Semester',
    'Relation', 'ParentAnsweringSurvey', 'ParentschoolSatisfaction',
]

edu_train = (
    edu
    # get two grades (mask is built from the frame being filtered)
    .loc[lambda d: d['GradeID'].isin(['G-07', 'G-08'])]
    .drop(columns=unused_columns)
    # dropping na values (after the column drop, so NAs in unused columns don't cost rows)
    .dropna()
    # drop English topic because there are only two observations
    .loc[lambda d: d['Topic'] != 'English']
    # drop low
    .loc[lambda d: d['Class'] != 'L']
    # replacing M with 0 and H with 1; plain assignment of a mapped column avoids
    # the chained `inplace=True` replace, which is unreliable on a filtered slice
    .assign(Class=lambda d: d['Class'].map({'M': 0, 'H': 1}))
)

# any label other than M/H would have been mapped to NaN - fail loudly instead
assert edu_train['Class'].isin([0, 1]).all(), "unexpected value in Class"

In [71]:
# number of remaining training rows per Topic
edu_train.loc[:, 'Topic'].value_counts()

Biology      26
Geology      24
Arabic       22
Spanish      17
Chemistry    16
IT           13
Science      13
Quran        13
History      10
Math          9
Name: Topic, dtype: int64

In [72]:
# class balance of the recoded response (0 = M, 1 = H)
edu_train.loc[:, 'Class'].value_counts()

0    101
1     62
Name: Class, dtype: int64

In [73]:
# developing model
# Logistic regression of the binary Class (0 = M, 1 = H) on gender, the four
# activity counts and the absence-days category, fitted on the cleaned training frame.
logit_spec = smf.logit(
    formula = 'Class~gender+raisedhands+VisITedResources+AnnouncementsView+Discussion+StudentAbsenceDays',
    data = edu_train,
)
model1 = logit_spec.fit()
model1.summary()

Optimization terminated successfully.
         Current function value: 0.442542
         Iterations 7


0,1,2,3
Dep. Variable:,Class,No. Observations:,163.0
Model:,Logit,Df Residuals:,156.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 28 Feb 2023",Pseudo R-squ.:,0.3338
Time:,13:37:21,Log-Likelihood:,-72.134
converged:,True,LL-Null:,-108.27
Covariance Type:,nonrobust,LLR p-value:,1.395e-13

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-8.8299,1.708,-5.171,0.000,-12.177,-5.483
gender[T.M],-1.3045,0.476,-2.740,0.006,-2.238,-0.371
StudentAbsenceDays[T.Under-7],2.2674,0.796,2.849,0.004,0.707,3.827
raisedhands,0.0250,0.012,2.158,0.031,0.002,0.048
VisITedResources,0.0550,0.017,3.148,0.002,0.021,0.089
AnnouncementsView,0.0014,0.010,0.134,0.893,-0.019,0.021
Discussion,0.0204,0.008,2.439,0.015,0.004,0.037


In [74]:
# predict
# Apply the fitted logit model to the test file. Despite the name, `pred_class`
# holds the model's predicted values per test row (fractions between 0 and 1),
# not hard 0/1 labels.
pred_class = model1.predict(exog=edu_test)
pred_class

0     0.096737
1     0.155446
2     0.906027
3     0.125585
4     0.358812
5     0.711294
6     0.007423
7     0.004618
8     0.000098
9     0.000461
10    0.729591
11    0.725637
12    0.004811
13    0.003411
14    0.865975
15    0.606794
16    0.012744
17    0.037264
18    0.028499
19    0.039297
20    0.217790
21    0.131981
22    0.937979
23    0.925367
24    0.001899
25    0.002195
26    0.729586
27    0.841498
28    0.322416
29    0.413903
dtype: float64

In [76]:
# Pairwise Pearson correlations between the numeric columns only.
# edu_train still holds string columns (gender, Topic, GradeID, StudentAbsenceDays);
# selecting numeric dtypes explicitly keeps this working on pandas >= 2.0, where
# DataFrame.corr() no longer silently drops non-numeric columns.
edu_train.select_dtypes(include='number').corr()

Unnamed: 0,raisedhands,VisITedResources,AnnouncementsView,Discussion,Class
raisedhands,1.0,0.526306,0.53415,0.261052,0.379809
VisITedResources,0.526306,1.0,0.458145,0.089096,0.400705
AnnouncementsView,0.53415,0.458145,1.0,0.262746,0.219604
Discussion,0.261052,0.089096,0.262746,1.0,0.269701
Class,0.379809,0.400705,0.219604,0.269701,1.0


In [77]:
# Variance inflation factors for the continuous predictors.
# Computed on edu_train (the rows the model was actually fit on, and the same
# frame used for the correlation matrix) rather than on the raw `edu` frame.
continuous_vars = add_constant(
    edu_train[['raisedhands','VisITedResources','AnnouncementsView','Discussion']]
)

# add_constant prepends a 'const' column so each auxiliary regression has an
# intercept; the VIF reported for 'const' itself is not a multicollinearity measure.
vif_data = pd.DataFrame({
    "feature": continuous_vars.columns,
    "VIF": [
        variance_inflation_factor(continuous_vars.values, i)
        for i in range(continuous_vars.shape[1])
    ],
})

vif_data

NameError: name 'add_constant' is not defined