In [None]:
# building the MODEL based on LOGISTIC REGRESSION

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

%matplotlib inline

In [7]:
train_df = pd.read_csv('student-mat-train.csv')
train_df.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3,pass
0,GP,F,18,U,GT3,T,1,1,other,other,...,no,5,4,4,1,1,4,4,10,1
1,GP,F,17,U,LE3,T,3,3,other,other,...,yes,5,3,3,2,3,1,56,8,0
2,GP,M,18,U,GT3,T,4,4,other,other,...,no,4,3,3,2,2,3,3,11,1
3,GP,M,16,U,GT3,T,4,4,services,services,...,no,5,3,2,1,2,5,0,12,1
4,GP,F,18,U,LE3,T,3,3,services,services,...,no,5,3,3,1,1,1,7,17,1


In [8]:
# CONVERTING CATEGORICAL DATA to NUMERICAL VARIABLES : ONE HOT ENCODING
# using the function :  pd.get_dummies()

In [9]:
internet_dummies = pd.get_dummies(train_df[['internet']], drop_first=True)
internet_dummies.head()

Unnamed: 0,internet_yes
0,1
1,1
2,1
3,1
4,1


In [10]:
internet_dummies = pd.get_dummies(train_df[['Mjob']], drop_first=True)
internet_dummies.head()

Unnamed: 0,Mjob_health,Mjob_other,Mjob_services,Mjob_teacher
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0


In [11]:
internet_dummies = pd.get_dummies(train_df[['Fjob']], drop_first=True)
internet_dummies.head()

Unnamed: 0,Fjob_health,Fjob_other,Fjob_services,Fjob_teacher
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,0,1,0
4,0,0,1,0


In [13]:
internet_dummies = pd.get_dummies(train_df[['romantic']], drop_first=True)
internet_dummies.head()

Unnamed: 0,romantic_yes
0,0
1,1
2,0
3,0
4,0


In [14]:
# APPLYING ONE-HOT ENCODING to the ENTIRE TRAINING DATAFRAME

In [15]:
train_dummies_df = pd.get_dummies(train_df, drop_first=True)
train_dummies_df.head()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,18,1,1,2,2,0,5,4,4,1,...,1,0,1,0,0,1,1,1,1,0
1,17,3,3,1,2,0,5,3,3,2,...,1,0,0,1,0,1,1,1,1,1
2,18,4,4,1,3,0,4,3,3,2,...,1,0,0,0,0,1,1,1,1,0
3,16,4,4,1,1,0,5,3,2,1,...,1,0,0,0,0,1,1,1,1,0
4,18,3,3,1,4,0,5,3,3,1,...,1,0,0,1,0,0,1,1,1,0


In [18]:
train_dummies_df.columns.sort_values()

Index(['Dalc', 'Fedu', 'Fjob_health', 'Fjob_other', 'Fjob_services',
       'Fjob_teacher', 'G3', 'Medu', 'Mjob_health', 'Mjob_other',
       'Mjob_services', 'Mjob_teacher', 'Pstatus_T', 'Walc', 'absences',
       'activities_yes', 'address_U', 'age', 'failures', 'famrel',
       'famsize_LE3', 'famsup_yes', 'freetime', 'goout', 'guardian_mother',
       'guardian_other', 'health', 'higher_yes', 'internet_yes', 'nursery_yes',
       'paid_yes', 'pass', 'reason_home', 'reason_other', 'reason_reputation',
       'romantic_yes', 'school_MS', 'schoolsup_yes', 'sex_M', 'studytime',
       'traveltime'],
      dtype='object')

In [None]:
# APPLYING ONE-HOT ENCODING to the ENTIRE TESTING DATAFRAME

In [19]:
test_df = pd.read_csv('student-mat-test.csv')
test_df.head(2)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3,pass
0,GP,F,18,U,GT3,T,2,2,at_home,services,...,yes,4,3,3,1,1,3,0,0,0
1,GP,F,15,U,GT3,T,4,4,services,teacher,...,no,4,4,4,1,1,3,2,7,0


In [21]:
test_dummies_df = pd.get_dummies(test_df, drop_first=True)
test_dummies_df.head(2)

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
0,18,2,2,1,3,0,4,3,3,1,...,1,0,0,1,1,1,1,1,1,1
1,15,4,4,1,2,1,4,4,4,1,...,0,0,1,1,0,1,0,1,1,0


In [23]:
print(test_dummies_df.shape)
print(train_dummies_df.shape)
# print(train_dummies_df.columns)
# print(test_dummies_df.columns)

(79, 41)
(316, 41)


In [None]:
# making the PREDICTORS and the PREDICTAND (OUTPUT) variable

In [31]:
# separate the TRAINING DATA into predictors and predictand
xCols = [col for col in train_dummies_df.columns if col not in ['pass', 'G3']]
X_train = train_dummies_df[xCols]
y_train = train_dummies_df['pass']
X_train.head()
y_train.head()

0    1
1    0
2    1
3    1
4    1
Name: pass, dtype: int64

In [32]:
# separate the TESTING DATA into predictors and predictand
X_test = test_dummies_df[xCols]
y_test = test_dummies_df['pass']
X_test.head()
y_test.head()

0    0
1    0
2    0
3    0
4    1
Name: pass, dtype: int64

In [33]:
# TRAINING the LOGISTIC REGRESSION MODEL

In [34]:
## making a logistic regression OBJECT 
logReg = LogisticRegression(C=1e15)
logReg

## to train it, and set the values of the coefficients
logReg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1000000000000000.0)

In [None]:
# INSPECTING the TRAINING MODEL

In [36]:
print(logReg.intercept_)

[1.35748392]


In [37]:
print(logReg.coef_)

[[-0.05668254 -0.01038776  0.24712084  0.22015577  0.3401917  -0.76064415
   0.22256144  0.15532393 -0.74270861 -0.19089258  0.2597469  -0.23700572
  -0.022314   -0.39811162  0.64490976  0.68660388  0.2885885  -0.75427168
   0.06019398 -0.77358083  0.21905489 -0.88976476 -0.46021593  0.48719847
   0.17395456  1.07058764  0.2429677   1.36119211  0.61043188 -0.80403997
  -1.03205844 -0.75846073 -0.71272962 -0.00448134 -0.07175915 -0.8460349
   1.62265889  0.79468463 -0.22789449]]


In [38]:
print(X_train.columns)   

Index(['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel',
       'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'school_MS',
       'sex_M', 'address_U', 'famsize_LE3', 'Pstatus_T', 'Mjob_health',
       'Mjob_other', 'Mjob_services', 'Mjob_teacher', 'Fjob_health',
       'Fjob_other', 'Fjob_services', 'Fjob_teacher', 'reason_home',
       'reason_other', 'reason_reputation', 'guardian_mother',
       'guardian_other', 'schoolsup_yes', 'famsup_yes', 'paid_yes',
       'activities_yes', 'nursery_yes', 'higher_yes', 'internet_yes',
       'romantic_yes'],
      dtype='object')


In [None]:
# PRINTING the EQUATION

In [39]:
equation = '{:.2f}'.format(logReg.intercept_[0])
for (coef, feature) in zip(logReg.coef_[0], X_train.columns):
    equation += ' + {:.2f}({})'.format(coef, feature)
print(equation)

1.36 + -0.06(age) + -0.01(Medu) + 0.25(Fedu) + 0.22(traveltime) + 0.34(studytime) + -0.76(failures) + 0.22(famrel) + 0.16(freetime) + -0.74(goout) + -0.19(Dalc) + 0.26(Walc) + -0.24(health) + -0.02(absences) + -0.40(school_MS) + 0.64(sex_M) + 0.69(address_U) + 0.29(famsize_LE3) + -0.75(Pstatus_T) + 0.06(Mjob_health) + -0.77(Mjob_other) + 0.22(Mjob_services) + -0.89(Mjob_teacher) + -0.46(Fjob_health) + 0.49(Fjob_other) + 0.17(Fjob_services) + 1.07(Fjob_teacher) + 0.24(reason_home) + 1.36(reason_other) + 0.61(reason_reputation) + -0.80(guardian_mother) + -1.03(guardian_other) + -0.76(schoolsup_yes) + -0.71(famsup_yes) + -0.00(paid_yes) + -0.07(activities_yes) + -0.85(nursery_yes) + 1.62(higher_yes) + 0.79(internet_yes) + -0.23(romantic_yes)


In [40]:
z = logReg.intercept_ + np.dot(logReg.coef_, X_train.iloc[[0], :].values.T)
1 / (1+np.exp(-z))

array([[0.33524354]])

In [42]:
# COMPUTING the PROBABILITY for BOTH CLASSES
# it will give you the probability predicted for both clases (these will sum to 1)
# by using .predict_proba() method.

In [43]:
# predicts the probability of each class (failing, passing)
print(logReg.predict_proba(X_train.iloc[[1], :]))

[[0.36216179 0.63783821]]


In [None]:
# and for the entire DATASET

In [46]:
# for each student, it predict PROBAB_FAILING, PROBAB_PASSING
logReg.predict_proba(X_train)

array([[0.66475646, 0.33524354],
       [0.36216179, 0.63783821],
       [0.11930042, 0.88069958],
       [0.05413912, 0.94586088],
       [0.0713549 , 0.9286451 ],
       [0.38669634, 0.61330366],
       [0.089364  , 0.910636  ],
       [0.05765415, 0.94234585],
       [0.01885908, 0.98114092],
       [0.08335633, 0.91664367],
       [0.29927869, 0.70072131],
       [0.24977243, 0.75022757],
       [0.66224706, 0.33775294],
       [0.04603213, 0.95396787],
       [0.27297553, 0.72702447],
       [0.0084277 , 0.9915723 ],
       [0.13931297, 0.86068703],
       [0.36930353, 0.63069647],
       [0.21861446, 0.78138554],
       [0.0316435 , 0.9683565 ],
       [0.11252969, 0.88747031],
       [0.06591822, 0.93408178],
       [0.61228296, 0.38771704],
       [0.02820771, 0.97179229],
       [0.37798238, 0.62201762],
       [0.12968819, 0.87031181],
       [0.24015201, 0.75984799],
       [0.36132995, 0.63867005],
       [0.15379611, 0.84620389],
       [0.05911207, 0.94088793],
       [0.

In [48]:
# COMPUTING the ACCURACY of the MODEL on TRAINING DATA 

In [51]:
pred_train = logReg.predict(X_train)
pred_train
train_data_accuracy  = accuracy_score(pred_train, y_train)
train_data_accuracy

0.7626582278481012

In [47]:
# COMPUTING the ACCURACY of the MODEL on TESTING DATA 

In [52]:
pred_test = logReg.predict(X_test)
pred_test
test_data_accuracy  = accuracy_score(pred_test, y_test)
test_data_accuracy

0.6455696202531646

In [None]:
# Printing the ACCURACY

In [54]:
## printing 
print("Accuracy on the training data {:.2f}%".format(train_data_accuracy * 100))
print("Accuracy on the test data {:.2f}%".format(test_data_accuracy * 100))

Accuracy on the training data 76.27%
Accuracy on the test data 64.56%


In [None]:
# It looks that the MODEL OVERFITS the DATA