In [1]:
# Predict student performance in secondary education

## Introduction to the data set
The data we use in this project comes from two datasets on Portuguese students and their performance in math (395 observations) and Portuguese (649 observations) courses. 382 students belong to both datasets and while we mainly work with the datasets separately, some of our analysis involves the joint dataset. There are 33 predictors in both datasets involving information such as school, sex, age, information about the students’ study and lifestyle habits, family details, and three grades. We acknowledge that the earlier grades (G1, G2) are helpful in predicting the final grade (G3) and we explore models with and without G1 and G2. The full list and description of predictors can be found at

In [40]:
import sys

import scipy
import numpy as np
import matplotlib
import pandas as pd
import sklearn

In [41]:
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC   

In [42]:
# https://archive.ics.uci.edu/ml/datasets/Student+Performance
# https://hugovalent.com/mlearn.html

Attribute Information:

# Attributes for both student-mat.csv (Math course) and student-por.csv (Portuguese language course) datasets:
1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira)
2 sex - student's sex (binary: 'F' - female or 'M' - male)
3 age - student's age (numeric: from 15 to 22)
4 address - student's home address type (binary: 'U' - urban or 'R' - rural)
5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3)
6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart)
7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 â€“ 5th to 9th grade, 3 â€“ secondary education or 4 â€“ higher education)
8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 â€“ 5th to 9th grade, 3 â€“ secondary education or 4 â€“ higher education)
9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other')
11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other')
12 guardian - student's guardian (nominal: 'mother', 'father' or 'other')
13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour)
14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours)
15 failures - number of past class failures (numeric: n if 1<=n<3, else 4)
16 schoolsup - extra educational support (binary: yes or no)
17 famsup - family educational support (binary: yes or no)
18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no)
19 activities - extra-curricular activities (binary: yes or no)
20 nursery - attended nursery school (binary: yes or no)
21 higher - wants to take higher education (binary: yes or no)
22 internet - Internet access at home (binary: yes or no)
23 romantic - with a romantic relationship (binary: yes or no)
24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent)
25 freetime - free time after school (numeric: from 1 - very low to 5 - very high)
26 goout - going out with friends (numeric: from 1 - very low to 5 - very high)
27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high)
28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high)
29 health - current health status (numeric: from 1 - very bad to 5 - very good)
30 absences - number of school absences (numeric: from 0 to 93)

# these grades are related with the course subject, Math or Portuguese:
31 G1 - first period grade (numeric: from 0 to 20)
31 G2 - second period grade (numeric: from 0 to 20)
32 G3 - final grade (numeric: from 0 to 20, output target)



In [43]:
# Load Dataset
fileURL = "/home/vahid/Documents/Personal/Machine Learning/Project/student/student-por.csv"
#fileURL = "/home/vahid/Documents/Personal/Machine Learning/Project/student/student-mat.csv"
# names=['age', 'traveltime','studytime','G3']
dataSet=pd.read_csv(fileURL,sep=';',skiprows=1) # names=names,
dataSet.head(10)

# names=['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
# dataset = pandas.read_csv(url, names=names)



Unnamed: 0,GP,F,18,U,GT3,A,4,4.1,at_home,teacher,...,4.2,3,4.3,1,1.1,3.1,4.4,0.1,11,11.1
0,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
1,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
2,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
3,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
4,GP,M,16,U,LE3,T,4,3,services,other,...,5,4,2,1,2,5,6,12,12,13
5,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,4,1,1,3,0,13,12,13
6,GP,F,17,U,GT3,A,4,4,other,teacher,...,4,1,4,1,1,1,2,10,13,13
7,GP,M,15,U,LE3,A,3,2,services,other,...,4,2,2,1,1,1,0,15,16,17
8,GP,M,15,U,GT3,T,3,4,other,other,...,5,5,1,1,1,5,0,12,12,13
9,GP,F,15,U,GT3,T,4,4,teacher,health,...,3,3,3,1,2,2,2,14,14,14


In [52]:
# Dummy Variables
df=dataSet
dummies=pd.get_dummies(df.at_home)
df_merged=pd.concat([df,dummies],axis='columns')
df_final=df_merged.drop(['at_home','other'],axis='columns')
df_final

Unnamed: 0,GP,F,18,U,GT3,A,4,4.1,teacher,course,...,1,1.1,3.1,4.4,0.1,11,11.1,health,services,teacher.1
0,GP,F,17,U,GT3,T,1,1,other,course,...,1,1,3,2,9,11,11,0,0,0
1,GP,F,15,U,LE3,T,1,1,other,other,...,2,3,3,6,12,13,12,0,0,0
2,GP,F,15,U,GT3,T,4,2,services,home,...,1,1,5,0,14,14,14,1,0,0
3,GP,F,16,U,GT3,T,3,3,other,home,...,1,2,5,0,11,13,13,0,0,0
4,GP,M,16,U,LE3,T,4,3,other,reputation,...,1,2,5,6,12,12,13,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,MS,F,19,R,GT3,T,2,3,other,course,...,1,2,5,4,10,11,10,0,1,0
644,MS,F,18,U,LE3,T,3,1,services,course,...,1,1,1,4,15,15,16,0,0,1
645,MS,F,18,U,GT3,T,1,1,other,course,...,1,1,5,6,11,12,9,0,0,0
646,MS,M,17,U,LE3,T,3,1,services,course,...,3,4,2,6,10,10,10,0,1,0


In [None]:
# Dummy Variables - testing


In [38]:
# Encode categorical OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

ath=dataSet["at_home"]
# print(ath)
# ohe=OneHotEncoder()
# a=ohe.fit_transform([ath])
# print(a[0])
# print(ath.unique())
print(ath.value_counts().sort_values(ascending=False).head(20))


ath_frequency_map=ath.value_counts().to_dict()
print(ath_frequency_map)
ath2=ath.map(ath_frequency_map)
ath2.head()
# for label in 10:
#     df[at_home+'_'+label] = np.where(df[variable]==label, 1, 0)



other       258
services    136
at_home     134
teacher      72
health       48
Name: at_home, dtype: int64
{'other': 258, 'services': 136, 'at_home': 134, 'teacher': 72, 'health': 48}


0    134
1    134
2     48
3    258
4    136
Name: at_home, dtype: int64

In [30]:
print(dataSet.shape)

(649, 4)


In [31]:
print(dataSet.describe())

              age  traveltime   studytime          G3
count  649.000000  649.000000  649.000000  649.000000
mean     3.659476   11.399076   11.570108   11.906009
std      4.640759    2.745265    2.913639    3.230656
min      0.000000    0.000000    0.000000    0.000000
25%      0.000000   10.000000   10.000000   10.000000
50%      2.000000   11.000000   11.000000   12.000000
75%      6.000000   13.000000   13.000000   14.000000
max     32.000000   19.000000   19.000000   19.000000


In [32]:
print(dataSet.groupby('age').size())

age
0     244
1      12
2     110
3       7
4      93
5      12
6      49
7       3
8      42
9       7
10     21
11      5
12     12
13      1
14      8
15      2
16     10
18      3
21      2
22      2
24      1
26      1
30      1
32      1
dtype: int64


In [33]:
# histograms
# dataSet.hist()
# plt.show()

In [34]:
# scater plot matrix
# scatter_matrix(dataSet)
# plt.show()

In [35]:
# Split-out validation dataset
array=dataSet.values
X=array[:,0:3]
y=array[:,3]
validation_size=0.20
seed=7
X_trin,X_val,y_train,y_val=model_selection.train_test_split(X,y,test_size=validation_size,random_state=seed)

In [36]:
# Test option and evaluation metric
scoring = 'accuracy'

In [37]:
print(X_trin.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(519, 3)
(130, 3)
(519,)
(130,)


In [38]:
models=[]
models.append(('LR',LogisticRegression()))
models.append(('KNN',KNeighborsClassifier()))
models.append(('SVM',SVC()))


In [41]:
# evaluate each model
results=[]
names=[]

for name,model in models:
    kfold=model_selection.KFold(n_splits=10)
    cv_results=model_selection.cross_val_score(model,X_trin,y_train,cv=kfold,scoring=scoring)
    names.append(name)
    msg= "%s: %f (%f)" %(name,cv_results.mean(),cv_results.std())
    print(msg)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

LR: 0.306335 (0.060339)
KNN: 0.356637 (0.067921)
SVM: 0.420023 (0.045951)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [43]:
# MAke predictions on validation dataset

for name,model in models:
    model.fit(X_trin,y_train)
    predictions=model.predict(X_val)
    print(name)
    print(accuracy_score(y_val,predictions))
    print(classification_report(y_val,predictions))

LR
0.3
              precision    recall  f1-score   support

           0       0.33      0.50      0.40         2
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         6
           9       0.00      0.00      0.00         7
          10       0.24      0.28      0.26        18
          11       0.31      0.61      0.42        18
          12       0.29      0.12      0.17        17
          13       0.33      0.71      0.45        17
          14       0.33      0.33      0.33        15
          15       0.12      0.11      0.12         9
          16       0.00      0.00      0.00         8
          17       0.40      0.33      0.36         6
          18       0.00      0.00      0.00         3

    accuracy                           0.30       130
   macro avg       0.17      0.21      0.18       130
weighted avg       0.23      0.30      0.24       130

KNN
0.4
          

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
