In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)
math_df = pd.read_csv("data/math_data.csv")
por_df = pd.read_csv("data/por_data.csv")

In [3]:
math_df

Unnamed: 0,School,Sex,Age,Address,Family Size,Parental Status,Mother Education,Father Education,Mother Job,Father Job,Reason for School,Guardian,Travel Time,Study Time,# of Failures,Extra Educational Support,Extra Family Support,Extra Paid Classes,Extracurricular,Nursery,Desire Higher Education,Internet Access,Romantic Relationship,Family Relationship Quality,Amount of Free Time,Frequency of Going Out,Weekday Alcohol Consumption,Weekend Alcohol Consumption,Health Status,Abscences,1st Period Grade,2nd Period Grade,Final Grade
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,3,yes,no,yes,no,yes,yes,yes,no,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,yes,yes,yes,yes,yes,yes,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,yes,no,yes,yes,no,no,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,course,other,1,2,2,no,yes,yes,no,yes,yes,no,no,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,course,mother,2,1,0,no,no,no,no,no,yes,yes,no,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,course,other,1,1,3,no,no,no,no,no,yes,no,no,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,course,mother,3,1,0,no,no,no,no,no,yes,yes,no,4,4,1,3,4,5,0,11,12,10


In [4]:
math_data = math_df.iloc[:, :-3] # Only taking features that isn't the final grade.
math_outcome = math_df.iloc[:, -1:] # Only want the outcome of the final grade.

In [5]:
# Preprocessing of math data df to turn binary string values and categorial variables into integers
# Copying df to keep original
no_yes_answer = {'no': 0,'yes': 1}
sex = {'F': 0,'M': 1} 
school_identity = {'GP': 0,'MS': 1} 
urban_rural = {'U': 0,'R': 1} 
family_size = {'LE3': 0,'GT3': 1} 
parent_status = {'A': 0,'T': 1} 
binary_categories = {
                     "Extra Educational Support": no_yes_answer,
                     "Extra Family Support": no_yes_answer,
                     "Extra Paid Classes": no_yes_answer,
                     "Extracurricular": no_yes_answer,
                     "Nursery": no_yes_answer,
                     "Desire Higher Education": no_yes_answer,
                     "Internet Access": no_yes_answer,
                     "Romantic Relationship": no_yes_answer,
                     "Sex": sex,
                     "School": school_identity,
                     "Address": urban_rural,
                     "Family Size": family_size,
                     "Parental Status": parent_status
                    }
encode_math_data = math_data.copy()
encode_math_data = encode_math_data.replace(binary_categories)
categorial_features = encode_math_data.select_dtypes(include=object).columns.tolist()
encode_math_data = pd.get_dummies(encode_math_data, columns=categorial_features)

In [6]:
# Preprocessing of math outcome df to turn final grade values into a binary pass/no pass
# Making a copy of the df since we want to keep the original dataframe
bin_math_outcome = math_outcome.copy()
for i, score in enumerate(bin_math_outcome["Final Grade"]):
    if score > 9:
        bin_math_outcome.at[i] = 1
    else:
        bin_math_outcome.at[i] = 0
        
bin_math_outcome

Unnamed: 0,Final Grade
0,0
1,0
2,1
3,1
4,1
...,...
390,0
391,1
392,0
393,1


In [7]:
from sklearn.model_selection import train_test_split
math_data_train, math_data_test, math_outcome_train, math_outcome_test = train_test_split(encode_math_data, bin_math_outcome, test_size=0.30,shuffle=True)
display(math_outcome_test)
math_data_train, math_data_test, math_outcome_train, math_outcome_test = math_data_train.values, math_data_test.values, math_outcome_train.values.ravel(), math_outcome_test.values.ravel()

Unnamed: 0,Final Grade
101,1
81,1
166,1
42,1
147,1
...,...
348,1
82,0
150,0
369,1


In [8]:
def mean_squared_error(y, y_hat):
    from sklearn.metrics import mean_squared_error
    return mean_squared_error(y, y_hat)

In [9]:
def correctness_percentage(y, y_hat):
    n = len(y)
    accuracyList = []
    for i in range(n):
        if y[i] == y_hat[i]:
            accuracyList.append(1)
        else:
            accuracyList.append(0)
            
    return (sum(accuracyList) / n) * 100

In [10]:
from sklearn import linear_model, tree
from sklearn.ensemble import RandomForestClassifier
dt = tree.DecisionTreeClassifier(max_depth=5)
logreg = linear_model.LogisticRegression(solver="liblinear")
clf = RandomForestClassifier(max_depth=5)

In [11]:
clf.fit(math_data_train, math_outcome_train)
math_yhat = clf.predict(math_data_test)

In [12]:
math_outcome_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 1])

In [13]:
math_yhat

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1])

In [14]:
mean_squared_error(math_outcome_test, math_yhat)

0.21008403361344538

In [16]:
correct_percentage(math_outcome_test, math_yhat)

78.99159663865547

In [17]:
results = []
for i in range(len(math_outcome_test)):
    print((math_outcome_test[i], math_yhat[i]))

(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 0)
(1, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 1)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(0, 1)
(0, 1)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 0)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(0, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 0)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(0, 1)
(1, 1)
(0, 1)
(0, 0)
(1, 1)
(1, 1)
(0, 1)
(1, 0)
(1, 1)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 1)
(1, 1)
(0, 1)
(1, 1)
(1, 1)
(1, 1)
(0, 1)
(0, 0)
(1, 1)
(1, 1)
