In [132]:
## Import Libraries
import pandas as pd 
import numpy as np
import sklearn 
from sklearn import linear_model 
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [133]:
## reading the data using pandas
df_full = pd.read_csv("student-mat.csv", sep = ";" )
df_full

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,MS,M,20,U,LE3,A,2,2,services,services,...,5,5,4,4,5,4,11,9,9,9
391,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,3,14,16,16
392,MS,M,21,R,GT3,T,1,1,other,other,...,5,5,3,3,3,3,3,10,8,7
393,MS,M,18,R,LE3,T,3,2,services,other,...,4,4,1,3,4,5,0,11,12,10


In [134]:
## Creating a subset of the original data
df_sub = df_full[["age", "freetime","absences", "G1", "G2", "G3" ]]
df_sub

Unnamed: 0,age,freetime,absences,G1,G2,G3
0,18,3,6,5,6,6
1,17,3,4,5,5,6
2,15,3,10,7,8,10
3,15,2,2,15,14,15
4,16,3,4,6,10,10
...,...,...,...,...,...,...
390,20,5,11,9,9,9
391,17,4,3,14,16,16
392,21,5,3,10,8,7
393,18,4,0,11,12,10


In [135]:
## Data statistical summary 
df_sub.describe()

Unnamed: 0,age,freetime,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,3.235443,5.708861,10.908861,10.713924,10.41519
std,1.276043,0.998862,8.003096,3.319195,3.761505,4.581443
min,15.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,8.0,13.0,13.0,14.0
max,22.0,5.0,75.0,19.0,19.0,20.0


In [136]:
## Checking for null values
df_sub.isna().sum()

age         0
freetime    0
absences    0
G1          0
G2          0
G3          0
dtype: int64

In [137]:
## checking the correlation between parameters
df_sub.corr()

Unnamed: 0,age,freetime,absences,G1,G2,G3
age,1.0,0.016434,0.17523,-0.064081,-0.143474,-0.161579
freetime,0.016434,1.0,-0.058078,0.012613,-0.013777,0.011307
absences,0.17523,-0.058078,1.0,-0.031003,-0.031777,0.034247
G1,-0.064081,0.012613,-0.031003,1.0,0.852118,0.801468
G2,-0.143474,-0.013777,-0.031777,0.852118,1.0,0.904868
G3,-0.161579,0.011307,0.034247,0.801468,0.904868,1.0


In [138]:
## Creating the feature column
df_feature = df_sub.drop(columns="G3")
df_feature

Unnamed: 0,age,freetime,absences,G1,G2
0,18,3,6,5,6
1,17,3,4,5,5
2,15,3,10,7,8
3,15,2,2,15,14
4,16,3,4,6,10
...,...,...,...,...,...
390,20,5,11,9,9
391,17,4,3,14,16
392,21,5,3,10,8
393,18,4,0,11,12


In [139]:
## Creating the target column
df_target = df_sub["G3"]
df_target

0       6
1       6
2      10
3      15
4      10
       ..
390     9
391    16
392     7
393    10
394     9
Name: G3, Length: 395, dtype: int64

In [140]:
##reassigning the target and features
x = df_feature
y = df_target

In [141]:
## Splitting data into train and test
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, test_size=0.2)

## Using LinearRegressor

In [142]:
## Linear regression
linear = linear_model.LinearRegression(fit_intercept=True)

In [143]:
## fitting the model
linear.fit(x_train, y_train)

In [144]:
##Checking the accuracu score
acc_score  = linear.score(x_test, y_test)
acc_score

0.830990627771653

In [145]:
##checking coefficients
linear.coef_

array([-0.20722559,  0.16747368,  0.03705786,  0.17015092,  0.9729042 ])

In [146]:
##checking intercepts
linear.intercept_

0.819900149910934

In [147]:
## Predicting student score using test values
predictions = linear.predict(x_test)
df_pred = pd.DataFrame(predictions)

In [148]:
fin = pd.concat([df_pred,y_test],axis = 1)


In [149]:
## converting our data frame to an array to make processing easier 
x_test_ar = np.array(x_test)
y_test_ar = np.array(y_test)
pred_ar = np.array(predictions)

In [166]:
## printing out actual test score, features, and predicted test score
for x in range(10):
    print(pred_ar[x], x_test_ar[x], y_test_ar[x])

6.0232023750873145 [18  3  7  8  7] 8
17.11498732727555 [15  4 12 16 16] 16
3.2704614870806883 [19  3  0  6  5] 0
10.375769729870287 [17  2  0 12 11] 12
11.760397455125348 [18  3  3 14 12] 12
14.006806192688334 [16  4  0 13 14] 13
9.462227041885303 [17  4 28 10  9] 9
6.901447542785587 [21  5  3 10  8] 7
15.266372603090536 [17  2  4 17 15] 16
8.527555988138868 [18  2 22  9  9] 9


## Using Decision Tree Regressor

In [151]:
## Decision Tree
import sklearn
from sklearn.tree import DecisionTreeRegressor

In [152]:
## using the decision tree regressor
tree = DecisionTreeRegressor(random_state=42)

In [153]:
## fitting the model
tree.fit(x_train, y_train)

In [154]:
## Evaluating DecisionTreeRegressor
y_tree_test_pred = tree.predict(x_test)


In [155]:
from sklearn.metrics import r2_score

In [156]:
# y train data into dataframe
y_train = pd.DataFrame(y_train)

In [157]:
## converting into array for easy manipulation
x_test_ar_tree = np.array(x_test)
y_test_ar_tree = np.array(y_test)
pred_ar_tree = np.array(y_tree_test_pred)

In [158]:
#checking and converting y_train to float
y_test_ar_tree[0:10].astype(float)

array([ 8., 16.,  0., 12., 12., 13.,  9.,  7., 16.,  9.])

In [159]:
# converting predicted score to float for easy computation
pred_ar_tree[0:10].astype(float)

array([ 9., 15.,  0., 10., 12., 14., 10.,  8., 15.,  7.])

In [160]:
#tree_rsq = r2_score(x_train, y_tree_train_pred)
## predicted test values against actual test values
for x in range(5):
    print(pred_ar_tree[x],"<<<>>>", y_test_ar_tree[x])

9.0 <<<>>> 8
15.0 <<<>>> 16
0.0 <<<>>> 0
10.0 <<<>>> 12
12.0 <<<>>> 12


In [161]:
pred_ar_tree[0:4]

array([ 9., 15.,  0., 10.])

In [162]:
## Accuracy of Decision Tree Regressor
tree_rsq = r2_score(y_test_ar_tree, pred_ar_tree)
print(f"Accuracy Score using Decision Tree Regressor:{round(tree_rsq*100, 2)}")

Accuracy Score using Decision Tree Regressor:89.63


In [163]:
## Accuracy of Linear regression
LR_rsq = r2_score(y_test_ar, pred_ar)
print(f"Accuracy using Linear Regressor:{round(LR_rsq*100, 2)}")

Accuracy using Linear Regressor:83.1


## Using Random Forest Regressor

In [164]:
## Random Forest 
#Initialize the model with some predetermined Hyperparameters (we will look at optimizing hyper parameters in the next section)
forest = RandomForestRegressor(random_state = 30, n_jobs = -1) # remember why we changed the variables

#Initializing the Random Forest Model
forest.fit(x_train, y_train.values.ravel())

In [165]:
## Accuracy of RandomForestRegressor
RFR_rsq = r2_score(y_test_ar, pred_ar)
print(f"Accuracy of Random Forest Regression:{round(RFR_rsq*100, 2)}")

Accuracy of Random Forest Regression:83.1
