In [1]:
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import numpy as np

In [2]:
X_train = pd.read_csv("sets/X_train.csv")
X_test = pd.read_csv("sets/X_test.csv")
y_train = pd.read_csv("sets/y_train.csv")
y_test = pd.read_csv("sets/y_test.csv")
y = pd.read_csv("sets/y.csv")

### Decision trees

Decision trees are a simple ML model that we can use for the regression task of predicting the score of some student in an assessment.
This model should be able to deal with the non-linear relationship between the features and the target, thus achieving better results compared to linear regression. However, it might be prone to overfitting.

In [3]:
#using the scikit-learn implementation of the model
from sklearn.tree import DecisionTreeRegressor

min_samples_leaf = 15 #minimum number of samples required in a leaf -> may smooth the model
min_samples_split = 10 #minimum number of samples to split an internal node

decTree = DecisionTreeRegressor(min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split)

decTree.fit(X_train, y_train)

We evaluate the model by considering the RMSE error and the R2 score.

In [4]:
y_pred = decTree.predict(X_test)
y_pred_train = decTree.predict(X_train)
yMean = y.mean()[0]
yStd = y.std()[0]

#de-normalize preds and test target values
y_pred = y_pred*yStd + yMean
y_pred_train = y_pred_train*yStd + yMean
y_test_deNorm = y_test*yStd + yMean
y_train_deNorm = y_train*yStd + yMean

RMSE_trees = metrics.mean_squared_error(y_test_deNorm, y_pred, squared=False)
R2_trees = metrics.r2_score(y_test_deNorm, y_pred)

RMSE_trees_train = metrics.mean_squared_error(y_train_deNorm, y_pred_train, squared=False)
R2_trees_train = metrics.r2_score(y_train_deNorm, y_pred_train)

In [None]:
print("RMSE: "+str(RMSE_trees))
print("R2: "+str(R2_trees))

print("RMSE for training: "+str(RMSE_trees_train))
print("R2 for training: "+str(R2_trees_train))

print(y_pred.type)
print(y_test_deNorm.type)
print(y_pred.shape)
print(y_test_deNorm.shape)
print(y_test_deNorm.values.shape)

diff=y_pred - y_test_deNorm.values
print(diff.shape)
for j in range(20):
    condition = (diff > 10*j) & (diff < 10*j+1)
    print(condition.shape)
    #print(j,np.mean(diff[condition]))
    print(j)

RMSE: 16.668364623306456
R2: 0.20563719880837017
RMSE for training: 13.264815448992728
R2 for training: 0.5014456929749644
(32678,)
(32678, 1)
(32678, 1)
(32678, 32678)
(32678, 32678)
0


: 