In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import PolynomialFeatures    
from sklearn.metrics import mean_squared_error    
from sklearn.tree import DecisionTreeRegressor

In [29]:
# read in data from the file "StudentsPerformance" and store it in the DataFrame "df"
df = pd.read_csv('StudentsPerformance.csv')

# print the first 5 recordings in the DataFrame `df`
df.head(5)  

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Preparing data

In [30]:
# Renaming columns
df.rename(columns = {'math score': 'math_score', 'writing score':'writing_score', 'reading score':'reading_score', 'parental level of education':'parent_edu', 'race/ethnicity': 'race', 'test preparation course':'test_prep'}, inplace = True)

df['gender'] = df['gender'].map({'female': 0, 'male': 1})
df['test_prep'] = df['test_prep'].map({'none': 0, 'completed': 1})
race_map = {'group A': 1, 'group B': 2, 'group C': 3, 'group D': 4, 'group E': 5}
df['race'] = df['race'].map(race_map)

# Aggregate into Total_mark
df['total_score'] = df['reading_score'] + df['writing_score'] + df['math_score']

# Creating economic_status
education_map = {'some high school': 1, 'high school': 2, 'some college': 3, 'associate\'s degree': 4, 'bachelor\'s degree': 5, 'master\'s degree': 6}
lunch_map = {'standard': 1, 'free/reduced': 0}

df['parent_edu'] = df['parent_edu'].map(education_map)
df['lunch'] = df['lunch'].map(lunch_map)
df['economic_status'] = df['parent_edu'] + df['lunch']

In [31]:
df = df.drop(['parent_edu','lunch','math_score','reading_score', 'writing_score'],axis = 1)
df.head(5)

Unnamed: 0,gender,race,test_prep,total_score,economic_status
0,0,2,0,218,6
1,0,3,1,247,4
2,0,2,0,278,7
3,1,1,0,148,4
4,1,3,0,229,4


In [32]:
X=df.drop(['total_score'],axis=1).to_numpy()
y=df['total_score'].to_numpy()

Train, validation, test datasets

In [33]:
X_train, X_val_test, y_train, y_val_test = train_test_split(X,y, test_size=0.4,random_state=10)
X_val, X_test, y_val, y_test = train_test_split(X_val_test,y_val_test, test_size=0.5,random_state=10)

Polynomial Regression

In [34]:
degrees = range(2,11)
# variables to store the resulting training and validation errors for each polynomial degree
poly_tr_errors = []          
poly_val_errors = []
poly_test_errors = []
for degree in degrees:    # use for-loop to fit polynomial regression models with different degrees
    lin_regr = LinearRegression(fit_intercept=False)
    poly = PolynomialFeatures(degree=degree)    # generate polynomial features
    X_train_poly = poly.fit_transform(X_train)    # fit the raw features
    lin_regr.fit(X_train_poly, y_train)    # apply linear regression to these new features and labels
  
    y_pred_train = lin_regr.predict(X_train_poly)    # predict using the linear model
    tr_error = mean_squared_error(y_train, y_pred_train)    # calculate the training error
    X_val_poly = poly.transform(X_val) # transform the raw features for the validation data 
    y_pred_val = lin_regr.predict(X_val_poly) # predict values for the validation data using the linear model 
    val_error = mean_squared_error(y_val, y_pred_val) # calculate the validation error
    
    poly_tr_errors.append(tr_error)
    poly_val_errors.append(val_error)

In [35]:
# create a table to compare training and validation errors
errors_poly = {"poly degree": degrees,
          "poly_train_errors":np.round(poly_tr_errors,3),
          "poly_val_errors":np.round(poly_val_errors,3),
         }
pd.DataFrame(errors_poly).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
poly degree,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0
poly_train_errors,1504.74,1454.89,1426.583,1385.146,1334.375,1306.816,1280.24,1326.566,1279.541
poly_val_errors,1466.831,1503.228,1496.975,1540.881,1927.048,2774.155,27339.384,1.058831e+16,234561.728


Decision Tree Regressor

In [36]:
max_depths = range(1,10)

#variables to store errors for corresponding maximum depths
dect_tr_errors = []
dect_val_errors = []
dect_test_errors = []

for depth in max_depths:
    dect_regr = DecisionTreeRegressor(max_depth=depth)
    dect_regr.fit(X_train, y_train)
    
    y_pred_train = dect_regr.predict(X_train)
    tr_error = mean_squared_error(y_train, y_pred_train)
    y_pred_val = dect_regr.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred_val)
    
    dect_tr_errors.append(tr_error)
    dect_val_errors.append(val_error)

In [37]:
errors_dect = {"max_depth":max_depths,
          "dect_train_errors":np.round(dect_tr_errors,3),
          "dect_val_errors":np.round(dect_val_errors,3),
         }
pd.DataFrame(errors_dect).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
max_depth,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
dect_train_errors,1806.062,1664.912,1564.861,1465.827,1378.697,1336.524,1314.82,1293.387,1282.327
dect_val_errors,1589.675,1576.299,1572.02,1550.094,1572.652,1647.296,1662.78,1745.001,1747.822


In [38]:
lin_regr = LinearRegression(fit_intercept=False)
poly = PolynomialFeatures(degree=2)    
X_train_poly = poly.fit_transform(X_train)    
lin_regr.fit(X_train_poly, y_train)   
y_pred_train_poly = lin_regr.predict(X_train_poly)    
tr_error_poly = mean_squared_error(y_train, y_pred_train)

X_test_poly = poly.transform(X_test)
y_pred_test_poly = lin_regr.predict(X_test_poly)    
test_error_poly = mean_squared_error(y_test, y_pred_test_poly)

In [42]:
dect_regr = DecisionTreeRegressor(max_depth=3)
dect_regr.fit(X_train, y_train)
y_pred_train_dect = dect_regr.predict(X_train)
tr_error_dect = mean_squared_error(y_train, y_pred_train_dect)
 
y_pred_test_dect = dect_regr.predict(X_test)
test_error_dect = mean_squared_error(y_test, y_pred_test_dect)

In [43]:
print("Test errors for Polynomial regression: ",test_error_poly)
print("Test errors for Decision Tree Regression: ",test_error_dect)

Test errors for Polynomial regression:  1508.4498500990367
Test errors for Decision Tree Regression:  1523.6860775151417
