In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression    # linear regression with ordinary squared error loss
from sklearn.metrics import mean_squared_error    # function to calculate mean squared error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures    # transformer that generates polynomial and interaction features
from sklearn.tree import DecisionTreeRegressor    # regression tree with configurable maximum depth

In [None]:
# Load the records from "StudentsPerformance.csv" into the DataFrame `df`
df = pd.read_csv("StudentsPerformance.csv")

# Preview the first five rows (head() defaults to n=5)
df.head()

Preparing data

In [None]:
# Re-read the raw file so this cell always starts from untouched data and can be re-run safely:
# applying the .map() encodings below a second time to already-encoded columns would yield NaN.
df = pd.read_csv('StudentsPerformance.csv')

# 1. Rename columns to short, identifier-friendly names (new frame instead of in-place mutation)
df = df.rename(columns={
    'math score': 'math_score',
    'writing score': 'writing_score',
    'reading score': 'reading_score',
    'parental level of education': 'parent_edu',
    'race/ethnicity': 'race',
    'test preparation course': 'test_prep',
})

# 2. Encode the categorical columns as numbers
race_map = {'group A': 1, 'group B': 2, 'group C': 3, 'group D': 4, 'group E': 5}
education_map = {'some high school': 1, 'high school': 2, 'some college': 3, 'associate\'s degree': 4, 'bachelor\'s degree': 5, 'master\'s degree': 6}
lunch_map = {'standard': 1, 'free/reduced': 0}
encodings = {
    'gender': {'female': 0, 'male': 1},
    'test_prep': {'none': 0, 'completed': 1},
    'race': race_map,
    'parent_edu': education_map,
    'lunch': lunch_map,
}
for column, mapping in encodings.items():
    df[column] = df[column].map(mapping)

# Series.map() yields NaN for any label that is absent from its mapping (or already missing).
# Fail here with a clear message instead of letting NaN flow into the features and the target.
unmapped_counts = df[list(encodings)].isna().sum()
if unmapped_counts.any():
    raise ValueError(
        "Unmapped or missing category values after encoding:\n"
        f"{unmapped_counts[unmapped_counts > 0]}"
    )

# 3. Aggregate the three subject scores into total_score
df['total_score'] = df['reading_score'] + df['writing_score'] + df['math_score']

# 4. economic_status = encoded parental education + encoded lunch type
df['economic_status'] = df['parent_edu'] + df['lunch']

# Preview only the first rows rather than rendering the whole frame
df.head()

In [None]:
# automatically populated; add a variable name below
# NOTE(review): this cell only binds the current `df` to the throwaway name `_`; no visible
# cell reads `_` afterwards. IPython also uses `_` for the last cell output, so this assignment
# shadows that shortcut — presumably a leftover placeholder; confirm before relying on it.
_ = df

In [None]:
# Remove the three subject scores (they sum to the target total_score) and the two
# columns that were already combined into economic_status.
columns_to_drop = ['parent_edu', 'lunch', 'math_score', 'reading_score', 'writing_score']
df = df.drop(columns=columns_to_drop)

In [None]:
# Preview the first rows of the reduced frame instead of rendering the entire DataFrame
df.head()

In [None]:
# Feature matrix: every remaining column except the regression target
X = df.drop(columns='total_score').to_numpy()
# Label vector: the combined score to be predicted
y = df['total_score'].to_numpy()

Train, validation, test datasets

In [None]:
# First split keeps 60% for training and holds out 40%; the second split halves the held-out
# part into validation and test sets (20% / 20% of the full data).
# random_state pins both shuffles so the splits, and every error computed from them, are
# reproducible across kernel restarts.
X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.4, shuffle=True, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, shuffle=True, random_state=42
)

Polynomial Regression

In [None]:
# Polynomial degrees to compare
degrees = range(2,10)

# Training and validation mean squared errors, one entry per degree, in the order of `degrees`
poly_tr_errors = []
poly_val_errors = []

for degree in degrees:
    # Build the polynomial feature expansion on the training split only,
    # then apply the same fitted expansion to the validation split
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)

    # fit_intercept=False because the expanded features already contain a constant term
    lin_regr = LinearRegression(fit_intercept=False)
    lin_regr.fit(X_train_poly, y_train)

    # Predictions of the fitted linear model on both splits
    y_pred_train = lin_regr.predict(X_train_poly)
    y_pred_val = lin_regr.predict(X_val_poly)

    # Mean squared error on each split, recorded for the comparison table
    tr_error = mean_squared_error(y_train, y_pred_train)
    val_error = mean_squared_error(y_val, y_pred_val)
    poly_tr_errors.append(tr_error)
    poly_val_errors.append(val_error)

In [None]:
# Table of training vs. validation error, one row per polynomial degree
errors = {
    "poly degree": degrees,
    "poly_train_errors": poly_tr_errors,
    "poly_val_errors": poly_val_errors,
}
# Each entry is wrapped in a Series before the frame is assembled
pd.DataFrame({name: pd.Series(values) for name, values in errors.items()})

Decision Tree Regressor

In [None]:
# Maximum tree depths to compare
max_depths = range(1,10)

# Training and validation mean squared errors, one entry per depth, in the order of `max_depths`
dect_tr_errors = []
dect_val_errors = []

for depth in max_depths:
    # random_state is fixed because the tree permutes features at each split and breaks ties
    # between equally good splits at random; without it the fitted tree can differ between runs.
    dect_regr = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dect_regr.fit(X_train, y_train)

    # Error on the data the tree was fitted to
    y_pred_train = dect_regr.predict(X_train)
    tr_error = mean_squared_error(y_train, y_pred_train)

    # Error on the held-out validation split
    y_pred_val = dect_regr.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred_val)

    dect_tr_errors.append(tr_error)
    dect_val_errors.append(val_error)

In [None]:
# Table of training vs. validation error, one row per maximum tree depth
errors = {
    "max_depth": max_depths,
    "dec_train_errors": dect_tr_errors,
    "dec_val_errors": dect_val_errors,
}
pd.DataFrame(data=errors)