In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import PolynomialFeatures    
from sklearn.metrics import mean_squared_error    
from sklearn.tree import DecisionTreeRegressor

In [11]:
# read in data from the file "StudentsPerformance" and store it in the DataFrame "df"
df = pd.read_csv('StudentsPerformance.csv')

# print the first 5 recordings in the DataFrame `df`
df.head(5)  

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Preparing data

In [12]:
# Renaming columns
df.rename(columns = {'math score': 'math_score', 'writing score':'writing_score', 'reading score':'reading_score', 'parental level of education':'parent_edu', 'race/ethnicity': 'race', 'test preparation course':'test_prep'}, inplace = True)

df['gender'] = df['gender'].map({'female': 0, 'male': 1})
df['test_prep'] = df['test_prep'].map({'none': 0, 'completed': 1})
race_map = {'group A': 1, 'group B': 2, 'group C': 3, 'group D': 4, 'group E': 5}
df['race'] = df['race'].map(race_map)

# Aggregate into Total_mark
df['total_score'] = df['reading_score'] + df['writing_score'] + df['math_score']

# Creating economic_status
education_map = {'some high school': 1, 'high school': 2, 'some college': 3, 'associate\'s degree': 4, 'bachelor\'s degree': 5, 'master\'s degree': 6}
lunch_map = {'standard': 1, 'free/reduced': 0}

df['parent_edu'] = df['parent_edu'].map(education_map)
df['lunch'] = df['lunch'].map(lunch_map)
df['economic_status'] = df['parent_edu'] + df['lunch']

In [13]:
df = df.drop(['parent_edu','lunch','math_score','reading_score', 'writing_score'],axis = 1)
df.head(5)

Unnamed: 0,gender,race,test_prep,total_score,economic_status
0,0,2,0,218,6
1,0,3,1,247,4
2,0,2,0,278,7
3,1,1,0,148,4
4,1,3,0,229,4


In [14]:
X=df.drop(['total_score'],axis=1).to_numpy()
y=df['total_score'].to_numpy()

Train, validation, test datasets

In [15]:
X_train, X_val_test, y_train, y_val_test = train_test_split(X,y, test_size=0.4,shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_val_test,y_val_test, test_size=0.5,shuffle=True)

Polynomial Regression

In [16]:
degrees = range(2,11)
# variables to store the resulting training and validation errors for each polynomial degree
poly_tr_errors = []          
poly_val_errors = []
for degree in degrees:    # use for-loop to fit polynomial regression models with different degrees
    lin_regr = LinearRegression(fit_intercept=False)
    poly = PolynomialFeatures(degree=degree)    # generate polynomial features
    X_train_poly = poly.fit_transform(X_train)    # fit the raw features
    lin_regr.fit(X_train_poly, y_train)    # apply linear regression to these new features and labels
  
    y_pred_train = lin_regr.predict(X_train_poly)    # predict using the linear model
    tr_error = mean_squared_error(y_train, y_pred_train)    # calculate the training error
    X_val_poly = poly.transform(X_val) # transform the raw features for the validation data 
    y_pred_val = lin_regr.predict(X_val_poly) # predict values for the validation data using the linear model 
    val_error = mean_squared_error(y_val, y_pred_val) # calculate the validation error
 
    poly_tr_errors.append(tr_error)
    poly_val_errors.append(val_error)

In [17]:
# create a table to compare training and validation errors
errors = {"poly degree":degrees,
          "poly_train_errors":poly_tr_errors,
          "poly_val_errors":poly_val_errors,
         }
pd.DataFrame({ key:pd.Series(value) for key, value in errors.items()})

Unnamed: 0,poly degree,poly_train_errors,poly_val_errors
0,2,1518.271551,1271.065
1,3,1471.245422,1293.442
2,4,1413.716105,1405.073
3,5,1377.163911,1455.912
4,6,1337.258393,1586.804
5,7,1309.781579,1867.246
6,8,1275.471504,36299590000.0
7,9,1278.26552,20732680000000.0
8,10,1267.238149,95250.98


Decision Tree Regressor

In [18]:
max_depths = range(1,10)

#variables to store errors for corresponding maximum depths
dect_tr_errors = []
dect_val_errors = []

for depth in max_depths:
    dect_regr = DecisionTreeRegressor(max_depth=depth)
    dect_regr.fit(X_train, y_train)
    
    y_pred_train = dect_regr.predict(X_train)
    tr_error = mean_squared_error(y_train, y_pred_train)
    y_pred_val = dect_regr.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred_val)
    
    dect_tr_errors.append(tr_error)
    dect_val_errors.append(val_error)

In [19]:
errors = {"max_depth":max_depths,
          "dec_train_errors":dect_tr_errors,
          "dec_val_errors":dect_val_errors,
         }
pd.DataFrame(errors)

Unnamed: 0,max_depth,dec_train_errors,dec_val_errors
0,1,1764.899161,1478.458713
1,2,1622.03031,1448.206304
2,3,1543.018296,1483.456579
3,4,1481.048195,1479.838727
4,5,1417.541831,1459.466156
5,6,1378.31879,1564.981063
6,7,1337.70663,1624.318304
7,8,1287.97109,1724.230414
8,9,1269.91373,1728.023332
