# Exam Performance Problem

## 1. Regression Problem

### A. Load and Process Data

In [329]:
#import necessary libraries
import pandas as pd
import numpy as np

In [330]:
#load data as pandas dataframe
main_data = pd.read_csv("./datasets/exam-preformance/StudentsPerformance.csv")
df = main_data #temp variable of data to keep original
df[:5] #preview of dataset

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [331]:
#simplify target value as average of 3 scores
df["average score"] = (df["math score"]+df["reading score"]+df["writing score"])/3
#drop 3 individual score columns and race/ethnicity column
df = df.drop(labels=["race/ethnicity","math score","reading score","writing score"],axis=1)
df[:5] #preview dataset

Unnamed: 0,gender,parental level of education,lunch,test preparation course,average score
0,female,bachelor's degree,standard,none,72.666667
1,female,some college,standard,completed,82.333333
2,female,master's degree,standard,none,92.666667
3,male,associate's degree,free/reduced,none,49.333333
4,male,some college,standard,none,76.333333


### B. Ordinal Encoding Approach

In [333]:
#replace strings with numerical values
df = df.replace("male",0)
df = df.replace("female",1)
df = df.replace("free/reduced",0)
df = df.replace("standard",1)
df = df.replace("none",0)
df = df.replace("completed",1)
df[:5] #preview dataset

Unnamed: 0,gender,parental level of education,lunch,test preparation course,average score
0,1,bachelor's degree,1,0,72.666667
1,1,some college,1,1,82.333333
2,1,master's degree,1,0,92.666667
3,0,associate's degree,0,0,49.333333
4,0,some college,1,0,76.333333


In [334]:
#get unique string values of parent education
df["parental level of education"].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [335]:
#replace parent education with numerical values (from 0 to 1)
df = df.replace("some high school",0)
df = df.replace("high school",0.2)
df = df.replace("some college",0.4)
df = df.replace("associate's degree",0.6)
df = df.replace("bachelor's degree",0.8)
df = df.replace("master's degree",1)
df[:5] #preview dataset

Unnamed: 0,gender,parental level of education,lunch,test preparation course,average score
0,1,0.8,1,0,72.666667
1,1,0.4,1,1,82.333333
2,1,1.0,1,0,92.666667
3,0,0.6,0,0,49.333333
4,0,0.4,1,0,76.333333


In [336]:
#convert pandas dataframe to numpy array
X_and_Y = df.values
#split data into feature and label vectors
X = X_and_Y[:,:-1]
Y = X_and_Y[:,-1]
print(X.shape, Y.shape)

(1000, 4) (1000,)


In [337]:
#apply linear regression and check accuracy
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X,Y)
reg.score(X,Y) #poor accuracy without event splitting data

0.21544904862664838

### C. One-hot Encoding Approach

In [338]:
#go back to using original dataset
df = main_data
df["average score"] = (df["math score"]+df["reading score"]+df["writing score"])/3
df = df.drop(labels=["math score","reading score","writing score"],axis=1)
df[:5] #preview dataset

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,average score
0,female,group B,bachelor's degree,standard,none,72.666667
1,female,group C,some college,standard,completed,82.333333
2,female,group B,master's degree,standard,none,92.666667
3,male,group A,associate's degree,free/reduced,none,49.333333
4,male,group C,some college,standard,none,76.333333


In [339]:
#now use one-hot encoding
df = pd.get_dummies(df)
df[:5]#preview dataset

Unnamed: 0,average score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,72.666667,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1
1,82.333333,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0
2,92.666667,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1
3,49.333333,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1
4,76.333333,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1


In [349]:
#convert pandas dataframe to numpy array
X_and_Y = df.values
#split data into feature and label vectors
X = X_and_Y[:,1:]
Y = X_and_Y[:,0]
print(X.shape, Y.shape)

(1000, 17) (1000,)


In [350]:
#apply linear regression and check accuracy
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X,Y)
Y_pred = reg.predict(X)
reg.score(X,Y) #still poor accuracy but slightly better

0.24219960364907145

In [351]:
#svm regressor
from sklearn.svm import SVR
svm_class = svm.SVR(kernel='linear')
svm_class.fit(X, Y)
test_acc = svm_class.score(X,Y)
print(test_acc)

0.23297698243998463


In [352]:
#decision tree regressor
from sklearn.tree import DecisionTreeRegressor
dectree_class = DecisionTreeRegressor()
dectree_class.fit(X, Y)
test_acc = dectree_class.score(X,Y)
print(test_acc)

0.3720183432788404


In [353]:
#random forest regressor
from sklearn.ensemble import RandomForestRegressor
randforest_class = RandomForestRegressor()
randforest_class.fit(X, Y)
test_acc = randforest_class.score(X,Y)
print(test_acc)

0.34999849323775306


## 2. Classification Problem

In [360]:
#shuffling data, changing regression to classification (splitting along mean)
np.random.shuffle(X_and_Y)
X = X_and_Y[:,1:]
Y = X_and_Y[:,0]
Y_mean = np.mean(Y)
Y_class = np.copy(Y)
Y_class[Y>=Y_mean] = 1
Y_class[Y<Y_mean] = -1
print(X.shape, Y.shape)

(1000, 17) (1000,)


In [361]:
#partition data (80/20)
set_split = int(Y.shape[0]*0.8)
X_train_val = X[:set_split,:] # Get features from train + val set.
X_test      = X[set_split:,:] # Get features from test set.     
Y_train_val = Y_class[:set_split] # Get labels from train + val set.
Y_test      = Y_class[set_split:] # Get labels from test set.  
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(800, 17) (200, 17) (800,) (200,)


In [362]:
#svm classifier
from sklearn import svm
svm_class = svm.SVC(kernel='linear')
svm_class.fit(X_train_val, Y_train_val)
test_acc = svm_class.score(X_train_val,Y_train_val)
print(test_acc)

0.6725


In [363]:
#logistic regression classifier
from sklearn.linear_model import LogisticRegression
logreg_class = LogisticRegression()
logreg_class.fit(X_train_val, Y_train_val)
test_acc = logreg_class.score(X_test,Y_test)
print(test_acc)

0.695


In [364]:
#decision tree classifier
from sklearn.tree import DecisionTreeClassifier
dectree_class = DecisionTreeClassifier()
dectree_class.fit(X_train_val, Y_train_val)
test_acc = dectree_class.score(X_test,Y_test)
print(test_acc)

0.585


In [365]:
#random forest classifier
from sklearn.ensemble import RandomForestClassifier
randforest_class = RandomForestClassifier()
randforest_class.fit(X_train_val, Y_train_val)
test_acc = randforest_class.score(X_test,Y_test)
print(test_acc)

0.57
