# Student Scores: Pre-Processing and Training

# Load Required Python Packages

In [50]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

**Locate and print current working directory and parent directory.**

In [51]:
# Working directory of the running notebook, and the directory one level above it
# (used further down as the base when building data paths).
path = os.getcwd()
parent = os.path.split(path)[0]
print(path, parent, sep='\n')

/Users/tiffanyflor/Dropbox/MyProjects/Student Scores/notebooks
/Users/tiffanyflor/Dropbox/MyProjects/Student Scores


**Print contents of data/interim.**

In [52]:
# List the files available in the interim data folder under the parent directory.
interim_dir = parent + '/data/interim'
os.listdir(interim_dir)

['total_student_scores_without_dummies.csv',
 'cleaned_student_scores.csv',
 'student_scores_no_outliers.csv',
 'total_student_scores_dummies.csv']

# Load Data

In [53]:
# Read the interim scores table; the first CSV column is used as the row index.
scores_csv = parent + '/data/interim/total_student_scores_without_dummies.csv'
df = pd.read_csv(scores_csv, index_col=0)

In [54]:
# Column names, non-null counts and dtypes of the loaded frame
# (shows which columns are still object-typed and need encoding).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   gender                       1000 non-null   object 
 1   race/ethnicity               1000 non-null   object 
 2   parental level of education  1000 non-null   int64  
 3   lunch                        1000 non-null   object 
 4   test preparation course      1000 non-null   int64  
 5   mean score                   1000 non-null   float64
dtypes: float64(1), int64(2), object(3)
memory usage: 54.7+ KB


# Create Dummy Variables

In [55]:
# One-hot encode the categorical (object-dtype) columns; numeric columns pass
# through unchanged. The dummy dtype is pinned to uint8 because pandas >= 2.0
# returns bool dummies by default, which describe() then leaves out of its
# numeric summary and info() reports differently. uint8 was the old default,
# so this keeps the behaviour identical on every pandas version.
df_dummies = pd.get_dummies(df, dtype='uint8')

# NOTE(review): every level of each categorical is kept, so complementary
# columns such as gender_female / gender_male are perfectly (negatively)
# correlated. Consider drop_first=True if a linear model's coefficients need
# to be interpreted; it is not changed here because later cells and the saved
# CSV rely on the full set of columns.

# Pairwise Pearson correlation of all encoded columns, shown as a heat-mapped table.
corr_dummies = df_dummies.corr(method='pearson')
corr_dummies.round(2).style.background_gradient(cmap='coolwarm')

Unnamed: 0,parental level of education,test preparation course,mean score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_free/reduced,lunch_standard
parental level of education,1.0,-0.01,0.21,0.04,-0.04,-0.06,-0.07,0.04,0.02,0.06,0.02,-0.02
test preparation course,-0.01,1.0,0.26,-0.01,0.01,-0.01,-0.0,0.01,-0.06,0.06,0.02,-0.02
mean score,0.21,0.26,1.0,0.13,-0.13,-0.1,-0.08,-0.03,0.06,0.14,-0.29,0.29
gender_female,0.04,-0.01,0.13,1.0,-1.0,-0.07,0.03,0.06,-0.03,-0.02,0.02,-0.02
gender_male,-0.04,0.01,-0.13,-1.0,1.0,0.07,-0.03,-0.06,0.03,0.02,-0.02,0.02
race/ethnicity_group A,-0.06,-0.01,-0.1,-0.07,0.07,1.0,-0.15,-0.21,-0.19,-0.13,0.03,-0.03
race/ethnicity_group B,-0.07,-0.0,-0.08,0.03,-0.03,-0.15,1.0,-0.33,-0.29,-0.2,0.01,-0.01
race/ethnicity_group C,0.04,0.01,-0.03,0.06,-0.06,-0.21,-0.33,1.0,-0.41,-0.28,0.0,-0.0
race/ethnicity_group D,0.02,-0.06,0.06,-0.03,0.03,-0.19,-0.29,-0.41,1.0,-0.24,0.01,-0.01
race/ethnicity_group E,0.06,0.06,0.14,-0.02,0.02,-0.13,-0.2,-0.28,-0.24,1.0,-0.05,0.05


In [56]:
# Print the shape of the original (pre-encoding) frame, then preview the first
# rows of the one-hot encoded frame.
# NOTE(review): this prints df.shape, not df_dummies.shape — confirm which one
# was intended, since the preview below is of df_dummies.
print(df.shape)
df_dummies.head()

(1000, 6)


Unnamed: 0,parental level of education,test preparation course,mean score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_free/reduced,lunch_standard
0,4,0,72.7,1,0,0,1,0,0,0,0,1
1,2,1,82.3,1,0,0,0,1,0,0,0,1
2,5,0,92.7,1,0,0,1,0,0,0,0,1
3,3,0,49.3,0,1,1,0,0,0,0,1,0
4,2,0,76.3,0,1,0,0,1,0,0,0,1


In [58]:
# Summary statistics of the encoded frame, transposed so each feature is a row.
df_dummies.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
parental level of education,1000.0,2.081,1.460333,0.0,1.0,2.0,3.0,5.0
test preparation course,1000.0,0.358,0.479652,0.0,0.0,0.0,1.0,1.0
mean score,1000.0,67.7698,14.257197,9.0,58.3,68.3,77.7,100.0
gender_female,1000.0,0.518,0.499926,0.0,0.0,1.0,1.0,1.0
gender_male,1000.0,0.482,0.499926,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group A,1000.0,0.089,0.284886,0.0,0.0,0.0,0.0,1.0
race/ethnicity_group B,1000.0,0.19,0.392497,0.0,0.0,0.0,0.0,1.0
race/ethnicity_group C,1000.0,0.319,0.466322,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group D,1000.0,0.262,0.439943,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group E,1000.0,0.14,0.347161,0.0,0.0,0.0,0.0,1.0


In [59]:
# Column names, non-null counts and dtypes of the encoded frame.
df_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   parental level of education  1000 non-null   int64  
 1   test preparation course      1000 non-null   int64  
 2   mean score                   1000 non-null   float64
 3   gender_female                1000 non-null   uint8  
 4   gender_male                  1000 non-null   uint8  
 5   race/ethnicity_group A       1000 non-null   uint8  
 6   race/ethnicity_group B       1000 non-null   uint8  
 7   race/ethnicity_group C       1000 non-null   uint8  
 8   race/ethnicity_group D       1000 non-null   uint8  
 9   race/ethnicity_group E       1000 non-null   uint8  
 10  lunch_free/reduced           1000 non-null   uint8  
 11  lunch_standard               1000 non-null   uint8  
dtypes: float64(1), int64(2), uint8(9)
memory usage: 40.0 KB


## Save DataFrame with Dummies

In [60]:
# Persist the encoded frame (row index included) to the interim data folder.
dummies_csv = parent + '/data/interim/total_student_scores_dummies.csv'
df_dummies.to_csv(dummies_csv)

# One Hot Label Encoder

# Split into Train and Test Data

In [61]:
from sklearn.model_selection import train_test_split

# Target is the mean score; every remaining column is a feature.
y = df_dummies['mean score']
X = df_dummies.drop(columns='mean score')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=27,
)

In [62]:
# Summary statistics for every column of the encoded frame (the full dataset,
# not only the training split), transposed so each feature is a row.
df_dummies.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
parental level of education,1000.0,2.081,1.460333,0.0,1.0,2.0,3.0,5.0
test preparation course,1000.0,0.358,0.479652,0.0,0.0,0.0,1.0,1.0
mean score,1000.0,67.7698,14.257197,9.0,58.3,68.3,77.7,100.0
gender_female,1000.0,0.518,0.499926,0.0,0.0,1.0,1.0,1.0
gender_male,1000.0,0.482,0.499926,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group A,1000.0,0.089,0.284886,0.0,0.0,0.0,0.0,1.0
race/ethnicity_group B,1000.0,0.19,0.392497,0.0,0.0,0.0,0.0,1.0
race/ethnicity_group C,1000.0,0.319,0.466322,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group D,1000.0,0.262,0.439943,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group E,1000.0,0.14,0.347161,0.0,0.0,0.0,0.0,1.0


# Rescale Numeric Features to a Common Range Using a Min-Max Scaler

In [14]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler: rescales each feature using the minimum and maximum seen
# during fit (default output range is [0, 1]).
scaler = MinMaxScaler()

# Fit on the training split only, so the test split does not influence the
# scaling parameters, then apply the same transform to both splits.
# The scaled data is stored under new names, with the original column labels
# and row index restored. X_train / X_test stay untouched, so cells that use
# the unscaled frames keep working exactly as before.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [69]:
# Display the held-out feature matrix (pandas truncates the middle rows).
X_test

Unnamed: 0,parental level of education,test preparation course,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,lunch_free/reduced,lunch_standard
754,3,0,0,1,0,0,1,0,0,1,0
768,0,0,1,0,0,0,0,1,0,0,1
23,0,0,1,0,0,0,1,0,0,0,1
47,1,0,1,0,0,0,1,0,0,0,1
453,2,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
480,1,1,0,1,0,1,0,0,0,0,1
992,3,0,1,0,0,0,0,1,0,1,0
783,3,1,1,0,0,0,1,0,0,0,1
201,2,0,1,0,0,0,0,1,0,1,0


In [63]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the training split.
regression = LinearRegression()
regression.fit(X_train, y_train)

# For regressors, .score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the values are labelled accordingly.
print("Train R^2:", regression.score(X_train, y_train))
print("Test R^2:", regression.score(X_test, y_test))

Train Accuracy: 0.2529961577201669
Test Accuracy: 0.16259284332078217


In [73]:
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Shallow, regularised random forest (despite the `_clf` name this is a regressor).
# max_features and criterion are left at their defaults on purpose: the old
# spellings 'auto' and 'mse' have been removed from scikit-learn, and the
# defaults mean the same thing on every version (all features per split,
# squared-error criterion). random_state reuses the train/test split seed so
# the fitted forest, and the metrics below, are reproducible.
tree_clf = RandomForestRegressor(max_depth=4,
                                 min_samples_split=50,
                                 random_state=27).fit(X_train, y_train)

# Root-mean-squared error on the held-out split. The square root is taken
# explicitly because the `squared=False` flag of mean_squared_error is
# deprecated/removed in recent scikit-learn releases.
# The variable keeps its original name for compatibility, but it holds the RMSE.
MSE_test = np.sqrt(mean_squared_error(y_true=y_test, y_pred=tree_clf.predict(X_test)))
print(f"RMSE for test set: {MSE_test}")

RMSE for test set: 12.710378713559065


In [74]:
# R^2 (coefficient of determination) of the shallow forest on the held-out split.
score = tree_clf.score(X_test, y_test)
score

0.11262065482813988

In [70]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters. The seed (same value as the
# train/test split) makes the bootstrap samples, and therefore the fitted model
# and its score, reproducible across runs.
model = RandomForestRegressor(random_state=27)

# Fit on the training split.
model.fit(X_train, y_train)

RandomForestRegressor()

In [71]:

# Predicted mean scores for the held-out rows.
y_preds = model.predict(X_test)

In [72]:
# R^2 of the default forest on the held-out split. A negative value means the
# model does worse on this split than always predicting the mean of y_test.
model.score(X_test, y_test)

-0.034201109636332605