# Student Scores: Pre-Processing and Training

# Load Required Python Packages

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline



**Locate and print current working directory and parent directory.**

In [2]:
# Working directory of the notebook, and the directory one level above it
# (used below as the base for the data/ paths).
path = os.getcwd()
parent = os.path.dirname(path)

# Show both locations, one per line.
print(path, parent, sep='\n')

/Users/tiffanyflor/Dropbox/MyProjects/Student Scores/notebooks
/Users/tiffanyflor/Dropbox/MyProjects/Student Scores


**Print contents of data/interim.**

In [3]:
# List the files currently stored in the interim data directory.
interim_dir = os.path.join(parent, 'data', 'interim')
os.listdir(interim_dir)

['total_student_scores_without_dummies.csv',
 'cleaned_student_scores.csv',
 'student_scores_no_outliers.csv',
 'total_student_scores_dummies.csv',
 '.ipynb_checkpoints']

# Load Data

In [4]:
# Read the interim dataset (the version without dummy columns); the first CSV
# column holds the row labels and becomes the index.
scores_csv = os.path.join(parent, 'data', 'interim', 'total_student_scores_without_dummies.csv')
df = pd.read_csv(scores_csv, index_col=0)

In [5]:
# Preview the first 20 rows to sanity-check the load.
df.head(20)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,mean score,education,Pass Mean Exam,Pass Math,Pass Reading,Pass Writing
0,female,group B,bachelor's degree,standard,0,72.7,1,pass,pass,pass,pass
1,female,group C,some college,standard,1,82.3,1,pass,fail,pass,pass
2,female,group B,master's degree,standard,0,92.7,1,pass,pass,pass,pass
3,male,group A,associate's degree,free/reduced,0,49.3,1,fail,fail,fail,fail
4,male,group C,some college,standard,0,76.3,1,pass,pass,pass,pass
5,female,group B,associate's degree,standard,0,77.3,1,pass,pass,pass,pass
6,female,group B,some college,standard,1,91.7,1,pass,pass,pass,pass
7,male,group B,some college,free/reduced,0,40.7,1,fail,fail,fail,fail
8,male,group D,high school,free/reduced,1,65.0,0,fail,fail,fail,fail
9,female,group B,high school,free/reduced,0,49.3,0,fail,fail,fail,fail


In [7]:
# Columns holding a small set of repeated labels; store them as pandas
# 'category' dtype instead of generic objects.
categorical_cols = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'Pass Mean Exam',
    'Pass Math',
    'Pass Reading',
    'Pass Writing',
]
df = df.astype({col: 'category' for col in categorical_cols})

In [8]:
# Confirm the dtype conversion: list each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9950 entries, 0 to 9949
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   gender                       9950 non-null   category
 1   race/ethnicity               9950 non-null   category
 2   parental level of education  9950 non-null   category
 3   lunch                        9950 non-null   category
 4   test preparation course      9950 non-null   int64   
 5   mean score                   9950 non-null   float64 
 6   education                    9950 non-null   int64   
 7   Pass Mean Exam               9950 non-null   category
 8   Pass Math                    9950 non-null   category
 9   Pass Reading                 9950 non-null   category
 10  Pass Writing                 9950 non-null   category
dtypes: category(8), float64(1), int64(2)
memory usage: 389.6 KB


In [9]:
# Remove the raw 'mean score' column before building the feature matrix
# (presumably because the pass/fail targets are derived from it -- confirm).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises a KeyError once
# the column is already gone.
df = df.drop(columns=['mean score'], errors='ignore')

In [10]:
# The four pass/fail outcome columns are the prediction targets; everything
# else in df is a feature.
target_cols = ['Pass Mean Exam', 'Pass Math', 'Pass Reading', 'Pass Writing']

X = df.drop(columns=target_cols)
y_mean, y_math, y_reading, y_writing = (df[col] for col in target_cols)

# Create Dummy Variables

In [11]:
# One-hot encode categorical features. drop_first=True omits the first level
# of each category so the remaining indicators are not perfectly collinear.
# dtype is pinned to uint8 so the indicators stay 0/1 integers on every pandas
# version: pandas >= 2.0 defaults to bool, which would be written to the saved
# CSV as True/False instead of 1/0.
X = pd.get_dummies(X, drop_first=True, dtype=np.uint8)

# Pearson correlation between the encoded features, rendered as a
# colour-graded table (rounded to two decimals for readability).
corr_dummies = X.corr(method='pearson')
corr_dummies.round(2).style.background_gradient(cmap='coolwarm')

Unnamed: 0,test preparation course,education,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard
test preparation course,1.0,0.0,0.02,-0.0,0.02,-0.02,0.0,0.01,-0.0,0.01,0.0,-0.0,0.0
education,0.0,1.0,-0.02,-0.01,-0.01,-0.01,0.03,0.29,-0.63,0.22,0.43,-0.61,0.0
gender_male,0.02,-0.02,1.0,0.0,-0.01,-0.0,0.0,-0.02,0.01,0.01,-0.0,0.01,-0.01
race/ethnicity_group B,-0.0,-0.01,0.0,1.0,-0.34,-0.3,-0.2,-0.01,0.01,0.01,-0.0,-0.0,0.02
race/ethnicity_group C,0.02,-0.01,-0.01,-0.34,1.0,-0.4,-0.27,-0.0,0.0,0.01,-0.0,0.01,-0.02
race/ethnicity_group D,-0.02,-0.01,-0.0,-0.3,-0.4,1.0,-0.24,-0.01,-0.01,-0.0,-0.01,0.02,0.01
race/ethnicity_group E,0.0,0.03,0.0,-0.2,-0.27,-0.24,1.0,0.01,-0.01,-0.0,0.02,-0.02,0.0
parental level of education_bachelor's degree,0.01,0.29,-0.02,-0.01,-0.0,-0.01,0.01,1.0,-0.18,-0.1,-0.2,-0.17,0.0
parental level of education_high school,-0.0,-0.63,0.01,0.01,0.0,-0.01,-0.01,-0.18,1.0,-0.14,-0.27,-0.24,-0.01
parental level of education_master's degree,0.01,0.22,0.01,0.01,0.01,-0.0,-0.0,-0.1,-0.14,1.0,-0.15,-0.13,-0.0


In [12]:
# Prints the shape of `df` (features before encoding plus the four target
# columns), while the preview below shows the encoded feature matrix `X`.
# NOTE(review): if the intent was to report the encoded matrix size, this
# should probably be X.shape -- confirm.
print(df.shape)
X.head()

(9950, 10)


Unnamed: 0,test preparation course,education,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard
0,0,1,0,1,0,0,0,1,0,0,0,0,1
1,1,1,0,0,1,0,0,0,0,0,1,0,1
2,0,1,0,1,0,0,0,0,0,1,0,0,1
3,0,1,1,0,0,0,0,0,0,0,0,0,0
4,0,1,1,0,1,0,0,0,0,0,1,0,1


In [13]:
# Summary statistics for every encoded feature, transposed so that each
# feature is a row.
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
test preparation course,9950.0,0.34201,0.474407,0.0,0.0,0.0,1.0,1.0
education,9950.0,0.617387,0.486049,0.0,0.0,1.0,1.0,1.0
gender_male,9950.0,0.504623,0.500004,0.0,0.0,1.0,1.0,1.0
race/ethnicity_group B,9950.0,0.199799,0.399869,0.0,0.0,0.0,0.0,1.0
race/ethnicity_group C,9950.0,0.314774,0.464449,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group D,9950.0,0.262915,0.440239,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group E,9950.0,0.139598,0.346587,0.0,0.0,0.0,0.0,1.0
parental level of education_bachelor's degree,9950.0,0.117588,0.322136,0.0,0.0,0.0,0.0,1.0
parental level of education_high school,9950.0,0.196281,0.397204,0.0,0.0,0.0,0.0,1.0
parental level of education_master's degree,9950.0,0.071759,0.258101,0.0,0.0,0.0,0.0,1.0


In [14]:
# Check dtypes and non-null counts of the encoded feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9950 entries, 0 to 9949
Data columns (total 13 columns):
 #   Column                                         Non-Null Count  Dtype
---  ------                                         --------------  -----
 0   test preparation course                        9950 non-null   int64
 1   education                                      9950 non-null   int64
 2   gender_male                                    9950 non-null   uint8
 3   race/ethnicity_group B                         9950 non-null   uint8
 4   race/ethnicity_group C                         9950 non-null   uint8
 5   race/ethnicity_group D                         9950 non-null   uint8
 6   race/ethnicity_group E                         9950 non-null   uint8
 7   parental level of education_bachelor's degree  9950 non-null   uint8
 8   parental level of education_high school        9950 non-null   uint8
 9   parental level of education_master's degree    9950 non-null   uint8
 10  

## Save DataFrame with Dummies

In [16]:
# Put the four target columns back next to the encoded features and write the
# combined table to the interim data folder (the row index is written too).
Xy_dummies = pd.concat([X, y_mean, y_math, y_reading, y_writing], axis='columns')
dummies_csv = os.path.join(parent, 'data', 'interim', 'total_student_scores_dummies.csv')
Xy_dummies.to_csv(dummies_csv)

# Split into Train and Test Data

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The target used for this split is
# the overall 'Pass Mean Exam' label (y_mean); the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_mean,
    test_size=0.2,
    random_state=27,
)

In [19]:
# Examine summary statistics of the features. This describes the full
# matrix X (train and test rows together), not just X_train.
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
test preparation course,9950.0,0.34201,0.474407,0.0,0.0,0.0,1.0,1.0
education,9950.0,0.617387,0.486049,0.0,0.0,1.0,1.0,1.0
gender_male,9950.0,0.504623,0.500004,0.0,0.0,1.0,1.0,1.0
race/ethnicity_group B,9950.0,0.199799,0.399869,0.0,0.0,0.0,0.0,1.0
race/ethnicity_group C,9950.0,0.314774,0.464449,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group D,9950.0,0.262915,0.440239,0.0,0.0,0.0,1.0,1.0
race/ethnicity_group E,9950.0,0.139598,0.346587,0.0,0.0,0.0,0.0,1.0
parental level of education_bachelor's degree,9950.0,0.117588,0.322136,0.0,0.0,0.0,0.0,1.0
parental level of education_high school,9950.0,0.196281,0.397204,0.0,0.0,0.0,0.0,1.0
parental level of education_master's degree,9950.0,0.071759,0.258101,0.0,0.0,0.0,0.0,1.0


# Standardize the Magnitude of Numeric Features Using a Scaler (NOT applied to this dataset, because it is made up of one-hot encoded features)

In [20]:
# Reference only: kept to show how scaling would be done on a dataset that
# needs it. Nothing in this cell alters X_train or X_test.
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Create the scaler object (left unfitted here)
scaler = StandardScaler()

# To apply it: fit the scaler on the training split only, then reuse the
# fitted scaler to transform the test split
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

In [21]:
# Preview the first rows of the held-out test features.
X_test.head()

Unnamed: 0,test preparation course,education,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard
6379,0,1,0,0,0,0,1,0,0,0,1,0,0
511,0,0,1,0,0,0,0,0,0,0,0,1,1
6654,0,0,1,0,0,0,1,0,0,0,0,1,1
6906,1,1,1,1,0,0,0,0,0,0,0,0,0
2279,0,1,0,1,0,0,0,0,0,1,0,0,1
