# Student Scores: Data Wrangling

# Data Collection
Load required packages and modules into Python. Then load the data into a pandas dataframe.

In [1]:
# load python packages from environment
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline



**Prepare the file paths used to load and save the csv data. Save the current working directory and its parent folder (the project root).**

In [2]:
# Directory the notebook is running from, and the project folder one level above it.
path = os.getcwd()
parent = os.path.split(path)[0]
print(parent)

/Users/tiffanyflor/Dropbox/MyProjects/Student Scores


In [3]:
# Folder holding the raw source files, located under the project folder.
# os.path.join inserts the platform's separator instead of a hard-coded '/'.
data_path = os.path.join(parent, 'data', 'raw')
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run.
sorted(os.listdir(data_path))

['StudentsPerformance.csv']

## Load the data from the csv file
Data from [Kaggle](https://www.kaggle.com/spscientist/students-performance-in-exams)

In [4]:
# Read the raw csv into a DataFrame. The file path is assembled with
# os.path.join rather than by concatenating a hard-coded '/' separator.
df = pd.read_csv(os.path.join(data_path, 'StudentsPerformance.csv'))
# Preview the first five rows.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# Data Organization

In [5]:
# Folder to hold data that has been edited and is to be used in future steps.
# makedirs(..., exist_ok=True) creates the folder when it is missing and does
# nothing when it already exists, so the cell can stay active and be re-run
# safely instead of being commented out after the first run.
os.makedirs(os.path.join(parent, 'data', 'interim'), exist_ok=True)

In [6]:
# Folder to hold visuals such as graphs.
# The folder name is joined to the project path with a separator; concatenating
# parent+'visuals' would instead append the name to the project folder's own
# name and create a sibling directory.
# exist_ok=True makes the call safe to re-run when the folder already exists.
os.makedirs(os.path.join(parent, 'visuals'), exist_ok=True)

# Data Definition
Review which columns are integer, float, categorical, or dates. Ensure the data type is loaded properly into the dataframe.

## Column Names

In [7]:
# Column labels of the loaded frame.
df.columns

Index(['gender', 'race/ethnicity', 'parental level of education', 'lunch',
       'test preparation course', 'math score', 'reading score',
       'writing score'],
      dtype='object')

## Data Types

In [8]:
# Per-column dtypes, to check that each column was loaded with the expected type.
df.dtypes

gender                         object
race/ethnicity                 object
parental level of education    object
lunch                          object
test preparation course        object
math score                      int64
reading score                   int64
writing score                   int64
dtype: object

In [9]:
# Frame summary: index range, non-null count and dtype per column, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


# Data Cleaning

## Examine NAs

In [10]:
# Count the missing entries in every column.
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

## Examine value counts of object types
Map the values of parental level of education and test preparation course to ranking integers.
Keep the others as objects to be changed to categorical in next notebook. These will eventually be converted into dummy variables.

In [11]:
# Frequency of each gender label.
df['gender'].value_counts()

female    518
male      482
Name: gender, dtype: int64

In [12]:
# Tally each race/ethnicity group, then add the tallies together into one total.
# NOTE(review): the trailing .sum() hides the per-group breakdown that the
# sibling cells display; drop it if the group counts were the intent — confirm.
df['race/ethnicity'].value_counts().sum()

1000

In [13]:
# Frequency of each parental education label.
df['parental level of education'].value_counts()

some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64

In [14]:
# Ordinal encoding of parental education, from lowest (0) to highest (5) level.
education_map = {
    'some high school': 0,
    'high school': 1,
    'some college': 2,
    "associate's degree": 3,
    "bachelor's degree": 4,
    "master's degree": 5,
}

# Only encode while the column still holds the text labels. Running this cell a
# second time would otherwise look up the integer ranks in a dict keyed by
# strings, and .map() turns every key it cannot find into NaN.
if not pd.api.types.is_numeric_dtype(df['parental level of education']):
    education_ranks = df['parental level of education'].map(education_map)
    # Fail loudly instead of silently storing NaN for a label missing from the
    # map; entries that were already missing before the mapping are not flagged.
    unmapped_education = (
        df.loc[education_ranks.isna(), 'parental level of education'].dropna().unique()
    )
    if len(unmapped_education) > 0:
        raise ValueError(
            f'No rank defined for education level(s): {list(unmapped_education)}'
        )
    df['parental level of education'] = education_ranks

In [15]:
# Sanity check on the encoding: value_counts() leaves out NaN by default, so the
# summed tallies equal the number of rows that received a rank.
df['parental level of education'].value_counts().sum()

1000

In [16]:
# Frequency of each lunch label.
df['lunch'].value_counts()

standard        645
free/reduced    355
Name: lunch, dtype: int64

In [17]:
# Binary encoding of the test preparation course: 'none' -> 0, 'completed' -> 1.
# Only encode while the column still holds the text labels; re-running the
# mapping on the already encoded integers would find no matching keys and
# .map() would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df['test preparation course']):
    prep_flags = df['test preparation course'].map({'none': 0, 'completed': 1})
    # Fail loudly instead of silently storing NaN for an unexpected label;
    # entries that were already missing before the mapping are not flagged.
    unmapped_prep = (
        df.loc[prep_flags.isna(), 'test preparation course'].dropna().unique()
    )
    if len(unmapped_prep) > 0:
        raise ValueError(
            f'No code defined for test preparation value(s): {list(unmapped_prep)}'
        )
    df['test preparation course'] = prep_flags

In [18]:
# Frequency of each value in the test preparation course column.
df['test preparation course'].value_counts()

0    642
1    358
Name: test preparation course, dtype: int64

In [19]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,4,standard,0,72,72,74
1,female,group C,2,standard,1,69,90,88
2,female,group B,5,standard,0,90,95,93
3,male,group A,3,free/reduced,0,47,57,44
4,male,group C,2,standard,0,76,78,75


## Export data to new csv file

In [20]:
# Write the cleaned frame to the interim data folder. The folder is created on
# demand (a no-op when it already exists) so the export does not fail on a
# checkout where data/interim has not been created yet.
interim_dir = os.path.join(parent, 'data', 'interim')
os.makedirs(interim_dir, exist_ok=True)
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; left unchanged so the file layout stays the same for whatever reads
# it next — confirm whether index=False is wanted.
df.to_csv(os.path.join(interim_dir, 'cleaned_student_scores.csv'))