In [1]:
# Add the pandas dependency.
import pandas as pd

In [2]:
# File to load. A forward slash is used as the path separator because it is
# accepted on Windows, macOS and Linux alike, and it avoids the invalid "\m"
# escape sequence that a backslash creates inside a normal string literal.
file_to_load = 'Resources/missing_grades.csv'

In [3]:
# Load the CSV located at `file_to_load` into a DataFrame, then echo the
# DataFrame as the cell's output.
missing_grades_df = pd.read_csv(filepath_or_buffer=file_to_load)
missing_grades_df

Unnamed: 0,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Paul Bradley,M,9th,66.0,79.0
1,1,Victor Smith,M,12th,94.0,61.0
2,2,Kevin Rodriguez,M,12th,,60.0
3,3,Dr. Richard Scott,M,12th,67.0,58.0
4,4,Bonnie Ray,F,9th,97.0,84.0
5,5,Bryan Miranda,M,9th,94.0,
6,6,Sheena Carter,F,11th,82.0,80.0
7,7,Nicole Baker,F,12th,96.0,69.0


## Find Missing Data

In [4]:
# Count the non-null entries in each column; a column whose count is lower
# than the others contains missing values.
missing_grades_df.count(axis=0)

Student ID       8
student_name     8
gender           8
grade            8
reading_score    7
math_score       7
dtype: int64

## Handle Missing Data

### Option 1: Do Nothing

In [5]:
# If we do nothing, the NaNs are skipped when we sum or average the reading and math scores (just as blank cells are skipped in the sum or the average in an Excel file). In this situation, the missing values have no impact.

# However, if we multiply or divide using a value that is NaN, the result will be NaN. This can cause problems if we need that result in the rest of our code.

### Option 2: Drop the Row

In [6]:
# A second option is to drop every row that contains a NaN. Removing a row
# discards all of its data, including the values in its other columns that
# were not missing, which can cause problems later if we need them.
# dropna() returns a new DataFrame and leaves missing_grades_df unchanged.
missing_grades_df.dropna(axis=0, how="any")

Unnamed: 0,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Paul Bradley,M,9th,66.0,79.0
1,1,Victor Smith,M,12th,94.0,61.0
3,3,Dr. Richard Scott,M,12th,67.0,58.0
4,4,Bonnie Ray,F,9th,97.0,84.0
6,6,Sheena Carter,F,11th,82.0,80.0
7,7,Nicole Baker,F,12th,96.0,69.0


## Important

    Dropping rows can affect the story you are trying to tell with the data. Before removing rows with NaN, you should ask yourself two key questions:

    How much data would be removed if NaNs are dropped?

    How would this impact the analysis?

    These questions need to be addressed for every dataset you work with.

### Option 3: Fill in the Row

In [8]:
# Replace every NaN in the DataFrame with the value 85.
# fillna() returns a new DataFrame; missing_grades_df itself is not modified.
missing_grades_df.fillna(value=85)

Unnamed: 0,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Paul Bradley,M,9th,66.0,79.0
1,1,Victor Smith,M,12th,94.0,61.0
2,2,Kevin Rodriguez,M,12th,85.0,60.0
3,3,Dr. Richard Scott,M,12th,67.0,58.0
4,4,Bonnie Ray,F,9th,97.0,84.0
5,5,Bryan Miranda,M,9th,94.0,85.0
6,6,Sheena Carter,F,11th,82.0,80.0
7,7,Nicole Baker,F,12th,96.0,69.0
