In [2]:
# import pandas
import pandas as pd
import os

# Build OS-independent paths to the two CSV inputs inside the Resources folder.
school_data_to_load, student_data_to_load = (
    os.path.join("Resources", csv_name)
    for csv_name in ("schools_complete.csv", "students_complete.csv")
)

# Read each CSV into its own DataFrame (schools first, then students).
school_data_df, student_data_df = map(
    pd.read_csv, (school_data_to_load, student_data_to_load)
)

In [3]:
# at-a-glance data of a DataFrame

# df.head() returns the first five rows of a DataFrame
# df.tail() returns the last five rows
# df.head(<int>) takes any integer and returns that many rows, counting from the top
# df.tail(<int>) works the same way, counting from the bottom

In [4]:
# Data cleaning

# isnull() returns a boolean DataFrame that is True wherever a value is missing;
# chaining sum() counts the number of null values in each column.
# Only the last expression of a cell is displayed, so the chained form is the
# single call needed here.
student_data_df.isnull().sum()

# Looks like no missing values!

Student ID       0
student_name     0
gender           0
grade            0
school_name      0
reading_score    0
math_score       0
dtype: int64

In [7]:
# the dtypes attribute (plural) can be called on a DataFrame to reveal datatypes
# calling df.dtypes on the dataframe returns the data type of every column
# calling .dtype (singular) on a column of a dataframe returns that column's data type

# if the column name has no spaces, you can also call df.column.dtype
# where column is the column name and df is the dataframe name

school_data_df.dtypes

School ID       int64
school_name    object
type           object
size            int64
budget          int64
dtype: object

In [8]:
# "School ID" contains a space, so the column must be selected with bracket notation
school_data_df["School ID"].dtype

dtype('int64')

In [10]:
# data type of every column in the student DataFrame
student_data_df.dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object

In [11]:
# Data Cleaning: -------------------------------

# From our exploratory analysis in cleaning_student_names.ipynb, we know the following
# prefixes/suffixes are in our dataset.

# Each prefix (with its trailing space) and suffix (with its leading space) to remove.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

# Iterate through the entries in "prefixes_suffixes" and replace each occurrence
# with an empty string, "".
# regex=False makes every entry match literally. Before pandas 2.0 the default was
# regex=True, which treated the "." in "Dr. ", "Mr. ", etc. as "any character".
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(
        word, "", regex=False
    )

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
39150,39150,Jennifer Hamilton,F,11th,Thomas High School,80,75
39151,39151,Shannon Williams,F,10th,Thomas High School,84,73
39152,39152,Lori Moore,F,9th,Thomas High School,98,84
39153,39153,William Hubbard,M,9th,Thomas High School,80,75
39154,39154,Bradley Johnson,M,12th,Thomas High School,91,71
39155,39155,John Brooks,M,10th,Thomas High School,92,98
39156,39156,Stephanie Contreras,F,11th,Thomas High School,79,95
39157,39157,Kristen Gonzalez,F,9th,Thomas High School,79,94
39158,39158,Kari Holloway,F,10th,Thomas High School,87,90
39159,39159,Kimberly Cabrera,F,11th,Thomas High School,85,72
