## Replace the 9th grade reading and math scores at Thomas High School with NaN. 

In [1]:
# Dependencies and Setup
import pandas as pd

# File to Load (Remember to change the path if needed.)
school_data_to_load = "resources/schools_complete.csv"
student_data_to_load = "resources/students_complete.csv"

# Read the School Data and Student Data and store into a Pandas DataFrame
school_data_df = pd.read_csv(school_data_to_load)
student_data_df = pd.read_csv(student_data_to_load)

# Cleaning Student Names and Replacing Substrings in a Python String
# Add each prefix and suffix to remove to a list.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

# Iterate through the words in the "prefixes_suffixes" list and replace them with an empty space, "".
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word,"")

# Check names.
student_data_df.tail(10)
school_data_df.head(10)

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500
5,5,Wilson High School,Charter,2283,1319574
6,6,Cabrera High School,Charter,1858,1081356
7,7,Bailey High School,District,4976,3124928
8,8,Holden High School,Charter,427,248087
9,9,Pena High School,Charter,962,585858


In [2]:
def calculatepercentage(value1, totalvalue):
    return (value1 / float(totalvalue)*100)

In [3]:
# Install numpy using conda install numpy or pip install numpy. 
# Step 1. Import numpy as np.
import numpy as np

In [4]:
#Set Student ID as index
#student_data_df.set_index("Student ID",inplace=True)
# Step 2. Use the loc method on the student_data_df to select all the reading scores from the 9th grade at Thomas High School and replace them with NaN.

student_data_df.loc[(student_data_df['school_name'] == 'Thomas High School') & (student_data_df['grade'] == "9th") & (student_data_df['reading_score'] >= 0), "reading_score"] = np.nan

In [5]:
#  Step 3. Refactor the code in Step 2 to replace the math scores with NaN.
student_data_df.loc[(student_data_df['school_name'] == 'Thomas High School') & (student_data_df['grade'] == "9th") & (student_data_df['math_score'] >= 0), "math_score"] = np.nan

In [6]:
#  Step 4. Check the student data for NaN's. 
student_data_df.tail(10)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
39160,39160,Katie Weaver,F,11th,Thomas High School,89.0,86.0
39161,39161,April Reyes,F,10th,Thomas High School,70.0,84.0
39162,39162,Derek Weeks,M,12th,Thomas High School,94.0,77.0
39163,39163,John Reese,M,11th,Thomas High School,90.0,75.0
39164,39164,Joseph Anthony,M,9th,Thomas High School,,
39165,39165,Donna Howard,F,12th,Thomas High School,99.0,90.0
39166,39166,Dawn Bell,F,10th,Thomas High School,95.0,70.0
39167,39167,Rebecca Tanner,F,9th,Thomas High School,,
39168,39168,Desiree Kidd,F,10th,Thomas High School,99.0,90.0
39169,39169,Carolyn Jackson,F,11th,Thomas High School,95.0,75.0


In [7]:
#Fill NaNs with 0s
student_data_df["corrected_reading_score"] = student_data_df["reading_score"].fillna(0)
student_data_df["corrected_math_score"] = student_data_df["math_score"].fillna(0)
#student_data_df.tail(10)

In [8]:
total_school_count = school_data_df["school_name"].count()
total_students_count = student_data_df["Student ID"].count()

In [9]:
#Get Data for District Summary
total_school_budget = school_data_df["budget"].sum()
corrected_average_math_score = student_data_df["math_score"].mean()
corrected_average_reading_score = student_data_df["reading_score"].mean()

#Calculate passing math percentage 
passing_math_score = student_data_df[student_data_df["math_score"] >= 70]
passing_math_percentage = calculatepercentage(passing_math_score.count(),total_students_count)["Student ID"]

#Calculate passing reading percentage 
passing_reading_score = student_data_df[student_data_df["reading_score"] >= 70]
passing_reading_percentage = calculatepercentage(passing_reading_score.count(),total_students_count)["Student ID"]

#Calculate overall passing percentage 
overall_passing_score = student_data_df[(student_data_df["math_score"] >= 70) & (student_data_df["reading_score"] >= 70)]
overall_passing_percentage = calculatepercentage(overall_passing_score.count(),total_students_count)["Student ID"]
print("---------The District Summary--------------")
district_summary_df = pd.DataFrame([{
    "Total Schools": total_school_count,
    "Total Students": ("{:,}".format(total_students_count)),
    "Total Budget": ("${:,.2f}".format(total_school_budget)),
    "Average Math Score": ("{:.1f}".format(corrected_average_math_score)),
    "Average Reading Score": ("{:.1f}".format(corrected_average_reading_score)),
    "% Passing Math": ("{:.0f}".format(passing_math_percentage)),
    "% Passing Reading": ("{:.0f}".format(passing_reading_percentage)),
    "% Overall Passing": ("{:.0f}".format(overall_passing_percentage))
}])
district_summary_df

---------The District Summary--------------


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.9,81.9,74,85,64


In [10]:
# Merge the two data sets to get school wide report
complete_students_school_df = pd.merge(school_data_df,student_data_df, on=["school_name","school_name"])
#complete_students_school_df.tail(10)

In [11]:
# Get Data for Per School Summary
per_school_df = pd.DataFrame()
per_school_df["School Type"] = school_data_df.set_index("school_name")["type"]
per_school_df["Total Students"] = school_data_df.set_index("school_name")["size"]
per_school_df["Total School Budget"] = school_data_df.set_index("school_name")["budget"]
per_school_df["Per Student Capita"] = per_school_df["Total School Budget"]/per_school_df["Total Students"]
per_school_df["Average Math Score"] = complete_students_school_df.groupby(["school_name"]).mean()["math_score"]
per_school_df["Average Reading Score"] = complete_students_school_df.groupby(["school_name"]).mean()["reading_score"]
per_school_df_passing_math_score = passing_math_score.groupby(["school_name"]).count()["Student ID"]
per_school_df["% Passing Math"] = per_school_df_passing_math_score / per_school_df["Total Students"] * 100 
per_school_df_passing_reading_score = passing_reading_score.groupby(["school_name"]).count()["Student ID"]
per_school_df["% Passing Reading"] = per_school_df_passing_reading_score / per_school_df["Total Students"] * 100 
per_school_df_overall_passing_score = overall_passing_score.groupby(["school_name"]).count()["Student ID"]
per_school_df["% Overall Passing"] = per_school_df_overall_passing_score / per_school_df["Total Students"] * 100 
per_school_df = per_school_df.sort_values(["school_name"], ascending=True)

In [12]:
#Format per school data summary
print("-----------------Per School Summary-------------------")
per_school_df_formatted = pd.DataFrame({
            "School Type": per_school_df["School Type"], 
            "Total Students": per_school_df["Total Students"].map("{:,}".format),
            "Total Budget": per_school_df["Total School Budget"].map("${:,.2f}".format),
            "Average Math Score": per_school_df["Average Math Score"].map("{:.1f}".format),
            "Average Reading Score": per_school_df["Average Reading Score"].map("{:.1f}".format),
            "% Passing Math": per_school_df["% Passing Math"].map("{:.0f}".format),
            "% Passing Reading": per_school_df["% Passing Reading"].map("{:.0f}".format),
            "% Overall Passing": per_school_df["% Overall Passing"].map("{:.0f}".format),
            
})
per_school_df_formatted

-----------------Per School Summary-------------------


Unnamed: 0_level_0,School Type,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bailey High School,District,4976,"$3,124,928.00",77.0,81.0,67,82,55
Cabrera High School,Charter,1858,"$1,081,356.00",83.1,84.0,94,97,91
Figueroa High School,District,2949,"$1,884,411.00",76.7,81.2,66,81,53
Ford High School,District,2739,"$1,763,916.00",77.1,80.7,68,79,54
Griffin High School,Charter,1468,"$917,500.00",83.4,83.8,93,97,91
Hernandez High School,District,4635,"$3,022,020.00",77.3,80.9,67,81,54
Holden High School,Charter,427,"$248,087.00",83.8,83.8,93,96,89
Huang High School,District,2917,"$1,910,635.00",76.6,81.2,66,81,54
Johnson High School,District,4761,"$3,094,650.00",77.1,81.0,66,81,54
Pena High School,Charter,962,"$585,858.00",83.8,84.0,95,96,91


In [25]:
# Print 5 Highest performing schools
print("----------------------5 Highest performing schools--------------------------")
per_school_df_formatted.sort_values(["% Overall Passing"],ascending=False).head()

----------------------5 Highest performing schools--------------------------


Unnamed: 0_level_0,School Type,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Cabrera High School,Charter,1858,"$1,081,356.00",83.1,84.0,94,97,91
Griffin High School,Charter,1468,"$917,500.00",83.4,83.8,93,97,91
Pena High School,Charter,962,"$585,858.00",83.8,84.0,95,96,91
Wilson High School,Charter,2283,"$1,319,574.00",83.3,84.0,94,97,91
Shelton High School,Charter,1761,"$1,056,600.00",83.4,83.7,94,96,90


In [14]:
# Print 5 Least performing schools
print("----------------------5 Least performing schools--------------------------")
per_school_df_formatted.sort_values(["% Overall Passing"],ascending=False).tail()

----------------------5 Least performing schools--------------------------


Unnamed: 0_level_0,School Type,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Hernandez High School,District,4635,"$3,022,020.00",77.3,80.9,67,81,54
Huang High School,District,2917,"$1,910,635.00",76.6,81.2,66,81,54
Johnson High School,District,4761,"$3,094,650.00",77.1,81.0,66,81,54
Figueroa High School,District,2949,"$1,884,411.00",76.7,81.2,66,81,53
Rodriguez High School,District,3999,"$2,547,363.00",76.8,80.7,66,80,53


In [15]:
#Get Grade wise Average math scores
grade_wise_avg_math_scores_df = pd.DataFrame()
grade_wise_avg_math_scores_df["9th"] = complete_students_school_df[(complete_students_school_df["grade"] == "9th")].groupby(["school_name"]).mean()["math_score"]
grade_wise_avg_math_scores_df["10th"] = complete_students_school_df[(complete_students_school_df["grade"] == "10th")].groupby(["school_name"]).mean()["math_score"]
grade_wise_avg_math_scores_df["11th"] = complete_students_school_df[(complete_students_school_df["grade"] == "11th")].groupby(["school_name"]).mean()["math_score"]
grade_wise_avg_math_scores_df["12th"] = complete_students_school_df[(complete_students_school_df["grade"] == "12th")].groupby(["school_name"]).mean()["math_score"]

In [16]:
#Print Grade Wise Average Math Score
grade_wise_avg_math_scores_formatted_df = pd.DataFrame({
    "9th": grade_wise_avg_math_scores_df["9th"].map("{:.1f}".format),
    "10th": grade_wise_avg_math_scores_df["10th"].map("{:.1f}".format),
    "11th": grade_wise_avg_math_scores_df["11th"].map("{:.1f}".format),
    "12th": grade_wise_avg_math_scores_df["12th"].map("{:.1f}".format)
})
print("-------Grade Wise Average Math Score-------")
grade_wise_avg_math_scores_formatted_df

-------Grade Wise Average Math Score-------


Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.1,77.0,77.5,76.5
Cabrera High School,83.1,83.2,82.8,83.3
Figueroa High School,76.4,76.5,76.9,77.2
Ford High School,77.4,77.7,76.9,76.2
Griffin High School,82.0,84.2,83.8,83.4
Hernandez High School,77.4,77.3,77.1,77.2
Holden High School,83.8,83.4,85.0,82.9
Huang High School,77.0,75.9,76.4,77.2
Johnson High School,77.2,76.7,77.5,76.9
Pena High School,83.6,83.4,84.3,84.1


In [17]:
#Get Grade wise Average math scores
grade_wise_avg_reading_scores_df = pd.DataFrame()
grade_wise_avg_reading_scores_df["9th"] = complete_students_school_df[(complete_students_school_df["grade"] == "9th")].groupby(["school_name"]).mean()["reading_score"]
grade_wise_avg_reading_scores_df["10th"] = complete_students_school_df[(complete_students_school_df["grade"] == "10th")].groupby(["school_name"]).mean()["reading_score"]
grade_wise_avg_reading_scores_df["11th"] = complete_students_school_df[(complete_students_school_df["grade"] == "11th")].groupby(["school_name"]).mean()["reading_score"]
grade_wise_avg_reading_scores_df["12th"] = complete_students_school_df[(complete_students_school_df["grade"] == "12th")].groupby(["school_name"]).mean()["reading_score"]

In [18]:
#Print Grade Wise Average Reading Score
print("------Grade Wise Average Reading Score------")
grade_wise_avg_reading_scores_formatted_df = pd.DataFrame({
    "9th": grade_wise_avg_reading_scores_df["9th"].map("{:.1f}".format),
    "10th": grade_wise_avg_reading_scores_df["10th"].map("{:.1f}".format),
    "11th": grade_wise_avg_reading_scores_df["11th"].map("{:.1f}".format),
    "12th": grade_wise_avg_reading_scores_df["12th"].map("{:.1f}".format)
})
grade_wise_avg_reading_scores_formatted_df

------Grade Wise Average Reading Score------


Unnamed: 0_level_0,9th,10th,11th,12th
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.3,80.9,80.9,80.9
Cabrera High School,83.7,84.3,83.8,84.3
Figueroa High School,81.2,81.4,80.6,81.4
Ford High School,80.6,81.3,80.4,80.7
Griffin High School,83.4,83.7,84.3,84.0
Hernandez High School,80.9,80.7,81.4,80.9
Holden High School,83.7,83.3,83.8,84.7
Huang High School,81.3,81.5,81.4,80.3
Johnson High School,81.3,80.8,80.6,81.2
Pena High School,83.8,83.6,84.3,84.6


In [19]:
# The scores by school spending per student
spending_bins = [0,585,630,645,675]
group_names = ["<$584","$585-629","$630-644","$645-675"]
per_school_df["Spending Ranges (Per Student)"] = pd.cut(per_school_df["Per Student Capita"], spending_bins, labels=group_names)
avg_spending_math_scores = per_school_df.groupby(["Spending Ranges (Per Student)"]).mean()["Average Math Score"]
avg_spending_reading_scores = per_school_df.groupby(["Spending Ranges (Per Student)"]).mean()["Average Reading Score"]
avg_spending_passing_math_scores = per_school_df.groupby(["Spending Ranges (Per Student)"]).mean()["% Passing Math"]
avg_spending_passing_reading_scores = per_school_df.groupby(["Spending Ranges (Per Student)"]).mean()["% Passing Reading"]
avg_spending_overall_passing_scores = per_school_df.groupby(["Spending Ranges (Per Student)"]).mean()["% Overall Passing"]
print("--------------Scores by School Spending---------------------------")
# Assemble into DataFrame.
spending_summary_df = pd.DataFrame({
          "Average Math Score" : avg_spending_math_scores.map("{:.1f}".format),
          "Average Reading Score": avg_spending_reading_scores.map("{:.1f}".format),
          "% Passing Math": avg_spending_passing_math_scores.map("{:.0f}".format),
          "% Passing Reading": avg_spending_passing_reading_scores.map("{:.0f}".format),
          "% Overall Passing": avg_spending_overall_passing_scores.map("{:.0f}".format)})
spending_summary_df

--------------Scores by School Spending---------------------------


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$584,83.5,83.9,93,97,90
$585-629,81.9,83.2,87,93,81
$630-644,78.5,81.6,67,77,56
$645-675,77.0,81.0,66,81,54


In [20]:
# The scores by school size
school_size_bins = [0,1000,2000,5000]
bin_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]
per_school_df["School_size"] = pd.cut(per_school_df["Total Students"], school_size_bins, labels=bin_names)
avg_school_size_math_scores = per_school_df.groupby(["School_size"]).mean()["Average Math Score"]
avg_school_size_reading_scores = per_school_df.groupby(["School_size"]).mean()["Average Reading Score"]
avg_school_size_passing_math_scores = per_school_df.groupby(["School_size"]).mean()["% Passing Math"]
avg_school_size_passing_reading_scores = per_school_df.groupby(["School_size"]).mean()["% Passing Reading"]
avg_school_size_overall_passing_scores = per_school_df.groupby(["School_size"]).mean()["% Overall Passing"]
print("--------------Scores by School Size---------------------------")
# Assemble into DataFrame.
school_size_summary_df = pd.DataFrame({
          "Average Math Score" : avg_school_size_math_scores.map("{:.1f}".format),
          "Average Reading Score": avg_school_size_reading_scores.map("{:.1f}".format),
          "% Passing Math": avg_school_size_passing_math_scores.map("{:.0f}".format),
          "% Passing Reading": avg_school_size_passing_reading_scores.map("{:.0f}".format),
          "% Overall Passing": avg_school_size_overall_passing_scores.map("{:.0f}".format)})
school_size_summary_df

--------------Scores by School Size---------------------------


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.8,83.9,94,96,90
Medium (1000-2000),83.4,83.9,88,91,85
Large (2000-5000),77.7,81.3,70,83,58


In [21]:
# The scores by school Type
avg_school_type_math_scores = per_school_df.groupby(["School Type"]).mean()["Average Math Score"]
avg_school_type_reading_scores = per_school_df.groupby(["School Type"]).mean()["Average Reading Score"]
avg_school_type_passing_math_scores = per_school_df.groupby(["School Type"]).mean()["% Passing Math"]
avg_school_type_passing_reading_scores = per_school_df.groupby(["School Type"]).mean()["% Passing Reading"]
avg_school_type_overall_passing_scores = per_school_df.groupby(["School Type"]).mean()["% Overall Passing"]
print("--------------Scores by School Type---------------------------")
# Assemble into DataFrame.
school_size_summary_df = pd.DataFrame({
          "Average Math Score" : avg_school_type_math_scores.map("{:.1f}".format),
          "Average Reading Score": avg_school_type_reading_scores.map("{:.1f}".format),
          "% Passing Math": avg_school_type_passing_math_scores.map("{:.0f}".format),
          "% Passing Reading": avg_school_type_passing_reading_scores.map("{:.0f}".format),
          "% Overall Passing": avg_school_type_overall_passing_scores.map("{:.0f}".format)})
school_size_summary_df

--------------Scores by School Type---------------------------


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.5,83.9,90,93,87
District,77.0,81.0,67,81,54
