In [1]:
import pandas as pd
import os

In [2]:
school_data_to_load = os.path.join("resources","schools_complete.csv")
students_data_to_load = os.path.join("resources","students_complete.csv")
# Add each prefix and suffix to remove to a list.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

In [3]:
school_data_df = pd.read_csv(school_data_to_load)
#school_data_df.head()

In [4]:
# Determine if there are any missing values in the school data.
#school_data_df.count()
#school_data_df.notnull()

In [5]:
#Read student data file and determine if there are any missing values
student_data_df = pd.read_csv(students_data_to_load)
#student_data_df.count()
#student_data_df.isnull().sum()

In [6]:
# Iterate through the words in the "prefixes_suffixes" list and replace them with an empty space, "".
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word,"")
#student_data_df.head()    

In [7]:
clean_students_data_to_load = os.path.join("resources","clean_students_complete.csv")
clean_students_data_df = pd.read_csv(clean_students_data_to_load)

In [8]:
def calculatepassingpercentage(passing_count,total_count):
    return (passing_count / float(total_count)*100)

In [9]:
complete_student_data_df = pd.merge(school_data_df,clean_students_data_df, on=["school_name","school_name"])

In [10]:
total_school_count = school_data_df.school_name.count()

In [11]:
total_student_count = clean_students_data_df["Student ID"].count()

In [12]:
total_budget = school_data_df["budget"].sum()

In [13]:
average_math_score = clean_students_data_df["math_score"].mean()
averrage_reading_score = clean_students_data_df["reading_score"].mean()

In [14]:
passing_math_score = complete_student_data_df[complete_student_data_df["math_score"] >= 70]
passing_reading_score = complete_student_data_df[complete_student_data_df["reading_score"] >= 70]
overall_reading_score = complete_student_data_df[(complete_student_data_df["math_score"] >= 70)  & (complete_student_data_df["reading_score"] >= 70)]
#overall_reading_score.head(10)

In [15]:
passing_math_percentage = calculatepassingpercentage(passing_math_score["student_name"].count(),total_student_count)
passing_reading_percentage = calculatepassingpercentage(passing_reading_score["student_name"].count(),total_student_count)
passing_overall_percentage = calculatepassingpercentage(overall_reading_score["student_name"].count(),total_student_count)

In [16]:
district_summary_df = pd.DataFrame([{
    "Total Schools": total_school_count,
    "Total Students": ("{:,}".format(total_student_count)),
    "Total Budget": ("${:,.2f}".format(total_budget)),
    "Average Math Score": ("{:.1f}".format(average_math_score)),
    "Average Reading Score": ("{:.1f}".format(averrage_reading_score)),
    "% Passing Math": ("{:.0f}".format(passing_math_percentage)),
    "% Passing Reading": ("{:.0f}".format(passing_reading_percentage)),
    "% Overall Passing": ("{:.0f}".format(passing_overall_percentage)),
}])
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65


In [17]:
#Starting section 4.8
#Determine the school type
per_school_types = school_data_df.set_index(["school_name"])["type"]
df = pd.DataFrame(per_school_types)

In [18]:
per_school_student_counts = school_data_df.set_index(["school_name"])["size"]
#per_school_student_counts

In [19]:
# Calculate the total student count.
#per_school_counts = complete_student_data_df["school_name"].value_counts()
#per_school_counts

In [20]:
per_school_budget = school_data_df.set_index(["school_name"])["budget"]
#per_school_budget

In [21]:
per_school_capita = per_school_budget / per_school_student_counts
#per_school_capita

In [22]:
#per_school_averages = complete_student_data_df.groupby(["school_name"]).mean()
per_school_math = complete_student_data_df.groupby(["school_name"]).mean()["math_score"]
per_school_read = complete_student_data_df.groupby(["school_name"]).mean()["reading_score"]
#per_school_math

In [23]:
per_school_passing_math_score = complete_student_data_df[complete_student_data_df["math_score"] >= 70]
per_school_passing_reading_score = complete_student_data_df[complete_student_data_df["reading_score"] >= 70]
per_school_overall_passing_score = complete_student_data_df[(complete_student_data_df["math_score"] >= 70) & (complete_student_data_df["reading_score"] >= 70)]

In [24]:
per_school_passing_math_score = per_school_passing_math_score.groupby(["school_name"]).count()["student_name"]
per_school_passing_reading_score = per_school_passing_reading_score.groupby(["school_name"]).count()["student_name"]
per_school_overall_passing_score = per_school_overall_passing_score.groupby(["school_name"]).count()["student_name"]

In [25]:
# Calculate the percentage of passing math and reading scores per school.
per_school_passing_math = per_school_passing_math_score / per_school_student_counts * 100
per_school_passing_reading = per_school_passing_reading_score / per_school_student_counts * 100
per_school_passing_overall = per_school_overall_passing_score / per_school_student_counts * 100
#per_school_passing_overall

In [26]:
per_school_summary_df = pd.DataFrame({
             "School Type": per_school_types,
             "Total Students": per_school_student_counts,
             "Total School Budget": per_school_budget.map("${:,.2f}".format),
             "Per Student Budget": per_school_capita.map("${:,.2f}".format),
             "Average Math Score": per_school_math.map("{:.1f}".format),
           "Average Reading Score": per_school_read.map("{:.1f}".format),
           "% Passing Math": per_school_passing_math.map("{:.0f}".format),
           "% Passing Reading": per_school_passing_reading.map("{:.0f}".format),
           "% Overall Passing": per_school_passing_overall.map("{:.0f}".format)})
per_school_summary_df.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.0,81.0,67,82,55
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.1,84.0,94,97,91
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.7,81.2,66,81,53
Ford High School,District,2739,"$1,763,916.00",$644.00,77.1,80.7,68,79,54
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.4,83.8,93,97,91


In [27]:
#sort and show top five schools
top_schools = per_school_summary_df.sort_values(["% Overall Passing"],ascending=False)
top_schools.head()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.1,84.0,94,97,91
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.4,83.8,93,97,91
Pena High School,Charter,962,"$585,858.00",$609.00,83.8,84.0,95,96,91
Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.4,83.8,93,97,91
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.3,84.0,94,97,91


In [28]:
top_schools.tail()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.3,80.9,67,81,54
Huang High School,District,2917,"$1,910,635.00",$655.00,76.6,81.2,66,81,54
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.1,81.0,66,81,54
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.7,81.2,66,81,53
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.8,80.7,66,80,53


In [29]:
ninth_graders = complete_student_data_df[(complete_student_data_df["grade"] == "9th")]
tenth_graders = complete_student_data_df[(complete_student_data_df["grade"] == "10th")]
eleventh_graders = complete_student_data_df[(complete_student_data_df["grade"] == "11th")]
twelfth_graders = complete_student_data_df[(complete_student_data_df["grade"] == "12th")]

In [30]:
ninth_graders_avg_math_score = ninth_graders.groupby(["school_name"]).mean()["math_score"]
tenth_graders_avg_math_score = tenth_graders.groupby(["school_name"]).mean()["math_score"]
eleventh_graders_avg_math_score = eleventh_graders.groupby(["school_name"]).mean()["math_score"]
twelfth_graders_avg_math_score = twelfth_graders.groupby(["school_name"]).mean()["math_score"]

In [31]:
ninth_graders_avg_reading_score = ninth_graders.groupby(["school_name"]).mean()["reading_score"]
tenth_graders_avg_reading_score = tenth_graders.groupby(["school_name"]).mean()["reading_score"]
eleventh_graders_avg_reading_score = eleventh_graders.groupby(["school_name"]).mean()["reading_score"]
twelfth_graders_avg_reading_score = twelfth_graders.groupby(["school_name"]).mean()["reading_score"]

In [32]:
# Combine each Series for average math scores by school into single DataFrame.
math_scores_by_grade = pd.DataFrame({
               "9th": ninth_graders_avg_math_score.map("{:,.1f}".format),
               "10th": tenth_graders_avg_math_score.map("{:,.1f}".format),
               "11th": eleventh_graders_avg_math_score.map("{:,.1f}".format),
               "12th": twelfth_graders_avg_math_score.map("{:,.1f}".format)})

math_scores_by_grade = math_scores_by_grade[
                 ["9th", "10th", "11th", "12th"]]

# Remove the index name.
math_scores_by_grade.index.name = None
# Display the DataFrame.
math_scores_by_grade.head()

Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.1,77.0,77.5,76.5
Cabrera High School,83.1,83.2,82.8,83.3
Figueroa High School,76.4,76.5,76.9,77.2
Ford High School,77.4,77.7,76.9,76.2
Griffin High School,82.0,84.2,83.8,83.4


In [33]:
# Combine each Series for average reading scores by school into single DataFrame.
reading_scores_by_grade = pd.DataFrame({
              "9th": ninth_graders_avg_reading_score.map("{:.1f}".format),
              "10th": tenth_graders_avg_reading_score.map("{:.1f}".format),
              "11th": eleventh_graders_avg_reading_score.map("{:.1f}".format),
              "12th": twelfth_graders_avg_reading_score.map("{:.1f}".format)})

reading_scores_by_grade = reading_scores_by_grade[
                 ["9th", "10th", "11th", "12th"]]

# Remove the index name.
reading_scores_by_grade.index.name = None
# Display the DataFrame.
reading_scores_by_grade.head()

Unnamed: 0,9th,10th,11th,12th
Bailey High School,81.3,80.9,80.9,80.9
Cabrera High School,83.7,84.3,83.8,84.3
Figueroa High School,81.2,81.4,80.6,81.4
Ford High School,80.6,81.3,80.4,80.7
Griffin High School,83.4,83.7,84.3,84.0
