#### Gather dependencies

In [124]:
import pandas as pd
import numpy as np

school_csv = "Resources/schools_complete.csv"
student_csv = "Resources/students_complete.csv"

In [125]:
# Read csv files
school = pd.read_csv(school_csv)
student = pd.read_csv(student_csv)

##### Merge

In [126]:
# merge the two data sets
all_school_data = pd.merge(school, student, how="left", on="school_name")
all_school_data

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84
...,...,...,...,...,...,...,...,...,...,...,...
39165,14,Thomas High School,Charter,1635,1043130,39165,Donna Howard,F,12th,99,90
39166,14,Thomas High School,Charter,1635,1043130,39166,Dawn Bell,F,10th,95,70
39167,14,Thomas High School,Charter,1635,1043130,39167,Rebecca Tanner,F,9th,73,84
39168,14,Thomas High School,Charter,1635,1043130,39168,Desiree Kidd,F,10th,99,90


##### Totals

In [134]:
total_schools = len(all_school_data["School ID"].unique())
total_students = len(all_school_data["Student ID"].unique())
total_budget = school["budget"].sum()
avg_math_score = round(student["math_score"].mean(), 2)
avg_reading_score = round(student["reading_score"].mean(), 2)
overall_avg = student.loc[:, "reading_score":"math_score"].mean(axis=1)

In [135]:
all_school_data

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84
...,...,...,...,...,...,...,...,...,...,...,...
39165,14,Thomas High School,Charter,1635,1043130,39165,Donna Howard,F,12th,99,90
39166,14,Thomas High School,Charter,1635,1043130,39166,Dawn Bell,F,10th,95,70
39167,14,Thomas High School,Charter,1635,1043130,39167,Rebecca Tanner,F,9th,73,84
39168,14,Thomas High School,Charter,1635,1043130,39168,Desiree Kidd,F,10th,99,90


#### Passing scores

In [136]:
student["passing_math"] = student["math_score"]>=70
student["passing_reading"] = student["reading_score"]>=70
pct_passing_math = ((student["passing_math"].mean())) * 100
pct_passing_reading = ((student["passing_reading"].mean())) * 100
overall_pct = (pct_passing_math + pct_passing_reading) / 2

##### District Summary

In [137]:
results = [{"All Schools":total_schools, "All Students":total_students, "Total Budget":total_budget, "Avg Math Score":avg_math_score, 
"Pct Passing Math":pct_passing_math, "Avg Reading Score":avg_reading_score, "Pct Passing Reading":pct_passing_reading, "Overall Pct":overall_pct}]
district_summary = pd.DataFrame(results)

# format the output
district_summary["All Students"] = district_summary["All Students"].map("{:,}".format)
district_summary["Total Budget"] = district_summary["Total Budget"].map("${:,}".format)
district_summary["Pct Passing Math"] = district_summary["Pct Passing Math"].map("{:.2f}%".format)
district_summary["Pct Passing Reading"] = district_summary["Pct Passing Reading"].map("{:.2f}%".format)
district_summary["Overall Pct"] = district_summary["Overall Pct"].map("{:.2f}%".format)
district_summary

Unnamed: 0,All Schools,All Students,Total Budget,Avg Math Score,Pct Passing Math,Avg Reading Score,Pct Passing Reading,Overall Pct
0,15,39170,"$24,649,428",78.99,74.98%,81.88,85.81%,80.39%


#### Grouping

In [139]:
# school summary
all_school_data["pass_math"] = all_school_data["math_score"] >=70
all_school_data["pass_reading"] = all_school_data["reading_score"] >=70
all_school_data

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score,pass_math,pass_reading
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79,True,False
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61,False,True
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60,False,True
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58,False,False
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39165,14,Thomas High School,Charter,1635,1043130,39165,Donna Howard,F,12th,99,90,True,True
39166,14,Thomas High School,Charter,1635,1043130,39166,Dawn Bell,F,10th,95,70,True,True
39167,14,Thomas High School,Charter,1635,1043130,39167,Rebecca Tanner,F,9th,73,84,True,True
39168,14,Thomas High School,Charter,1635,1043130,39168,Desiree Kidd,F,10th,99,90,True,True


In [1]:
school_group = all_school_data.groupby(["school_name"]).mean()
school_group["Per Student Budget"] = school_group["budget"] / school_group["size"]
school_group["Pct Passing Math"] = round(school_group["pass_math"] * 100, 2)
school_group["Pct Passing Reading"] = round(school_group["pass_reading"] * 100, 2)
school_group["Overall Passing Pct"] = (school_group["pass_math"] + school_group["pass_reading"]) / 2 * 100
# school_group

# merge with school data
school_data_summary = pd.merge(school_group, school, how="left", on=["school_name", "school_name"])
# clean up output
del school_data_summary['School ID_x']
del school_data_summary['size_y']
del school_data_summary['Student ID']
del school_data_summary['budget_y']

school_data_summary.rename(columns{"size_x":"School Size"})

SyntaxError: invalid syntax (2771864218.py, line 16)

In [169]:
# create a df to capture the results
school_summary_df = pd.DataFrame({
    "School Name":school_data_summary["school_name"],"School Type":school_data_summary["type"], 
    "Total Students":school_data_summary["size_x"],
    "Total Budget":school_data_summary["budget_x"],
    "Per Student Budget":school_data_summary["Per Student Budget"],
    "Avg Reading Score":school_data_summary["reading_score"],
    "Avg Math Score":school_data_summary["math_score"],
    "Pct Passing Reading":school_data_summary["Pct Passing Reading"],
    "Pct Passing Math":school_data_summary["Pct Passing Math"],
    "Overall Pct":school_data_summary["Overall Passing Pct"]
})

#school_summary_df
# apply formatting
school_summary_df["Total Students"] = school_summary_df["Total Students"].map("{:,}".format)
school_summary_df["Total Budget"] = school_summary_df["Total Budget"].map("${:,.2f}".format)
school_summary_df["Per Student Budget"] = school_summary_df["Per Student Budget"].map("${:,.2f}".format)
school_summary_df["Avg Reading Score"] = school_summary_df["Avg Reading Score"].map("{:.2f}".format)
school_summary_df["Avg Math Score"] = school_summary_df["Avg Math Score"].map("{:.2f}".format)
school_summary_df["Pct Passing Reading"] = school_summary_df["Pct Passing Reading"].map("{:.2f}%".format)
school_summary_df["Pct Passing Math"] = school_summary_df["Pct Passing Math"].map("{:.2f}%".format)
school_summary_df["Overall Pct"] = school_summary_df["Overall Pct"].map("{:.2f}%".format)
school_summary_df

Unnamed: 0,School Name,School Type,Total Students,Total Budget,Per Student Budget,Avg Reading Score,Avg Math Score,Pct Passing Reading,Pct Passing Math,Overall Pct
0,Bailey High School,District,4976.0,"$3,124,928.00",$628.00,81.03,77.05,81.93%,66.68%,74.31%
1,Cabrera High School,Charter,1858.0,"$1,081,356.00",$582.00,83.98,83.06,97.04%,94.13%,95.59%
2,Figueroa High School,District,2949.0,"$1,884,411.00",$639.00,81.16,76.71,80.74%,65.99%,73.36%
3,Ford High School,District,2739.0,"$1,763,916.00",$644.00,80.75,77.1,79.30%,68.31%,73.80%
4,Griffin High School,Charter,1468.0,"$917,500.00",$625.00,83.82,83.35,97.14%,93.39%,95.27%
5,Hernandez High School,District,4635.0,"$3,022,020.00",$652.00,80.93,77.29,80.86%,66.75%,73.81%
6,Holden High School,Charter,427.0,"$248,087.00",$581.00,83.81,83.8,96.25%,92.51%,94.38%
7,Huang High School,District,2917.0,"$1,910,635.00",$655.00,81.18,76.63,81.32%,65.68%,73.50%
8,Johnson High School,District,4761.0,"$3,094,650.00",$650.00,80.97,77.07,81.22%,66.06%,73.64%
9,Pena High School,Charter,962.0,"$585,858.00",$609.00,84.04,83.84,95.95%,94.59%,95.27%


#### Performance Metrics

In [170]:
top_five_schools = school_summary_df.sort_values(["Overall Pct"], ascending=False)
top_five_schools.head(5)

Unnamed: 0,School Name,School Type,Total Students,Total Budget,Per Student Budget,Avg Reading Score,Avg Math Score,Pct Passing Reading,Pct Passing Math,Overall Pct
1,Cabrera High School,Charter,1858.0,"$1,081,356.00",$582.00,83.98,83.06,97.04%,94.13%,95.59%
12,Thomas High School,Charter,1635.0,"$1,043,130.00",$638.00,83.85,83.42,97.31%,93.27%,95.29%
4,Griffin High School,Charter,1468.0,"$917,500.00",$625.00,83.82,83.35,97.14%,93.39%,95.27%
9,Pena High School,Charter,962.0,"$585,858.00",$609.00,84.04,83.84,95.95%,94.59%,95.27%
13,Wilson High School,Charter,2283.0,"$1,319,574.00",$578.00,83.99,83.27,96.54%,93.87%,95.20%


In [171]:
bottom_five_schools = school_summary_df.sort_values(["Overall Pct"], ascending=True)
bottom_five_schools.head(5)

Unnamed: 0,School Name,School Type,Total Students,Total Budget,Per Student Budget,Avg Reading Score,Avg Math Score,Pct Passing Reading,Pct Passing Math,Overall Pct
10,Rodriguez High School,District,3999.0,"$2,547,363.00",$637.00,80.74,76.84,80.22%,66.37%,73.29%
2,Figueroa High School,District,2949.0,"$1,884,411.00",$639.00,81.16,76.71,80.74%,65.99%,73.36%
7,Huang High School,District,2917.0,"$1,910,635.00",$655.00,81.18,76.63,81.32%,65.68%,73.50%
8,Johnson High School,District,4761.0,"$3,094,650.00",$650.00,80.97,77.07,81.22%,66.06%,73.64%
3,Ford High School,District,2739.0,"$1,763,916.00",$644.00,80.75,77.1,79.30%,68.31%,73.80%


#### Grade Level

In [175]:
# create a df for each grade
ninth_grade = all_school_data[all_school_data["grade"] == "9th"].groupby("school_name").mean()["math_score"]
tenth_grade = all_school_data[all_school_data["grade"] == "10th"].groupby("school_name").mean()["math_score"]
eleventh_grade = all_school_data[all_school_data["grade"] == "11th"].groupby("school_name").mean()["math_score"]
twelfth_grade = all_school_data[all_school_data["grade"] == "12th"].groupby("school_name").mean()["math_score"]

math_grade_df = pd.DataFrame({
    "9th Grade":round(ninth_grade, 2),
    "10th Grade": round(tenth_grade, 2),
    "11th Grade": round(eleventh_grade, 2),
    "12th Grade": round(twelfth_grade, 2)
})
math_grade_df

Unnamed: 0_level_0,9th Grade,10th Grade,11th Grade,12th Grade
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.08,77.0,77.52,76.49
Cabrera High School,83.09,83.15,82.77,83.28
Figueroa High School,76.4,76.54,76.88,77.15
Ford High School,77.36,77.67,76.92,76.18
Griffin High School,82.04,84.23,83.84,83.36
Hernandez High School,77.44,77.34,77.14,77.19
Holden High School,83.79,83.43,85.0,82.86
Huang High School,77.03,75.91,76.45,77.23
Johnson High School,77.19,76.69,77.49,76.86
Pena High School,83.63,83.37,84.33,84.12


In [176]:
ninth_grade = all_school_data[all_school_data["grade"] == "9th"].groupby("school_name").mean()["reading_score"]
tenth_grade = all_school_data[all_school_data["grade"] == "10th"].groupby("school_name").mean()["reading_score"]
eleventh_grade = all_school_data[all_school_data["grade"] == "11th"].groupby("school_name").mean()["reading_score"]
twelfth_grade = all_school_data[all_school_data["grade"] == "12th"].groupby("school_name").mean()["reading_score"]

reading_grade_df = pd.DataFrame({
    "9th Grade":round(ninth_grade, 2),
    "10th Grade": round(tenth_grade, 2),
    "11th Grade": round(eleventh_grade, 2),
    "12th Grade": round(twelfth_grade, 2)
})
reading_grade_df

Unnamed: 0_level_0,9th Grade,10th Grade,11th Grade,12th Grade
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,81.3,80.91,80.95,80.91
Cabrera High School,83.68,84.25,83.79,84.29
Figueroa High School,81.2,81.41,80.64,81.38
Ford High School,80.63,81.26,80.4,80.66
Griffin High School,83.37,83.71,84.29,84.01
Hernandez High School,80.87,80.66,81.4,80.86
Holden High School,83.68,83.32,83.82,84.7
Huang High School,81.29,81.51,81.42,80.31
Johnson High School,81.26,80.77,80.62,81.23
Pena High School,83.81,83.61,84.34,84.59


#### Top Five By Spending

In [177]:
top_five_schools = top_five_schools.sort_values(["Per Student Budget"], ascending=False)
top_five_schools.head(5)

Unnamed: 0,School Name,School Type,Total Students,Total Budget,Per Student Budget,Avg Reading Score,Avg Math Score,Pct Passing Reading,Pct Passing Math,Overall Pct
7,Huang High School,District,2917.0,"$1,910,635.00",$655.00,81.18,76.63,81.32%,65.68%,73.50%
5,Hernandez High School,District,4635.0,"$3,022,020.00",$652.00,80.93,77.29,80.86%,66.75%,73.81%
8,Johnson High School,District,4761.0,"$3,094,650.00",$650.00,80.97,77.07,81.22%,66.06%,73.64%
3,Ford High School,District,2739.0,"$1,763,916.00",$644.00,80.75,77.1,79.30%,68.31%,73.80%
2,Figueroa High School,District,2949.0,"$1,884,411.00",$639.00,81.16,76.71,80.74%,65.99%,73.36%


#### Bins, Bins, and . . .

In [182]:
spending_bins = [0, 590, 625, 650, 695]
group_names = ["<$590", "$590-$625", "$625-$650", "$650+"]
school_data_summary["Spending Range Per Student"] = pd.cut(school_data_summary["Per Student Budget"], spending_bins, labels=group_names)
school_spending_group = school_data_summary.groupby("Spending Range Per Student").mean().round(1)

# remove unneeded columns
del school_spending_group['size_x']
del school_spending_group['budget_x']
del school_spending_group['Per Student Budget']
del school_spending_group['School ID_y']
del school_spending_group['pass_math']
del school_spending_group['pass_reading']
school_spending_group


Unnamed: 0_level_0,reading_score,math_score,Pct Passing Math,Pct Passing Reading,Overall Passing Pct
Spending Range Per Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$590,83.9,83.5,93.5,96.6,95.0
$590-$625,83.9,83.5,94.0,96.3,95.1
$625-$650,81.4,78.0,71.1,83.5,77.3
$650+,81.1,77.0,66.2,81.1,73.7


In [183]:
# bins by school size
size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1k)", "Medium (1k-2k)", "Large (2k+)"]

school_data_summary["School Size"] = pd.cut(school_data_summary['size_x'], size_bins, labels=group_names)
school_size_group = school_data_summary.groupby("School Size").mean().round(1)
# get rid of the noise
del school_size_group['budget_x']
del school_size_group['Per Student Budget']
del school_size_group['School ID_y']
del school_size_group['pass_math']
del school_size_group['pass_reading']

school_size_group


Unnamed: 0_level_0,size_x,reading_score,math_score,Pct Passing Math,Pct Passing Reading,Overall Passing Pct
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Small (<1k),694.5,83.9,83.8,93.6,96.1,94.8
Medium (1k-2k),1704.4,83.9,83.4,93.6,96.8,95.2
Large (2k+),3657.4,81.3,77.7,70.0,82.8,76.4


In [184]:
school_type_group = school_data_summary.groupby("type").mean().round(1)

del school_type_group['size_x']
del school_type_group['budget_x']
del school_type_group['Per Student Budget']
del school_type_group['School ID_y']
del school_type_group['pass_math']
del school_type_group['pass_reading']

school_type_group

Unnamed: 0_level_0,reading_score,math_score,Pct Passing Math,Pct Passing Reading,Overall Passing Pct
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.9,83.5,93.6,96.6,95.1
District,81.0,77.0,66.5,80.8,73.7


## Final Observations

#### 1. Charter schools outperform district schools
#### 2. Reading scores are consistent by grade-level, by school
#### 3. Math scores are consistent by grade-level, by school
#### 4. Small schools significantly outperform large schools