### gather dependencies

In [65]:
import pandas as pd
import numpy as np

schools = "Resources/schools_complete.csv"
students = "Resources/students_complete.csv"

In [66]:
school = pd.read_csv(schools)
student = pd.read_csv(students)

### Merge the csv files

In [67]:
all_school_data = pd.merge(school, student, how="left", on=["school_name"])
# all_school_data = all_school_data.rename(columns={"school_name":"School"})
all_school_data

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84
...,...,...,...,...,...,...,...,...,...,...,...
39165,14,Thomas High School,Charter,1635,1043130,39165,Donna Howard,F,12th,99,90
39166,14,Thomas High School,Charter,1635,1043130,39166,Dawn Bell,F,10th,95,70
39167,14,Thomas High School,Charter,1635,1043130,39167,Rebecca Tanner,F,9th,73,84
39168,14,Thomas High School,Charter,1635,1043130,39168,Desiree Kidd,F,10th,99,90


### Gather totals

In [68]:
# Totals
total_schools = len(all_school_data["School ID"].unique())
total_students = len(all_school_data["Student ID"])
total_budget = school["budget"].sum()
avg_math_score = round(student["math_score"].mean(), 1)
avg_reading_score = round(student["reading_score"].mean(), 1)
# overall_avg = student.loc[:, "reading_score":"math_score"].mean(axis=1)
overall_avg = round((avg_math_score + avg_reading_score)/2, 1)

### All school data

In [69]:
all_school_data.sort_values("school_name")

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
19584,7,Bailey High School,District,4976,3124928,19584,Tammie Fox,F,11th,82,92
21193,7,Bailey High School,District,4976,3124928,21193,Jennifer Murray,F,9th,88,89
21192,7,Bailey High School,District,4976,3124928,21192,Lisa Pineda,F,9th,86,67
21191,7,Bailey High School,District,4976,3124928,21191,Cameron Miller,M,11th,70,75
21190,7,Bailey High School,District,4976,3124928,21190,Thomas Rasmussen,M,12th,77,82
...,...,...,...,...,...,...,...,...,...,...,...
24829,10,Wright High School,Charter,1800,1049400,24829,John Lozano,M,11th,84,77
24828,10,Wright High School,Charter,1800,1049400,24828,Jonathan Thomas,M,12th,95,80
24827,10,Wright High School,Charter,1800,1049400,24827,Lori Ramirez,F,10th,74,74
24797,10,Wright High School,Charter,1800,1049400,24797,Scott Scott,M,10th,90,83


#### Passing scores

In [70]:
# Passing scores
student["passing_math"] = student["math_score"]>=70
student["passing_reading"] = student["reading_score"]>70
pct_passing_math = ((student["passing_math"].mean())) * 100
pct_passing_reading = ((student["passing_reading"].mean())) * 100
overall_pct = (pct_passing_math + pct_passing_reading) / 2

### District summary

In [71]:
# District summary
results = [{
    "All Schools":total_schools, 
    "All Students":total_students, 
    "Total Budget":total_budget, 
    "Avg Math Score":avg_math_score, 
    "Pct Passing Math":pct_passing_math, 
    "Avg Reading Score":avg_reading_score,
    "Pct Passing Reading":pct_passing_reading,
    "Overall Pct":overall_pct
    }]

district_summary = pd.DataFrame(results)

# Apply formatting
district_summary["All Students"] = district_summary["All Students"].map("{:,}".format)
district_summary["Total Budget"] = district_summary["Total Budget"].map("${:,}".format)
district_summary["Pct Passing Math"] = district_summary["Pct Passing Math"].map("{:.1f}%".format)
district_summary["Pct Passing Reading"] = district_summary["Pct Passing Reading"].map("{:.1f}%".format)
district_summary["Overall Pct"] = district_summary["Overall Pct"].map("{:.1f}%".format)
district_summary

Unnamed: 0,All Schools,All Students,Total Budget,Avg Math Score,Pct Passing Math,Avg Reading Score,Pct Passing Reading,Overall Pct
0,15,39170,"$24,649,428",79.0,75.0%,81.9,83.0%,79.0%


### Grouping . . .

In [72]:
# Grouping
all_school_data["pass_math"] = all_school_data["math_score"]>=70
all_school_data["pass_reading"] = all_school_data["reading_score"]>=70

school_group = all_school_data.groupby(["school_name"]).mean()
school_group["Per Student Budget"] = (school_group["budget"] / school_group["size"])
school_group["Pct Passing Math"] = round(school_group["pass_math"] * 100, 1)
school_group["Pct Passing Reading"] = round(school_group["pass_reading"] * 100, 1)
school_group["Overall Pass Pct"] = (school_group["Pct Passing Math"] + school_group["Pct Passing Reading"]) / 2

# Merge with school data

school_data_summary = pd.merge(school_group, school, how="left", on=["school_name", "school_name"])
# # Clean up df
del school_data_summary['School ID_x']
del school_data_summary['School ID_y']
del school_data_summary['size_y']
del school_data_summary['budget_y']
del school_data_summary['pass_math']
del school_data_summary['pass_reading']
del school_data_summary['budget_x']
del school_data_summary['Student ID']

all_schools_df = school_data_summary.rename(columns={
    "size_x":"Total Students",
    "budget_x": "Budget",
    "math_score": "Math Scores",
    "reading_score": "Reading Scores",
    "type": "Type",
    "school_name": "School"

    })
# # Add formatting
all_schools_df["Total Students"] = all_schools_df["Total Students"].map("{:,.0f}".format)
all_schools_df["Per Student Budget"] = all_schools_df["Per Student Budget"].map("${:,.0f}".format)
all_schools_df["Reading Scores"] = all_schools_df["Reading Scores"].map("{:.1f}".format)
all_schools_df["Math Scores"] = all_schools_df["Math Scores"].map("{:.1f}".format)
all_schools_df["Pct Passing Math"] = all_schools_df["Pct Passing Math"].map("{:.1f}%".format)
all_schools_df["Pct Passing Reading"] = all_schools_df["Pct Passing Reading"].map("{:.1f}%".format)
all_schools_df["Overall Pass Pct"] = all_schools_df["Overall Pass Pct"].map("{:.1f}%".format)

# # Sort by school
all_schools_df.sort_values(['School'])

# # Reorder columns
all_schools_df.reindex(columns=[
    'Type', 'School', 'Total Students', 'Per Student Budget', 'Reading Scores', 'Math Scores', 
    'Pct Passing Math', 'Pct Passing Reading', 'Overall Pass Pct'])


Unnamed: 0,Type,School,Total Students,Per Student Budget,Reading Scores,Math Scores,Pct Passing Math,Pct Passing Reading,Overall Pass Pct
0,District,Bailey High School,4976,$628,81.0,77.0,66.7%,81.9%,74.3%
1,Charter,Cabrera High School,1858,$582,84.0,83.1,94.1%,97.0%,95.5%
2,District,Figueroa High School,2949,$639,81.2,76.7,66.0%,80.7%,73.3%
3,District,Ford High School,2739,$644,80.7,77.1,68.3%,79.3%,73.8%
4,Charter,Griffin High School,1468,$625,83.8,83.4,93.4%,97.1%,95.2%
5,District,Hernandez High School,4635,$652,80.9,77.3,66.8%,80.9%,73.8%
6,Charter,Holden High School,427,$581,83.8,83.8,92.5%,96.3%,94.4%
7,District,Huang High School,2917,$655,81.2,76.6,65.7%,81.3%,73.5%
8,District,Johnson High School,4761,$650,81.0,77.1,66.1%,81.2%,73.7%
9,Charter,Pena High School,962,$609,84.0,83.8,94.6%,95.9%,95.2%


### Top/Bottom performers

In [73]:
# Top five schools
top_five_schools = all_schools_df.sort_values(["Overall Pass Pct"], ascending=False)
top_five_schools.head(5)

Unnamed: 0,School,Total Students,Reading Scores,Math Scores,Per Student Budget,Pct Passing Math,Pct Passing Reading,Overall Pass Pct,Type
1,Cabrera High School,1858,84.0,83.1,$582,94.1%,97.0%,95.5%,Charter
12,Thomas High School,1635,83.8,83.4,$638,93.3%,97.3%,95.3%,Charter
4,Griffin High School,1468,83.8,83.4,$625,93.4%,97.1%,95.2%,Charter
9,Pena High School,962,84.0,83.8,$609,94.6%,95.9%,95.2%,Charter
13,Wilson High School,2283,84.0,83.3,$578,93.9%,96.5%,95.2%,Charter


In [74]:
#Bottom five schools
bottom_five_schools = all_schools_df.sort_values(["Overall Pass Pct"], ascending=True)
bottom_five_schools.head(5)

Unnamed: 0,School,Total Students,Reading Scores,Math Scores,Per Student Budget,Pct Passing Math,Pct Passing Reading,Overall Pass Pct,Type
2,Figueroa High School,2949,81.2,76.7,$639,66.0%,80.7%,73.3%,District
10,Rodriguez High School,3999,80.7,76.8,$637,66.4%,80.2%,73.3%,District
7,Huang High School,2917,81.2,76.6,$655,65.7%,81.3%,73.5%,District
8,Johnson High School,4761,81.0,77.1,$650,66.1%,81.2%,73.7%,District
3,Ford High School,2739,80.7,77.1,$644,68.3%,79.3%,73.8%,District


### Scores by grade level

In [75]:
# create a df for each grade
all_school_data = all_school_data.rename(columns={"school_name":"School"})
ninth_grade = all_school_data[all_school_data["grade"] == "9th"].groupby("School").mean()["math_score"]
tenth_grade = all_school_data[all_school_data["grade"] == "10th"].groupby("School").mean()["math_score"]
eleventh_grade = all_school_data[all_school_data["grade"] == "11th"].groupby("School").mean()["math_score"]
twelfth_grade = all_school_data[all_school_data["grade"] == "12th"].groupby("School").mean()["math_score"]

math_grade_df = pd.DataFrame({
    "9th Grade":round(ninth_grade, 2),
    "10th Grade": round(tenth_grade, 2),
    "11th Grade": round(eleventh_grade, 2),
    "12th Grade": round(twelfth_grade, 2)
})
math_grade_df


Unnamed: 0_level_0,9th Grade,10th Grade,11th Grade,12th Grade
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,77.08,77.0,77.52,76.49
Cabrera High School,83.09,83.15,82.77,83.28
Figueroa High School,76.4,76.54,76.88,77.15
Ford High School,77.36,77.67,76.92,76.18
Griffin High School,82.04,84.23,83.84,83.36
Hernandez High School,77.44,77.34,77.14,77.19
Holden High School,83.79,83.43,85.0,82.86
Huang High School,77.03,75.91,76.45,77.23
Johnson High School,77.19,76.69,77.49,76.86
Pena High School,83.63,83.37,84.33,84.12


In [76]:
# Top five by spending
top_five_by_spending = all_schools_df.sort_values(["Per Student Budget"], ascending=False)
top_five_by_spending.head(5)

Unnamed: 0,School,Total Students,Reading Scores,Math Scores,Per Student Budget,Pct Passing Math,Pct Passing Reading,Overall Pass Pct,Type
7,Huang High School,2917,81.2,76.6,$655,65.7%,81.3%,73.5%,District
5,Hernandez High School,4635,80.9,77.3,$652,66.8%,80.9%,73.8%,District
8,Johnson High School,4761,81.0,77.1,$650,66.1%,81.2%,73.7%,District
3,Ford High School,2739,80.7,77.1,$644,68.3%,79.3%,73.8%,District
2,Figueroa High School,2949,81.2,76.7,$639,66.0%,80.7%,73.3%,District


In [77]:
spending_bins = [0, 590, 625, 650, 695]
group_names = ["<$590", "$590-$625", "$625-$650", "$650+"]
school_data_summary["Spending Range Per Student"] = pd.cut(school_data_summary["Per Student Budget"], spending_bins, labels=group_names)
school_spending_group = school_data_summary.groupby("Spending Range Per Student").mean().round(1)

# remove unneeded columns
# del school_spending_group['size_x']
# del school_spending_group['budget_x']
# del school_spending_group['Per Student Budget']
# del school_spending_group['School ID_y']
# del school_spending_group['pass_math']
# del school_spending_group['pass_reading']
school_spending_group = school_spending_group.rename(
    columns={
        "size_x": "School Size",
        "reading_score": "Reading Score",
        "math_score": "Math Score"
    })

school_spending_group["School Size"] = school_spending_group["School Size"].map("{:,}".format)
school_spending_group["Per Student Budget"] = school_spending_group["Per Student Budget"].map("${:,}".format)
school_spending_group["Pct Passing Reading"] = school_spending_group["Pct Passing Reading"].map("{:.1f}%".format)
school_spending_group["Pct Passing Math"] = school_spending_group["Pct Passing Math"].map("{:.1f}%".format)
school_spending_group["Overall Pass Pct"] = school_spending_group["Overall Pass Pct"].map("{:.1f}%".format)
school_spending_group

Unnamed: 0_level_0,School Size,Reading Score,Math Score,Per Student Budget,Pct Passing Math,Pct Passing Reading,Overall Pass Pct
Spending Range Per Student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
<$590,1592.0,83.9,83.5,$581.0,93.4%,96.6%,95.0%
$590-$625,1397.0,83.9,83.5,$611.3,94.0%,96.3%,95.1%
$625-$650,3509.8,81.4,78.0,$639.3,71.1%,83.4%,77.3%
$650+,3776.0,81.1,77.0,$653.5,66.2%,81.1%,73.7%


In [78]:
# Bins by school type

school_type_group = school_data_summary.groupby("type").mean().round(1)

del school_type_group['size_x']
# del school_type_group['budget_x']
# del school_type_group['Per Student Budget']
# del school_type_group['School ID_y']
# del school_type_group['pass_math']
# del school_type_group['pass_reading']

school_type_group = school_type_group.rename(
    columns={
        "size_x": "School Size",
        "reading_score": "Reading Score",
        "math_score": "Math Score"
    })

school_type_group["Per Student Budget"] = school_type_group["Per Student Budget"].map("${:,}".format)
school_type_group["Pct Passing Reading"] = school_type_group["Pct Passing Reading"].map("{:.1f}%".format)
school_type_group["Pct Passing Math"] = school_type_group["Pct Passing Math"].map("{:.1f}%".format)
school_type_group["Overall Pass Pct"] = school_type_group["Overall Pass Pct"].map("{:.1f}%".format)

school_type_group

Unnamed: 0_level_0,Reading Score,Math Score,Per Student Budget,Pct Passing Math,Pct Passing Reading,Overall Pass Pct
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Charter,83.9,83.5,$599.5,93.6%,96.6%,95.1%
District,81.0,77.0,$643.6,66.6%,80.8%,73.7%


# Final Observations
### 1. Charter schools outperform district schools
### 2. Reading scores are consistent by grade-level, by school
### 3. Math scores are consistent by grade-level, by school
### 4. Small schools significantly outperform large schools