# PyCity Schools Analysis

* Your analysis here
---

In [1]:
# Dependencies and Setup
from pathlib import Path
import pandas as pd

# File to Load (Remember to Change These)
school_data_to_load = Path("Resources/schools_complete.csv")
student_data_to_load = Path("Resources/students_complete.csv")

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])


## Local Government Area Summary

In [2]:
# Calculate the Totals (Schools and Students)
school_count =len(school_data_complete["school_name"].unique())
student_count =len(school_data_complete.index)

# Calculate the Total Budget
total_budget =sum(school_data_complete["budget"].unique())


In [3]:
# Calculate the Average Scores
average_maths_score =school_data_complete["maths_score"].mean()
average_reading_score =school_data_complete["reading_score"].mean()


In [4]:
# Calculate the Percentage Pass Rates
passing_maths_count = school_data_complete[(school_data_complete["maths_score"] >= 50)].count()["student_name"]
passing_maths_percentage = passing_maths_count / float(student_count) * 100
passing_reading_count =school_data_complete[(school_data_complete["reading_score"] >= 70)].count()["student_name"]
passing_reading_percentage =passing_reading_count/float(student_count)*100

passing_maths_reading_count = school_data_complete[(school_data_complete["maths_score"] >= 70) & (school_data_complete["reading_score"] >= 70)
].count()["student_name"]

overall_passing_rate =passing_maths_reading_count/float(student_count) * 100


In [5]:
# Convert to DataFrame
area_summary =pd.DataFrame({
    "Total Number of Unique Schools": [school_count],
    "Total Students": [student_count],
    "Total Budget": [total_budget],
    "Average Math Score": [average_maths_score],
    "Average Reading Score": [average_reading_score],
    "% of Students Passing Math": [passing_maths_percentage],
    "% of Students Passing Reading": [passing_reading_percentage]
})

# Formatting
area_summary["Total Students"] = area_summary["Total Students"].map("{:,}".format)
area_summary["Total Budget"] = area_summary["Total Budget"].map("${:,.2f}".format)

# Display the DataFrame
area_summary


Unnamed: 0,Total Number of Unique Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% of Students Passing Math,% of Students Passing Reading
0,15,39170,"$24,649,428.00",70.338192,69.980138,86.078632,50.814399


## School Summary

In [6]:
# Use the code provided to select the type per school from school_data
school_types = school_data.set_index(["school_name"])["type"]
school_summary = school_data_complete.groupby(['school_name'])
school_names = school_data_complete.school_name.sort_values().unique()

# Calculate the total student count per school from school_data
per_school_counts =school_data.groupby(["school_name"]).sum()["size"]
school_budget = list(school_summary.budget.mean())
school_total_students = list(school_summary.student_name.count())

# Calculate the total school budget and per capita spending per school from school_data
per_school_budget = [i/j for i,j in zip(school_budget,school_total_students)]
#per_school_capita =per_school_budget / per_school_counts
per_school_capita = per_school_budget / per_school_counts

# Calculate the average test scores per school from school_data_complete
per_school_maths =list(school_summary.maths_score.mean())
per_school_reading =list(school_summary.reading_score.mean())



In [7]:
# Get the students who passed maths and passed reading by creating separate filtered DataFrames from school_data_complete.
school_passing_maths =school_data_complete[(school_data_complete["maths_score"] >= 70)]
school_passing_reading =school_data_complete[(school_data_complete["reading_score"] >= 70)]

# Get the students who passed both reading and maths in a separate DataFrame from school_data_complete.
passing_maths_and_reading =school_data_complete[
    (school_data_complete["reading_score"] >= 70) & (school_data_complete["maths_score"] >= 70)
]


In [8]:
#  Calculate the Percentage Pass Rates
per_school_passing_maths = school_passing_maths.groupby(["school_name"]).count()["student_name"] / per_school_counts * 100
per_school_passing_reading =school_passing_reading.groupby(["school_name"]).count()["student_name"] / per_school_counts * 100
overall_passing_rate =passing_maths_and_reading.groupby(["school_name"]).count()["student_name"] / per_school_counts * 100


In [9]:
# Convert to DataFrame
school_summary = school_data_complete.groupby(['school_name'])
school_names = school_data_complete.school_name.sort_values().unique()
school_types = school_data.sort_values(by="school_name").type
school_total_students = list(school_summary.student_name.count())
school_budget = list(school_summary.budget.mean())
school_per_student_budget = [i/j for i,j in zip(school_budget,school_total_students)]
school_avg_math_score = list(school_summary.maths_score.mean())
school_avg_reading_score = list(school_summary.reading_score.mean())

# Calculating passing percentages
school_summary = school_data_complete[school_data_complete['maths_score'] >= 70].groupby(['school_name'])
school_pct_passing_math = [(i/j)*100 for i,j in zip(school_summary.maths_score.count(),school_total_students)]
school_summary = school_data_complete[school_data_complete['reading_score'] >= 70].groupby(['school_name'])
school_pct_passing_reading = [(i/j)*100 for i,j in zip(school_summary.reading_score.count(),school_total_students)]
school_overall_passing = [(i+j)/2 for i,j in zip(school_pct_passing_math,school_pct_passing_reading)]

# Compile all calculations into dataframe
school_summary_df = pd.DataFrame({"School Names":school_names,
                                  "School Type":school_types,
                                  "Total Students":school_total_students,
                                  "Total School Budget":school_budget,
                                  "Per Student Budget":school_per_student_budget,
                                  "Average Math Score":school_avg_math_score,
                                  "Average Reading Score":school_avg_reading_score,
                                  "% Passing Math":school_pct_passing_math,
                                  "% Passing Reading":school_pct_passing_reading,
                                  "Overall Passing Rate":school_overall_passing})

school_summary_df = school_summary_df.reset_index(drop=True)
school_summary_df
# Display the DataFrame


Unnamed: 0,School Names,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
0,Bailey High School,Government,4976,3124928.0,628.0,72.352894,71.008842,55.22508,52.451768,53.838424
1,Cabrera High School,Independent,1858,1081356.0,582.0,71.657158,71.359526,53.175457,53.2831,53.229279
2,Figueroa High School,Government,2949,1884411.0,639.0,68.698542,69.077993,47.677179,49.16921,48.423194
3,Ford High School,Government,2739,1763916.0,644.0,69.091274,69.572472,48.959474,50.492881,49.726177
4,Griffin High School,Independent,1468,917500.0,625.0,71.788147,71.245232,54.700272,54.155313,54.427793
5,Hernandez High School,Government,4635,3022020.0,652.0,68.874865,69.186408,49.255663,49.471413,49.363538
6,Holden High School,Independent,427,248087.0,581.0,72.583138,71.660422,57.142857,55.503513,56.323185
7,Huang High School,Government,2917,1910635.0,655.0,68.935207,68.910525,48.920123,49.777168,49.348646
8,Johnson High School,Government,4761,3094650.0,650.0,68.8431,69.039277,48.771267,48.603235,48.687251
9,Pena High School,Independent,962,585858.0,609.0,72.088358,71.613306,55.405405,55.093555,55.24948


## Top Performing Schools (By % Overall Passing)

In [10]:
# Sort and show top five schools
top_schools =school_summary_df.sort_values(["Overall Passing Rate"], ascending=False)
top_schools.head(5)


Unnamed: 0,School Names,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
6,Holden High School,Independent,427,248087.0,581.0,72.583138,71.660422,57.142857,55.503513,56.323185
9,Pena High School,Independent,962,585858.0,609.0,72.088358,71.613306,55.405405,55.093555,55.24948
4,Griffin High School,Independent,1468,917500.0,625.0,71.788147,71.245232,54.700272,54.155313,54.427793
0,Bailey High School,Government,4976,3124928.0,628.0,72.352894,71.008842,55.22508,52.451768,53.838424
10,Rodriguez High School,Government,3999,2547363.0,637.0,72.047762,70.935984,54.513628,52.788197,53.650913


## Bottom Performing Schools (By % Overall Passing)

In [11]:
# Sort and show bottom five schools
bottom_schools =school_summary_df.sort_values(["Overall Passing Rate"], ascending=True)
bottom_schools.head(5)


Unnamed: 0,School Names,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
2,Figueroa High School,Government,2949,1884411.0,639.0,68.698542,69.077993,47.677179,49.16921,48.423194
8,Johnson High School,Government,4761,3094650.0,650.0,68.8431,69.039277,48.771267,48.603235,48.687251
13,Wilson High School,Independent,2283,1319574.0,578.0,69.170828,68.876916,49.364871,48.313622,48.839247
7,Huang High School,Government,2917,1910635.0,655.0,68.935207,68.910525,48.920123,49.777168,49.348646
5,Hernandez High School,Government,4635,3022020.0,652.0,68.874865,69.186408,49.255663,49.471413,49.363538


## Maths Scores by Year

In [12]:
# Create data series of scores by year levels using conditionals
year_nine = school_data_complete[(school_data_complete["year"] == 9)]
year_ten = school_data_complete[(school_data_complete["year"] == 10)]
year_eleven = school_data_complete[(school_data_complete["year"] == 11)]
year_twelve = school_data_complete[(school_data_complete["year"] == 12)]

# Group each by school name
year_nine_scores =school_data_complete.loc[school_data_complete["year"] == 9].groupby("school_name")["maths_score"].mean()
year_ten_scores =school_data_complete.loc[school_data_complete["year"] == 10].groupby("school_name")["maths_score"].mean()
year_eleven_scores =school_data_complete.loc[school_data_complete["year"] == 11].groupby("school_name")["maths_score"].mean()
year_twelve_scores =school_data_complete.loc[school_data_complete["year"] == 12].groupby("school_name")["maths_score"].mean()

# Combine series into single DataFrame
maths_scores_by_year =pd.concat([year_nine_scores, year_ten_scores, year_eleven_scores, year_twelve_scores], axis=1)


# Minor data wrangling
maths_scores_by_year.index.name = None
maths_scores_by_year.columns = ["9th Grade", "10th Grade", "11th Grade", "12th Grade"]
maths_scores_by_year[["9th Grade", "10th Grade", "11th Grade", "12th Grade"]] = maths_scores_by_year[["9th Grade", "10th Grade", "11th Grade", "12th Grade"]].applymap("{:,.2f}%".format)

# Display the DataFrame
maths_scores_by_year


  maths_scores_by_year[["9th Grade", "10th Grade", "11th Grade", "12th Grade"]] = maths_scores_by_year[["9th Grade", "10th Grade", "11th Grade", "12th Grade"]].applymap("{:,.2f}%".format)


Unnamed: 0,9th Grade,10th Grade,11th Grade,12th Grade
Bailey High School,72.49%,71.90%,72.37%,72.68%
Cabrera High School,72.32%,72.44%,71.01%,70.60%
Figueroa High School,68.48%,68.33%,68.81%,69.33%
Ford High School,69.02%,69.39%,69.25%,68.62%
Griffin High School,72.79%,71.09%,71.69%,71.47%
Hernandez High School,68.59%,68.87%,69.15%,68.99%
Holden High School,70.54%,75.11%,71.64%,73.41%
Huang High School,69.08%,68.53%,69.43%,68.64%
Johnson High School,69.47%,67.99%,68.64%,69.29%
Pena High School,72.00%,72.40%,72.52%,71.19%


## Reading Score by Year

In [13]:
# Create data series of scores by year levels using conditionals
year_nine = school_data_complete[(school_data_complete["year"] == 9)]
year_ten = school_data_complete[(school_data_complete["year"] == 10)]
year_eleven = school_data_complete[(school_data_complete["year"] == 11)]
year_twelve = school_data_complete[(school_data_complete["year"] == 12)]

# Group each by school name
year_nine_scores =school_data_complete.loc[school_data_complete["year"] == 9].groupby("school_name")["reading_score"].mean()
year_ten_scores =school_data_complete.loc[school_data_complete["year"] == 10].groupby("school_name")["reading_score"].mean()
year_eleven_scores =school_data_complete.loc[school_data_complete["year"] == 11].groupby("school_name")["reading_score"].mean()
year_twelve_scores =school_data_complete.loc[school_data_complete["year"] == 12].groupby("school_name")["reading_score"].mean()

# Combine series into single DataFrame
reading_scores_by_year =pd.concat([year_nine_scores,year_ten_scores,year_eleven_scores,year_twelve_scores], axis=1)


# Minor data wrangling
reading_scores_by_year.index.name = None
reading_scores_by_year.columns = ["9th Grade", "10th Grade", "11th Grade", "12th Grade"]
reading_scores_by_year[["9th Grade", "10th Grade", "11th Grade", "12th Grade"]] = reading_scores_by_year[["9th Grade", "10th Grade", "11th Grade", "12th Grade"]].applymap("{:,.2f}%".format)

# Display the DataFrame
reading_scores_by_year


  reading_scores_by_year[["9th Grade", "10th Grade", "11th Grade", "12th Grade"]] = reading_scores_by_year[["9th Grade", "10th Grade", "11th Grade", "12th Grade"]].applymap("{:,.2f}%".format)


Unnamed: 0,9th Grade,10th Grade,11th Grade,12th Grade
Bailey High School,70.90%,70.85%,70.32%,72.20%
Cabrera High School,71.17%,71.33%,71.20%,71.86%
Figueroa High School,70.26%,67.68%,69.15%,69.08%
Ford High School,69.62%,68.99%,70.74%,68.85%
Griffin High School,72.03%,70.75%,72.39%,69.43%
Hernandez High School,68.48%,70.62%,68.42%,69.24%
Holden High School,71.60%,71.10%,73.31%,70.48%
Huang High School,68.67%,69.52%,68.74%,68.67%
Johnson High School,68.72%,69.30%,69.97%,67.99%
Pena High School,70.95%,72.32%,71.70%,71.51%


## Scores by School Spending

In [14]:
# Establish the bins
spending_bins = [0, 585, 630, 645, 680]
group_names = ["<$585", "$585-630", "$630-645", "$645-680"]


In [15]:
# Create a copy of the school summary since it has the "Per Student Budget"
#  This step can be skipped but its best to make a copy.
school_spending_df = school_summary_df


In [16]:
scores_spending = school_summary_df.loc[:,['Average Math Score',
                                  'Average Reading Score','% Passing Math',
                                  '% Passing Reading','Overall Passing Rate',]]
# Add a new columns named Spending Ranges (Per Student) and binning based off budget per student
scores_spending['Spending Ranges (Per Student)']= pd.cut(school_summary_df['Per Student Budget'],spending_bins,labels=group_names)
# Create a group based off of the bins
scores_spending = scores_spending.groupby('Spending Ranges (Per Student)').mean()
scores_spending.head()

  scores_spending = scores_spending.groupby('Spending Ranges (Per Student)').mean()


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,71.364587,70.716577,53.531907,52.316726,52.924316
$585-630,72.065868,71.031297,55.074882,52.747703,53.911292
$630-645,69.854807,69.838814,50.12702,50.788413,50.457716
$645-680,68.884391,69.045403,48.982351,49.283939,49.133145


## Scores by School Size

In [17]:
# Establish the bins.
size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]


In [18]:
# Create a new data frame by locating the desired columns
scores_size = school_summary_df.loc[:,['Average Math Score',
                                  'Average Reading Score','% Passing Math',
                                  '% Passing Reading','Overall Passing Rate',]]
# Add a new columns named School Size and binning based off total students
scores_size['School Size']= pd.cut(school_summary_df['Total Students'],size_bins,labels=group_names)
# Create a group based off of the bins
scores_size = scores_size.groupby('School Size').mean()
scores_size.head()

  scores_size = scores_size.groupby('School Size').mean()


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),72.335748,71.636864,56.274131,55.298534,55.786333
Medium (1000-2000),71.42165,70.720164,53.329348,51.919724,52.624536
Large (2000-5000),69.751809,69.576052,50.335911,50.133437,50.234674


## Scores by School Type

In [19]:
# Create new series using groupby for:
# Type | Average Maths Score | Average Reading Score | % Passing Maths | % Passing Reading | % Overall Passing

type_maths_scores = school_summary_df.groupby(["School Type"])["Average Math Score"].mean()
type_reading_scores = school_summary_df.groupby(["School Type"])["Average Reading Score"].mean()
type_passing_maths = school_summary_df.groupby(["School Type"])["% Passing Math"].mean()
type_passing_reading = school_summary_df.groupby(["School Type"])["% Passing Reading"].mean()
type_overall_passing = school_summary_df.groupby(["School Type"])["Overall Passing Rate"].mean()


In [20]:
scores_type = school_summary_df[['School Type','Average Math Score',
                                  'Average Reading Score','% Passing Math',
                                  '% Passing Reading','Overall Passing Rate',]]
# Create a group based off of the school type
scores_type = scores_type.groupby('School Type').mean()
scores_type.head()

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Government,69.834806,69.675929,50.474631,50.39341,50.434021
Independent,71.368822,70.718933,53.569984,52.313664,52.941824
