In [271]:
# Dependencies and Setup
from pathlib import Path
import pandas as pd

# Input CSV locations (edit these if the Resources folder lives elsewhere)
school_data_to_load = Path("Resources") / "schools_complete.csv"
student_data_to_load = Path("Resources") / "students_complete.csv"

# Load each CSV into its own DataFrame
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Attach each student's school attributes; the left join keeps every student row.
school_data_complete = student_data.merge(school_data, how="left", on="school_name")

In [272]:
# Display the merged student/school frame (the rendered output elides the middle rows).
school_data_complete

Unnamed: 0,Student ID,student_name,gender,year,school_name,reading_score,maths_score,School ID,type,size,budget
0,0,Paul Bradley,M,9,Huang High School,96,94,0,Government,2917,1910635
1,1,Victor Smith,M,12,Huang High School,90,43,0,Government,2917,1910635
2,2,Kevin Rodriguez,M,12,Huang High School,41,76,0,Government,2917,1910635
3,3,Richard Scott,M,12,Huang High School,89,86,0,Government,2917,1910635
4,4,Bonnie Ray,F,9,Huang High School,87,69,0,Government,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12,Thomas High School,51,48,14,Independent,1635,1043130
39166,39166,Dawn Bell,F,10,Thomas High School,81,89,14,Independent,1635,1043130
39167,39167,Rebecca Tanner,F,9,Thomas High School,99,99,14,Independent,1635,1043130
39168,39168,Desiree Kidd,F,10,Thomas High School,72,77,14,Independent,1635,1043130


In [273]:
# Local Government Area (LGA) Summary
# Number of distinct school names (missing names are not counted)
school_count = len(school_data_complete["school_name"].dropna().unique())

In [274]:
# Total students: rows whose student_name is present
student_count = school_data_complete["student_name"].notna().sum()

In [275]:
# Total budget: take the budget from the first row seen for each school, then add them up
first_row_per_school = ~school_data_complete["school_name"].duplicated()
schools_budget = school_data_complete.loc[first_row_per_school, "budget"].sum()

In [276]:
# Mean maths score over all student rows
average_maths_score = school_data_complete.loc[:, "maths_score"].mean()

In [277]:
# Mean reading score over all student rows
average_reading_score = school_data_complete.loc[:, "reading_score"].mean()

In [278]:
# Students with a maths score of 50 or more, as a count and as a percentage of student_count
maths_pass_rows = school_data_complete["maths_score"] >= 50
passing_maths_count = school_data_complete.loc[maths_pass_rows, "student_name"].count()
passing_maths_percentage = (passing_maths_count / student_count) * 100

In [279]:
# Students with a reading score of 50 or more, as a count and as a percentage of student_count
reading_pass_rows = school_data_complete["reading_score"] >= 50
passing_reading_count = school_data_complete.loc[reading_pass_rows, "student_name"].count()
passing_reading_percentage = (passing_reading_count / student_count) * 100

In [280]:
# Row masks for the pass mark (a score of 50 or more) in each subject.
passed_maths = school_data_complete["maths_score"] >= 50
passed_reading = school_data_complete["reading_score"] >= 50

# Filtered frames, one per subject.
students_passing_math = school_data_complete[passed_maths]
students_passing_reading = school_data_complete[passed_reading]

# Students who passed both subjects: AND the two masks on the original rows so each
# record is counted exactly once. An inner merge of the two filtered frames joins on
# every shared column, which is a costly self-join and produces extra rows whenever
# two records are identical in all columns.
intersection_count = school_data_complete.loc[passed_maths & passed_reading, "student_name"].count()

In [281]:
# Percentage of students passing both reading and maths
overall_passing_percentage = (intersection_count / student_count) * 100

In [282]:
# LGA Summary
# Column label -> LGA-wide metric, in the order the columns should appear
lga_metrics = {
    'Total Schools': school_count,
    'Total Students': student_count,
    'Total Budget': schools_budget,
    'Average Maths Score': average_maths_score,
    'Average Reading Score': average_reading_score,
    '% Passing Maths': passing_maths_percentage,
    '% Passing Reading': passing_reading_percentage,
    '% Overall Passing': overall_passing_percentage,
}

# Wrap every scalar in a one-element list so the DataFrame has exactly one row
LGA_summary_map = {label: [value] for label, value in lga_metrics.items()}

LGA_summary_data = pd.DataFrame(LGA_summary_map)

In [283]:
# Local Government Area (LGA) Summary
# Display the summary frame; it has a single row because every metric was wrapped in a one-element list.
LGA_summary_data

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
0,15,39170,24649428,70.338192,69.980138,86.078632,84.426857,72.808272


In [284]:
# School Summary work below

In [285]:
# School type per school, as a two-column frame (school_name, type)
school_types = school_data[["school_name", "type"]].reset_index(drop=True)

# Number of student rows per school
per_school_counts = (
    student_data
    .groupby("school_name")
    .size()
    .rename("student_count")
    .reset_index()
)

# Budget per school, with the column renamed for the summary table
per_school_budget = (
    school_data[["school_name", "budget"]]
    .rename(columns={"budget": "school_budget"})
    .reset_index(drop=True)
)

# Per-student spending: school budget divided by the counted students
df_merged = per_school_counts.merge(per_school_budget, on="school_name").assign(
    per_capita_budget=lambda frame: frame["school_budget"] / frame["student_count"]
)
per_capita_budget = df_merged.loc[:, ["school_name", "per_capita_budget"]]

# Mean maths and reading scores per school from the merged student/school frame
scores_by_school = school_data_complete.groupby("school_name")
per_school_maths = scores_by_school["maths_score"].mean().reset_index(name="average_maths_score")
per_school_reading = scores_by_school["reading_score"].mean().reset_index(name="average_reading_score")

In [286]:
# Per-school pass rates. The pass mark is a score of 50 or more.
# Denominator: number of student rows per school.
total_students_per_school = school_data_complete.groupby('school_name').size()

# Students per school who passed maths. A school with no passing students does not appear
# in the filtered groupby, so dividing by the totals would align it to NaN; reindexing
# against the full school index with fill_value=0 reports such a school as 0% instead.
students_passing_math = (
    school_data_complete[school_data_complete['maths_score'] >= 50]
    .groupby('school_name')
    .size()
    .reindex(total_students_per_school.index, fill_value=0)
)
school_passing_maths = (students_passing_math / total_students_per_school * 100).reset_index(name='math_passing_percentage')

# Same calculation for reading.
students_passing_reading = (
    school_data_complete[school_data_complete['reading_score'] >= 50]
    .groupby('school_name')
    .size()
    .reindex(total_students_per_school.index, fill_value=0)
)
school_passing_reading = (students_passing_reading / total_students_per_school * 100).reset_index(name='reading_passing_percentage')

# Students who passed both reading and maths, kept as a separate filtered DataFrame.
students_passing_maths_and_reading = school_data_complete[(school_data_complete['reading_score'] >= 50) & (school_data_complete['maths_score'] >= 50)]
passed_both_per_school = (
    students_passing_maths_and_reading
    .groupby('school_name')
    .size()
    .reindex(total_students_per_school.index, fill_value=0)
)
school_overall_passing_rate = (passed_both_per_school / total_students_per_school * 100).reset_index(name='overall_passing_rate')

In [304]:
# School Summary
# Start from the school types and join each per-school metric on school_name, in column order.
school_summary_data = school_types
for metric_frame in (
    per_school_counts,
    per_school_budget,
    per_capita_budget,
    per_school_maths,
    per_school_reading,
    school_passing_reading,
    school_passing_maths,
    school_overall_passing_rate,
):
    school_summary_data = pd.merge(school_summary_data, metric_frame, on='school_name')

# Render the two money columns as dollar strings with thousands separators and two decimals.
for money_column in ('school_budget', 'per_capita_budget'):
    school_summary_data[money_column] = school_summary_data[money_column].map('${:,.2f}'.format)



In [294]:
# Display the per-school summary; school_budget and per_capita_budget hold '$'-formatted strings here.
school_summary_data

Unnamed: 0,school_name,type,student_count,school_budget,per_capita_budget,average_maths_score,average_reading_score,reading_passing_percentage,math_passing_percentage,overall_passing_rate
0,Huang High School,Government,2917,"$1,910,635.00",$655.00,68.935207,68.910525,81.453548,81.693521,66.712376
1,Figueroa High School,Government,2949,"$1,884,411.00",$639.00,68.698542,69.077993,82.807731,81.654798,67.650051
2,Shelton High School,Independent,1761,"$1,056,600.00",$600.00,72.034072,70.257808,86.712095,91.538898,78.875639
3,Hernandez High School,Government,4635,"$3,022,020.00",$652.00,68.874865,69.186408,81.877023,80.949299,66.364617
4,Griffin High School,Independent,1468,"$917,500.00",$625.00,71.788147,71.245232,88.487738,91.212534,81.33515
5,Wilson High School,Independent,2283,"$1,319,574.00",$578.00,69.170828,68.876916,81.29654,82.785808,67.455103
6,Cabrera High School,Independent,1858,"$1,081,356.00",$582.00,71.657158,71.359526,89.074273,90.850377,80.785791
7,Bailey High School,Government,4976,"$3,124,928.00",$628.00,72.352894,71.008842,87.379421,91.639871,80.084405
8,Holden High School,Independent,427,"$248,087.00",$581.00,72.583138,71.660422,88.52459,89.929742,78.922717
9,Pena High School,Independent,962,"$585,858.00",$609.00,72.088358,71.613306,86.590437,91.683992,79.209979
