In [1]:
### Note - Analysis Write-up is in the readme file.

# Dependencies and Setup
import pandas as pd

# Read the school-level and student-level CSV files
school_data = pd.read_csv("Resources/schools_complete.csv")
student_data = pd.read_csv("Resources/students_complete.csv")

# Left join: keep every student row and attach the attributes of that student's school.
# The join key is named once (it was previously listed twice), and
# validate="many_to_one" makes the merge fail loudly if a school name is repeated
# in the schools file, which would otherwise silently duplicate student rows.
school_data_complete = pd.merge(
    student_data,
    school_data,
    how="left",
    on="school_name",
    validate="many_to_one",
)
school_data_complete.head()

Unnamed: 0,Student ID,student_name,gender,year,school_name,reading_score,maths_score,School ID,type,size,budget
0,0,Paul Bradley,M,9,Huang High School,96,94,0,Government,2917,1910635
1,1,Victor Smith,M,12,Huang High School,90,43,0,Government,2917,1910635
2,2,Kevin Rodriguez,M,12,Huang High School,41,76,0,Government,2917,1910635
3,3,Richard Scott,M,12,Huang High School,89,86,0,Government,2917,1910635
4,4,Bonnie Ray,F,9,Huang High School,87,69,0,Government,2917,1910635


In [2]:
### LGA Summary -> Number of unique schools | Total students + budget | Average maths + reading score | % passing (50%) in reading/maths + overall (passed both)

# Number of distinct school names in the merged data
# (dropna=False so a missing name is still counted as its own value, as unique() does)
school_count = school_data_complete["school_name"].nunique(dropna=False)

# Total students - just counting rows | Created function to format number with comma spacing
def comma_spacing(value):
    """Return ``value`` as a string with comma thousands separators, e.g. 39170 -> '39,170'."""
    return format(value, ",")
# Total students: number of rows with a recorded student name, comma-formatted
student_count = comma_spacing(school_data_complete["student_name"].count())

# Total budget - one budget figure per school.
# Keep the first row of each school instead of de-duplicating on the budget value
# itself: two schools that happen to share the same budget must both be counted.
# This also avoids a Python-level loop over every student row.
budget_list = (
    school_data_complete
    .drop_duplicates(subset="school_name")["budget"]
    .tolist()
)

# Create function to automatically convert formatting to currency (dollar sign + 2 decimal places)
def currency(value):
    """Return ``value`` as a dollar string with comma separators and two decimals, e.g. '$1,234.50'."""
    return "$" + format(value, ",.2f")

# Add up the per-school budgets, then render the total as a currency string
budget_total = sum(budget_list)
total_budget = currency(budget_total)

# Average maths score - take maths column and average it | Create function to format to 6 decimal places
def six_decimals(value):
    """Return ``value`` as a fixed-point string with exactly six decimal places."""
    return format(value, ".6f")

# Mean maths and reading score across all students, formatted to six decimal places
average_maths, average_reading = (
    six_decimals(school_data_complete[subject].mean())
    for subject in ("maths_score", "reading_score")
)

# % Passing maths - rows scoring 50 or more, as a share of all recorded maths scores
passing_maths = school_data_complete[school_data_complete["maths_score"].ge(50)]
maths_pass_ratio = passing_maths["maths_score"].count() / school_data_complete["maths_score"].count()
passing_maths_percent = six_decimals(maths_pass_ratio * 100)

# % Passing reading - same calculation on the reading column
passing_reading = school_data_complete[school_data_complete["reading_score"].ge(50)]
reading_pass_ratio = passing_reading["reading_score"].count() / school_data_complete["reading_score"].count()
passing_reading_percent = six_decimals(reading_pass_ratio * 100)

# % Overall passing - students who scored 50 or more in BOTH reading and maths
passing_overall = school_data_complete[
    school_data_complete["reading_score"].ge(50) & school_data_complete["maths_score"].ge(50)
]
overall_pass_ratio = passing_overall["Student ID"].count() / school_data_complete["Student ID"].count()
passing_overall_percent = six_decimals(overall_pass_ratio * 100)

# Summary table: a single data row with the eight LGA-level figures as columns.
# Every value except the school count is already a pre-formatted string.
summary_list = [
    {
        "Total Schools": school_count,
        "Total Students": student_count,
        "Total Budget": total_budget,
        "Average Maths Score": average_maths,
        "Average Reading Score": average_reading,
        "% Passing Maths": passing_maths_percent,
        "% Passing Reading": passing_reading_percent,
        "% Overall Passing": passing_overall_percent,
    }
]

lga_summary = pd.DataFrame(summary_list)
lga_summary


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",70.338192,69.980138,86.078632,84.426857,72.808272


In [3]:
### School Summary - School name + type | total students + budget + per student budget | average reading/maths score | % Passing

# Group the merged student rows by school
groupby_school = school_data_complete.groupby(["school_name"])

# School type - taken from the first row of each school.
# .first() yields the plain value ("Government"); the previous
# .unique().astype(str) stored the array repr "['Government']" instead.
school_type = groupby_school["type"].first()

# Total students per school (number of student rows in each group)
total_student = groupby_school.size()

# Total budget per school - again the first row's value, which stays numeric
# without converting one-element arrays to integers
total_school_budget = groupby_school["budget"].first()

# Per student budget
per_student_budget = total_school_budget / total_student

# Average subject scores per school
average_school_reading = groupby_school["reading_score"].mean()
average_school_maths = groupby_school["maths_score"].mean()

# % Pass per school for each subject + overall.
# Each student is flagged True/False and the flags are averaged within a school:
# the mean of a boolean column is passers / students. Each subject uses its own
# score column, and a school with no passing students gets 0 rather than NaN.
passed_reading = school_data_complete["reading_score"] >= 50
passed_maths = school_data_complete["maths_score"] >= 50
student_school = school_data_complete["school_name"]

pass_school_reading = passed_reading.groupby(student_school).mean() * 100
pass_school_maths = passed_maths.groupby(student_school).mean() * 100
pass_school_overall = (passed_maths & passed_reading).groupby(student_school).mean() * 100


# Assemble the per-school series side by side; each mapping key becomes a column label
school_summary_columns = {
    "School Type": school_type,
    "Total Students": total_student,
    "Total School Budget": total_school_budget,
    "Per Student Budget": per_student_budget,
    "Average Maths Score": average_school_maths,
    "Average Reading Score": average_school_reading,
    "% Passing Maths": pass_school_maths,
    "% Passing Reading": pass_school_reading,
    "% Overall Passing": pass_school_overall,
}

school_summary = pd.concat(
    list(school_summary_columns.values()),
    axis=1,
    keys=list(school_summary_columns.keys()),
)

# Formatting summary dataframe
## Work on a copy made only for display, so school_summary keeps its numeric
## columns for the later spending / size / type analysis
school_summary_formatted = school_summary.copy()

## Clean [School Type] for display: remove any surrounding brackets, quotes and spaces.
## str.strip takes a plain set of characters, so no regular expression is involved
## (a bare "[" is not a valid regex and fails under regex=True on pandas 2.0+).
school_summary_formatted["School Type"] = (
    school_summary_formatted["School Type"].astype(str).str.strip("[]' ")
)

## Convert [Total School Budget] & [Per Student Budget] into currency format
for budget_column in ("Total School Budget", "Per Student Budget"):
    school_summary_formatted[budget_column] = (
        school_summary_formatted[budget_column].astype("float").map(currency)
    )

school_summary_formatted

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,'Government',4976,"$3,124,928.00",$628.00,72.352894,71.008842,91.639871,87.379421,80.084405
Cabrera High School,'Independent',1858,"$1,081,356.00",$582.00,71.657158,71.359526,90.850377,89.074273,80.785791
Figueroa High School,'Government',2949,"$1,884,411.00",$639.00,68.698542,69.077993,81.654798,82.807731,67.650051
Ford High School,'Government',2739,"$1,763,916.00",$644.00,69.091274,69.572472,82.438846,82.219788,67.46988
Griffin High School,'Independent',1468,"$917,500.00",$625.00,71.788147,71.245232,91.212534,88.487738,81.33515
Hernandez High School,'Government',4635,"$3,022,020.00",$652.00,68.874865,69.186408,80.949299,81.877023,66.364617
Holden High School,'Independent',427,"$248,087.00",$581.00,72.583138,71.660422,89.929742,88.52459,78.922717
Huang High School,'Government',2917,"$1,910,635.00",$655.00,68.935207,68.910525,81.693521,81.453548,66.712376
Johnson High School,'Government',4761,"$3,094,650.00",$650.00,68.8431,69.039277,82.062592,81.978576,67.191766
Pena High School,'Independent',962,"$585,858.00",$609.00,72.088358,71.613306,91.683992,86.590437,79.209979


In [4]:
### Top 5 Performing Schools - % Overall Passing

# Order every school from highest to lowest overall pass rate, then show the first five
top_school_overall = school_summary_formatted.sort_values(
    by="% Overall Passing",
    ascending=False,
)
top_school_overall.head()

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Griffin High School,'Independent',1468,"$917,500.00",$625.00,71.788147,71.245232,91.212534,88.487738,81.33515
Cabrera High School,'Independent',1858,"$1,081,356.00",$582.00,71.657158,71.359526,90.850377,89.074273,80.785791
Bailey High School,'Government',4976,"$3,124,928.00",$628.00,72.352894,71.008842,91.639871,87.379421,80.084405
Wright High School,'Independent',1800,"$1,049,400.00",$583.00,72.047222,70.969444,91.777778,86.666667,79.722222
Rodriguez High School,'Government',3999,"$2,547,363.00",$637.00,72.047762,70.935984,90.797699,87.396849,79.419855


In [11]:
### Bottom 5 Performing Schools - % Overall Passing
# Order every school from lowest to highest overall pass rate, then show the first five
bottom_school_overall = school_summary_formatted.sort_values(by="% Overall Passing")
bottom_school_overall.head()

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Hernandez High School,'Government',4635,"$3,022,020.00",$652.00,68.874865,69.186408,80.949299,81.877023,66.364617
Huang High School,'Government',2917,"$1,910,635.00",$655.00,68.935207,68.910525,81.693521,81.453548,66.712376
Johnson High School,'Government',4761,"$3,094,650.00",$650.00,68.8431,69.039277,82.062592,81.978576,67.191766
Wilson High School,'Independent',2283,"$1,319,574.00",$578.00,69.170828,68.876916,82.785808,81.29654,67.455103
Ford High School,'Government',2739,"$1,763,916.00",$644.00,69.091274,69.572472,82.438846,82.219788,67.46988


In [6]:
### Average Maths Scores by Year across Schools

# Mean maths score per school, computed separately for each year level.
# The score column is selected BEFORE .mean() so that text columns (student name,
# gender, school type) are never averaged - pandas 2.0+ raises a TypeError when
# asked to average non-numeric columns.
year_levels = (9, 10, 11, 12)
maths_by_year = {
    year_level: (
        school_data_complete.loc[school_data_complete["year"] == year_level]
        .groupby("school_name")["maths_score"]
        .mean()
    )
    for year_level in year_levels
}

# Keep the individual per-year series available under their original names
year_9_maths = maths_by_year[9]
year_10_maths = maths_by_year[10]
year_11_maths = maths_by_year[11]
year_12_maths = maths_by_year[12]

# One column per year level, one row per school
year_maths_summary = pd.concat(
    list(maths_by_year.values()),
    axis=1,
    keys=["Year {}".format(year_level) for year_level in maths_by_year],
)

year_maths_summary


Unnamed: 0_level_0,Year 9,Year 10,Year 11,Year 12
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,72.493827,71.897498,72.3749,72.675097
Cabrera High School,72.32197,72.437768,71.008299,70.604712
Figueroa High School,68.477804,68.331586,68.811001,69.325282
Ford High School,69.021609,69.387006,69.248862,68.617811
Griffin High School,72.789731,71.093596,71.692521,71.469178
Hernandez High School,68.586831,68.867156,69.154412,68.985075
Holden High School,70.543307,75.105263,71.640777,73.409639
Huang High School,69.081754,68.533246,69.431345,68.639316
Johnson High School,69.469286,67.99022,68.63773,69.287393
Pena High School,71.996364,72.396,72.523438,71.187845


In [7]:
### Average Reading Scores by Year across Schools

# Mean reading score per school, computed separately for each year level.
# The score column is selected BEFORE .mean() so that text columns (student name,
# gender, school type) are never averaged - pandas 2.0+ raises a TypeError when
# asked to average non-numeric columns.
year_levels = (9, 10, 11, 12)
reading_by_year = {
    year_level: (
        school_data_complete.loc[school_data_complete["year"] == year_level]
        .groupby("school_name")["reading_score"]
        .mean()
    )
    for year_level in year_levels
}

# Keep the individual per-year series available under their original names
year_9_reading = reading_by_year[9]
year_10_reading = reading_by_year[10]
year_11_reading = reading_by_year[11]
year_12_reading = reading_by_year[12]

# One column per year level, one row per school
year_reading_summary = pd.concat(
    list(reading_by_year.values()),
    axis=1,
    keys=["Year {}".format(year_level) for year_level in reading_by_year],
)

year_reading_summary

Unnamed: 0_level_0,Year 9,Year 10,Year 11,Year 12
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,70.90192,70.848265,70.317346,72.195525
Cabrera High School,71.172348,71.328326,71.201245,71.856021
Figueroa High School,70.261682,67.677588,69.152327,69.082126
Ford High School,69.615846,68.988701,70.735964,68.849722
Griffin High School,72.026895,70.746305,72.385042,69.434932
Hernandez High School,68.477569,70.621842,68.418199,69.244136
Holden High School,71.598425,71.096491,73.31068,70.481928
Huang High School,68.670616,69.516297,68.740638,68.671795
Johnson High School,68.719286,69.295029,69.969115,67.992521
Pena High School,70.949091,72.324,71.703125,71.513812


In [8]:
### Scores by School Spending
# Create bins + labels (per-student budget ranges, in dollars)
spending_bins = [0, 585, 630, 645, 680]
labels = ["<$585", "$585-630", "$630-645", "$645-680"]

# Assign each school to one of those spending ranges
school_summary_spending = pd.cut(
    school_summary["Per Student Budget"],
    spending_bins,
    labels=labels,
    include_lowest=True,
)

# Mean of each score column per spending range, computed in a single groupby.
# The score columns are selected BEFORE .mean() so the text "School Type" column is
# never averaged (pandas 2.0+ raises a TypeError on non-numeric columns).
# observed=False keeps every spending range in the result, even one with no schools.
score_columns = [
    "Average Maths Score",
    "Average Reading Score",
    "% Passing Maths",
    "% Passing Reading",
    "% Overall Passing",
]
spending_means = school_summary.groupby(school_summary_spending, observed=False)[score_columns].mean()

# Individual series kept under their original names
spending_maths_scores = spending_means["Average Maths Score"]
spending_reading_scores = spending_means["Average Reading Score"]
spending_passing_maths = spending_means["% Passing Maths"]
spending_passing_reading = spending_means["% Passing Reading"]
overall_passing_spending = spending_means["% Overall Passing"]

# Summary dataframe for display: every value rendered as a two-decimal string
spending_summary = spending_means.copy()
for column in score_columns:
    spending_summary[column] = spending_summary[column].map("{:.2f}".format)

spending_summary

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
Per Student Budget,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,71.36,70.72,88.84,86.39,76.72
$585-630,72.07,71.03,91.52,87.29,79.88
$630-645,69.85,69.84,84.69,83.76,71.0
$645-680,68.88,69.05,81.57,81.77,66.76


In [9]:
### Scores by School Size
# Create bins + labels (school size ranges, in number of students)
size_bins = [0, 1000, 2000, 5000]
labels = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

# Assign each school to one of those size ranges
school_summary_size = pd.cut(
    school_summary["Total Students"],
    size_bins,
    labels=labels,
    include_lowest=True,
)

# Mean of each score column per size range, computed in a single groupby.
# The score columns are selected BEFORE .mean() so the text "School Type" column is
# never averaged (pandas 2.0+ raises a TypeError on non-numeric columns).
# observed=False keeps every size range in the result, even one with no schools.
score_columns = [
    "Average Maths Score",
    "Average Reading Score",
    "% Passing Maths",
    "% Passing Reading",
    "% Overall Passing",
]
size_means = school_summary.groupby(school_summary_size, observed=False)[score_columns].mean()

# Individual series kept under their original names
size_maths_scores = size_means["Average Maths Score"]
size_reading_scores = size_means["Average Reading Score"]
size_passing_maths = size_means["% Passing Maths"]
size_passing_reading = size_means["% Passing Reading"]
overall_passing_size = size_means["% Overall Passing"]

# Summary dataframe for display: every value rendered as a two-decimal string
size_summary = size_means.copy()
for column in score_columns:
    size_summary[column] = size_summary[column].map("{:.2f}".format)

size_summary

Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
Total Students,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),72.34,71.64,90.81,87.56,79.07
Medium (1000-2000),71.42,70.72,89.85,86.71,78.04
Large (2000-5000),69.75,69.58,84.25,83.3,70.29


In [21]:
### Scores by School Type

# Mean of each score column per school type, computed in a single groupby.
# The score columns are selected BEFORE .mean() so that only these five numeric
# columns are averaged; calling .mean() on the whole frame depends on every
# remaining column being numeric and recomputes all of them for each lookup.
score_columns = [
    "Average Maths Score",
    "Average Reading Score",
    "% Passing Maths",
    "% Passing Reading",
    "% Overall Passing",
]
type_means = school_summary.groupby("School Type")[score_columns].mean()

# Individual series kept under their original names
type_maths_scores = type_means["Average Maths Score"]
type_reading_scores = type_means["Average Reading Score"]
type_passing_maths = type_means["% Passing Maths"]
type_passing_reading = type_means["% Passing Reading"]
overall_passing_type = type_means["% Overall Passing"]

# Summary dataframe: one row per school type, one column per score
type_summary = type_means.copy()

type_summary



Unnamed: 0_level_0,Average Maths Score,Average Reading Score,% Passing Maths,% Passing Reading,% Overall Passing
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
['Government'],69.834806,69.675929,84.462375,83.587562,70.698993
['Independent'],71.368822,70.718933,89.204043,86.247789,76.97334
