# PyCity Schools Analysis
### Import, Merge & Explore Data

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# Read School and Student Data File and store into Pandas Data Frames
school_data = pd.read_csv("Resources/schools_complete.csv")
student_data = pd.read_csv("Resources/students_complete.csv")

In [2]:
#Explore School Data to see what the fields are and their data types
school_data.info()
school_data.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
School ID      15 non-null int64
school_name    15 non-null object
type           15 non-null object
size           15 non-null int64
budget         15 non-null int64
dtypes: int64(3), object(2)
memory usage: 680.0+ bytes


Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411


In [3]:
#Explore Student Data to see what the fields are 
student_data.info()
student_data.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39170 entries, 0 to 39169
Data columns (total 7 columns):
Student ID       39170 non-null int64
student_name     39170 non-null object
gender           39170 non-null object
grade            39170 non-null object
school_name      39170 non-null object
reading_score    39170 non-null int64
math_score       39170 non-null int64
dtypes: int64(3), object(4)
memory usage: 2.1+ MB


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61


In [5]:
# We see that school name is the common factor so we will merge on that column

combo_data = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
combo_data.head(2)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635


### Task 1: District Summary

* Create a high level snapshot (in table form) of the district's key metrics, including:
  * Total Schools
  * Total Students
  * Total Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [7]:
#Use school_data to gather information about total budget and total schools 

total_budget = school_data["budget"].sum()
total_schools = school_data["school_name"].count()
total_students = school_data["size"].sum()

#Use student_data gather information about student count, average math score, average reading score
avgread = student_data["reading_score"].mean()
avgmath = student_data["math_score"].mean()

# Passing grades are considered to be 70 and above 
passing_read = student_data.loc[student_data["reading_score"] >= 70]
passing_read.sort_values("reading_score") # checking to make sure the lowest number in reading score is 70
pct_read = round((passing_read["reading_score"].count() / total_students)*100, 6)

passing_math = student_data.loc[student_data["math_score"] >= 70]
pct_math = round((passing_math["math_score"].count() / total_students)*100, 6)

#overall passing rate is avg of average math and average read vs. avg of the passing scores 
pct_ov = ((avgmath + avgread)/2) 


# We'll create a new Pandas DataFrame using dictionaries 

d = {"Total Schools": [total_schools], "Total Students": "{:,}".format(total_students)
, "Total Budget": "${:,.2f}".format(total_budget), "Average Math Score" : [avgmath],
                                "Average Reading Score" : [avgread], "% Passing Math" :[pct_math],
                                "% Passing Reading": [pct_read],"Overall Passing Rate": [pct_ov]}

district_summary =pd.DataFrame(data =d)

district_summary


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Overall Passing Rate
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,80.431606


### Task 2: School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [21]:
combo_ss = combo_data
combo_ss = combo_ss.loc[:, ["school_name", "type", "size", "budget", 
                              "reading_score", "math_score"]]

combo_ss['Passed Math'] = [ 1 if x >= 70 else 0 for x in combo_ss['math_score']]
combo_ss['Passed Read'] = [ 1 if x >= 70 else 0 for x in combo_ss['reading_score']]

combo_ss = combo_ss.rename(columns={"type":"School Type", "size": "Total Students",
                                   "budget": "Total School Budget", 
                                    "reading_score":"Average Reading Score",
                                   "math_score":"Average Math Score"})
ss_groupby = combo_ss.groupby("school_name").agg({"Total Students":"mean", "Total School Budget":"mean",
                                                    "Average Reading Score" : "mean",
                                                   "Average Math Score": "mean", 
                                                    "Passed Math" : "sum",
                                                   "Passed Read" : "sum"})

ss_groupby['% Passing Math'] = ss_groupby['Passed Math'] / ss_groupby['Total Students']
ss_groupby['% Passing Reading'] = ss_groupby['Passed Read'] / ss_groupby['Total Students']
ss_groupby['Overall Passing Rate'] = (ss_groupby['% Passing Math'] + ss_groupby['% Passing Reading'])/ 2
total_students_ss = ss_groupby['Total Students']

ss_groupby


Unnamed: 0_level_0,Total Students,Total School Budget,Average Reading Score,Average Math Score,Passed Math,Passed Read,% Passing Math,% Passing Reading,Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,4976,3124928,81.033963,77.048432,3318,4077,0.666801,0.819333,0.743067
Cabrera High School,1858,1081356,83.97578,83.061895,1749,1803,0.941335,0.970398,0.955867
Figueroa High School,2949,1884411,81.15802,76.711767,1946,2381,0.659885,0.807392,0.733639
Ford High School,2739,1763916,80.746258,77.102592,1871,2172,0.683096,0.79299,0.738043
Griffin High School,1468,917500,83.816757,83.351499,1371,1426,0.933924,0.97139,0.952657
Hernandez High School,4635,3022020,80.934412,77.289752,3094,3748,0.66753,0.80863,0.73808
Holden High School,427,248087,83.814988,83.803279,395,411,0.925059,0.962529,0.943794
Huang High School,2917,1910635,81.182722,76.629414,1916,2372,0.656839,0.813164,0.735002
Johnson High School,4761,3094650,80.966394,77.072464,3145,3867,0.660576,0.812224,0.7364
Pena High School,962,585858,84.044699,83.839917,910,923,0.945946,0.959459,0.952703


### Task 3: Top Performing Schools (By Passing Rate)

* Create a table that highlights the top 5 performing schools based on Overall Passing Rate. Include:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

In [22]:
top_school = ss_groupby.sort_values(["Overall Passing Rate"], ascending = False).head(5)
top_school

Unnamed: 0_level_0,Total Students,Total School Budget,Average Reading Score,Average Math Score,Passed Math,Passed Read,% Passing Math,% Passing Reading,Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,1858,1081356,83.97578,83.061895,1749,1803,0.941335,0.970398,0.955867
Thomas High School,1635,1043130,83.84893,83.418349,1525,1591,0.932722,0.973089,0.952905
Pena High School,962,585858,84.044699,83.839917,910,923,0.945946,0.959459,0.952703
Griffin High School,1468,917500,83.816757,83.351499,1371,1426,0.933924,0.97139,0.952657
Wilson High School,2283,1319574,83.989488,83.274201,2143,2204,0.938677,0.965396,0.952037


### Task 4: Bottom Performing Schools (By Passing Rate)

* Create a table that highlights the bottom 5 performing schools based on Overall Passing Rate. Include all of the same metrics as above.

In [23]:
bottom_school = ss_groupby.sort_values(["Overall Passing Rate"], ascending = True).head(5)
bottom_school

Unnamed: 0_level_0,Total Students,Total School Budget,Average Reading Score,Average Math Score,Passed Math,Passed Read,% Passing Math,% Passing Reading,Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,3999,2547363,80.744686,76.842711,2654,3208,0.663666,0.802201,0.732933
Figueroa High School,2949,1884411,81.15802,76.711767,1946,2381,0.659885,0.807392,0.733639
Huang High School,2917,1910635,81.182722,76.629414,1916,2372,0.656839,0.813164,0.735002
Johnson High School,4761,3094650,80.966394,77.072464,3145,3867,0.660576,0.812224,0.7364
Ford High School,2739,1763916,80.746258,77.102592,1871,2172,0.683096,0.79299,0.738043


### Task 5: Math Scores by Grade\*\*

* Create a table that lists the average Math Score for students of each grade level (9th, 10th, 11th, 12th) at each school.