### Note
* Instructions have been included for each segment. You do not have to follow them exactly, but they are included to help you think through the steps.

In [1]:
# Dependencies and Setup
import pandas as pd

#check Pandas version 
#print(pd.__version__)

# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read the school and student CSV files into pandas DataFrames.
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset: a left join keeps every student row
# and attaches the columns of the school whose name matches.
# The join key is given once; listing "school_name" twice added nothing.
school_data_complete_df = pd.merge(student_data, school_data, how="left", on="school_name")

In [2]:
# Drop the unnecessary columns (positions 1-3 of the merged frame), then rename
# the two ID columns to snake_case so they match the other column names.
school_cleaned_df = (
    school_data_complete_df
    .drop(labels=school_data_complete_df.columns[1:4], axis="columns")
    .rename(columns={"School ID": "school_id", "Student ID": "student_id"})
)
school_cleaned_df.head()

Unnamed: 0,student_id,school_name,reading_score,math_score,school_id,type,size,budget
0,0,Huang High School,66,79,0,District,2917,1910635
1,1,Huang High School,94,61,0,District,2917,1910635
2,2,Huang High School,90,60,0,District,2917,1910635
3,3,Huang High School,67,58,0,District,2917,1910635
4,4,Huang High School,97,84,0,District,2917,1910635


In [3]:
# Inspect the column dtypes of the cleaned frame.
school_cleaned_df.dtypes

student_id        int64
school_name      object
reading_score     int64
math_score        int64
school_id         int64
type             object
size              int64
budget            int64
dtype: object

## District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

In [4]:
# Tally the student rows per school name. The result has one entry per distinct
# school, so its length is the number of schools.
num_schools = school_cleaned_df.school_name.value_counts()
print(num_schools)

Bailey High School       4976
Johnson High School      4761
Hernandez High School    4635
Rodriguez High School    3999
Figueroa High School     2949
Huang High School        2917
Ford High School         2739
Wilson High School       2283
Cabrera High School      1858
Wright High School       1800
Shelton High School      1761
Thomas High School       1635
Griffin High School      1468
Pena High School          962
Holden High School        427
Name: school_name, dtype: int64


In [5]:
# Total number of schools: the count of non-null entries in the per-school tally.
total_schools = num_schools.count()
print(total_schools)

15


In [6]:
# Total number of students: the cleaned frame has one row per student.
dist_students = school_cleaned_df.shape[0]
print(dist_students)

39170


In [7]:
# Build lookup dictionaries keyed on school id. The frame has one row per
# student, so an id repeats many times; each repeat overwrites the entry for
# that id with the value from the later row.
budgets_dict = dict(zip(school_cleaned_df["school_id"], school_cleaned_df["budget"]))
names_dict = dict(zip(school_cleaned_df["school_id"], school_cleaned_df["school_name"]))
types_dict = dict(zip(school_cleaned_df["school_id"], school_cleaned_df["type"]))
print(budgets_dict, names_dict, types_dict, sep="\n")

{0: 1910635, 1: 1884411, 2: 1056600, 3: 3022020, 4: 917500, 5: 1319574, 6: 1081356, 7: 3124928, 8: 248087, 9: 585858, 10: 1049400, 11: 2547363, 12: 3094650, 13: 1763916, 14: 1043130}
{0: 'Huang High School', 1: 'Figueroa High School', 2: 'Shelton High School', 3: 'Hernandez High School', 4: 'Griffin High School', 5: 'Wilson High School', 6: 'Cabrera High School', 7: 'Bailey High School', 8: 'Holden High School', 9: 'Pena High School', 10: 'Wright High School', 11: 'Rodriguez High School', 12: 'Johnson High School', 13: 'Ford High School', 14: 'Thomas High School'}
{0: 'District', 1: 'District', 2: 'Charter', 3: 'District', 4: 'Charter', 5: 'Charter', 6: 'Charter', 7: 'District', 8: 'Charter', 9: 'Charter', 10: 'Charter', 11: 'District', 12: 'District', 13: 'District', 14: 'Charter'}


In [8]:
# Total district budget: add up the single budget figure stored per school id.
budget_total = sum(school_budget for school_budget in budgets_dict.values())
print(budget_total)

24649428


In [9]:
# Sum of every student's math score across the district.
dist_math = school_cleaned_df["math_score"].sum()
print(dist_math)

3093857


In [10]:
# District average math score: total of all math scores divided by the number of students.
dist_math_avg = dist_math / dist_students
print(dist_math_avg)

78.98537145774827


In [11]:
# Sum of every student's reading score across the district.
dist_reading = school_cleaned_df["reading_score"].sum()
print(dist_reading)

3207155


In [12]:
# District average reading score: total of all reading scores divided by the number of students.
dist_reading_avg = dist_reading / dist_students
print(dist_reading_avg)

81.87784018381414


In [13]:
# Keep only the students with a passing math score (70 or greater).
dist_math_passing_df = school_cleaned_df[school_cleaned_df["math_score"].ge(70)]

# Number of students who passed math (non-null student ids in the filtered frame).
dist_num_math = dist_math_passing_df["student_id"].count()
print(dist_num_math)

29370


In [51]:
# Map each school id to the list of ids of its students who passed math.
# https://stackoverflow.com/questions/29876184/groupby-results-to-dictionary-of-lists
school_math_passing_dict = {
    school_id: list(student_ids)
    for school_id, student_ids in dist_math_passing_df.groupby("school_id")["student_id"]
}

In [24]:
# Keep only the students with a passing reading score (70 or greater).
dist_reading_passing_df = school_cleaned_df[school_cleaned_df["reading_score"].ge(70)]

# Number of students who passed reading (non-null student ids in the filtered frame).
dist_num_reading = dist_reading_passing_df["student_id"].count()
print(dist_num_reading)

33610


In [53]:
# Map each school id to the list of ids of its students who passed reading.
school_reading_passing_dict = {
    school_id: list(student_ids)
    for school_id, student_ids in dist_reading_passing_df.groupby("school_id")["student_id"]
}

In [26]:
# District-wide pass rates, expressed as a percentage of all students.
dist_math_percent = dist_num_math / dist_students * 100
dist_reading_percent = dist_num_reading / dist_students * 100
print(dist_math_percent, dist_reading_percent, sep="\n")

74.9808526933878
85.80546336482001


In [27]:
# Students passing both subjects: inner-join the two passing frames on student id
# so only ids present in both survive. The frames share all other column names,
# so the result carries each of them twice, suffixed _x (math frame) and _y (reading frame).
# TODO: this feels inelegant; come back to this section.
dist_passing_merged_df = dist_math_passing_df.merge(
    dist_reading_passing_df,
    how="inner",
    on="student_id",
)
dist_passing_merged_df.head()

Unnamed: 0,student_id,school_name_x,reading_score_x,math_score_x,school_id_x,type_x,size_x,budget_x,school_name_y,reading_score_y,math_score_y,school_id_y,type_y,size_y,budget_y
0,4,Huang High School,97,84,0,District,2917,1910635,Huang High School,97,84,0,District,2917,1910635
1,5,Huang High School,94,94,0,District,2917,1910635,Huang High School,94,94,0,District,2917,1910635
2,6,Huang High School,82,80,0,District,2917,1910635,Huang High School,82,80,0,District,2917,1910635
3,8,Huang High School,95,87,0,District,2917,1910635,Huang High School,95,87,0,District,2917,1910635
4,9,Huang High School,96,84,0,District,2917,1910635,Huang High School,96,84,0,District,2917,1910635


In [32]:
# Number of students who passed both math and reading (rows of the inner-joined frame
# with a non-null student id).
dist_num_both = dist_passing_merged_df["student_id"].count()
print(dist_num_both)

25528


In [55]:
# Map each school id to the list of ids of its students who passed both subjects.
# The merged frame has school_id_x and school_id_y; the _x copy is used as the group key.
school_passing_both_dict = {
    school_id: list(student_ids)
    for school_id, student_ids in dist_passing_merged_df.groupby("school_id_x")["student_id"]
}

In [37]:
# Share of all students who passed both math and reading, as a percentage.
dist_both_percent = dist_num_both / dist_students * 100
print(dist_both_percent)

65.17232575950983


In [38]:
# Build the one-row district summary frame. Only the first value is wrapped in a
# list; that is enough to give the frame its single row.
district_summary_df = pd.DataFrame(
    {
        "Total Schools": [total_schools],
        "Total Students": dist_students,
        "Total Budget": budget_total,
        "Average Math Score": dist_math_avg,
        "Average Reading Score": dist_reading_avg,
        "% Passing Math": dist_math_percent,
        "% Passing Reading": dist_reading_percent,
        "% Overall Passing": dist_both_percent,
    }
)

# Display format for each column, keyed by column name.
# https://stackoverflow.com/questions/32744997/python-pandas-apply-formatting-to-each-column-in-dataframe-using-a-dict-mapping
format_mapping = {
    "Total Students": "{:,}",
    "Total Budget": "${:,.2f}",
    "Average Math Score": "{:.6f}",
    "Average Reading Score": "{:.6f}",
    "% Passing Math": "{:.6f}%",
    "% Passing Reading": "{:.6f}%",
    "% Overall Passing": "{:.6f}%",
}

# Apply each format to its column. The formatted columns hold strings afterwards.
for column_name, template in format_mapping.items():
    district_summary_df[column_name] = district_summary_df[column_name].apply(template.format)
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853%,85.805463%,65.172326%


## School Summary

* Create an overview table that summarizes key metrics about each school, including:
  * School Name
  * School Type
  * Total Students
  * Total School Budget
  * Per Student Budget
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * % Overall Passing (The percentage of students that passed math **and** reading.)
  
* Create a dataframe to hold the above results

In [39]:
# Group the student rows by school id, then count rows per school to get
# each school's enrolment (non-null "size" values per group).
school_data_grouped = school_cleaned_df.groupby(school_cleaned_df["school_id"])
school_students = school_data_grouped["size"].count()
school_students_list = school_students.tolist()
print(school_students_list)

[2917, 2949, 1761, 4635, 1468, 2283, 1858, 4976, 427, 962, 1800, 3999, 4761, 2739, 1635]


In [40]:
# Retrieve school names in ascending school id order.
# The per-school aggregates come from a groupby on school_id, which returns its
# groups in sorted key order. names_dict is in insertion order (first appearance
# of each id in the data), so read it by sorted key to keep the two aligned.
school_name_list = [names_dict[school_id] for school_id in sorted(names_dict)]
print(school_name_list)

['Huang High School', 'Figueroa High School', 'Shelton High School', 'Hernandez High School', 'Griffin High School', 'Wilson High School', 'Cabrera High School', 'Bailey High School', 'Holden High School', 'Pena High School', 'Wright High School', 'Rodriguez High School', 'Johnson High School', 'Ford High School', 'Thomas High School']


In [41]:
# Retrieve school types in ascending school id order.
# The per-school aggregates come from a groupby on school_id, which returns its
# groups in sorted key order. types_dict is in insertion order (first appearance
# of each id in the data), so read it by sorted key to keep the two aligned.
school_type_list = [types_dict[school_id] for school_id in sorted(types_dict)]
print(school_type_list)

['District', 'District', 'Charter', 'District', 'Charter', 'Charter', 'Charter', 'District', 'Charter', 'Charter', 'Charter', 'District', 'District', 'District', 'Charter']


In [42]:
# Retrieve individual school budgets in ascending school id order.
# The per-school enrolment counts come from a groupby on school_id, which returns
# its groups in sorted key order. budgets_dict is in insertion order (first
# appearance of each id in the data), so read it by sorted key to keep the two aligned.
school_budget_list = [budgets_dict[school_id] for school_id in sorted(budgets_dict)]
print(school_budget_list)

[1910635, 1884411, 1056600, 3022020, 917500, 1319574, 1081356, 3124928, 248087, 585858, 1049400, 2547363, 3094650, 1763916, 1043130]


In [43]:
# Per-student budget for each school: pair each budget with the enrolment at the
# same list position and divide.
per_student_budget = [
    budget / enrolment
    for budget, enrolment in zip(school_budget_list, school_students_list)
]
print(per_student_budget)

[655.0, 639.0, 600.0, 652.0, 625.0, 578.0, 582.0, 628.0, 581.0, 609.0, 583.0, 637.0, 650.0, 644.0, 638.0]


In [47]:
# Total math score per school (sum within each school id group).
school_math = school_data_grouped["math_score"].sum()

In [45]:
# Average math score per school: per-school total divided by per-school enrolment.
# Both Series are indexed by school id, so the division lines up by id.
school_math_avg = list(school_math.div(school_students))
print(school_math_avg)

[76.62941378128214, 76.71176670057646, 83.3594548551959, 77.28975188781014, 83.35149863760218, 83.2742006132282, 83.06189451022605, 77.04843247588424, 83.80327868852459, 83.83991683991684, 83.68222222222222, 76.84271067766942, 77.07246376811594, 77.10259218692954, 83.4183486238532]


In [48]:
# Total reading score per school (sum within each school id group).
school_reading = school_data_grouped.reading_score.sum()

In [49]:
# Average reading score per school: per-school total divided by per-school enrolment.
# Both Series are indexed by school id, so the division lines up by id.
school_reading_avg = list(school_reading.div(school_students))
print(school_reading_avg)

[81.18272197463148, 81.15801966768396, 83.72572402044293, 80.9344120819849, 83.816757493188, 83.98948751642575, 83.97578040904197, 81.03396302250803, 83.81498829039812, 84.04469854469855, 83.955, 80.74468617154288, 80.96639361478681, 80.74625775830594, 83.84892966360856]


In [75]:
# Number of students passing math per school: the length of each school's id list.
school_math_passing = {
    school_id: len(student_ids)
    for school_id, student_ids in school_math_passing_dict.items()
}
print(school_math_passing)

{0: 1916, 1: 1946, 2: 1653, 3: 3094, 4: 1371, 5: 2143, 6: 1749, 7: 3318, 8: 395, 9: 910, 10: 1680, 11: 2654, 12: 3145, 13: 1871, 14: 1525}


In [76]:
# Number of students passing reading per school: the length of each school's id list.
school_reading_passing = {
    school_id: len(student_ids)
    for school_id, student_ids in school_reading_passing_dict.items()
}
print(school_reading_passing)

{0: 2372, 1: 2381, 2: 1688, 3: 3748, 4: 1426, 5: 2204, 6: 1803, 7: 4077, 8: 411, 9: 923, 10: 1739, 11: 3208, 12: 3867, 13: 2172, 14: 1591}


In [77]:
# Number of students passing both subjects per school: the length of each school's id list.
school_passing_both = {
    school_id: len(student_ids)
    for school_id, student_ids in school_passing_both_dict.items()
}
print(school_passing_both)

{0: 1561, 1: 1569, 2: 1583, 3: 2481, 4: 1330, 5: 2068, 6: 1697, 7: 2719, 8: 381, 9: 871, 10: 1626, 11: 2119, 12: 2549, 13: 1487, 14: 1487}


In [None]:
def percent_passing_by_school(passing_counts):
    """Return each school's passing percentage, ordered like school_students.

    passing_counts maps school id -> number of passing students. A school that
    is missing from the mapping had no passing students and is reported as 0.
    The percentage is taken against the school's enrolment in school_students.
    """
    return [
        (passing_counts.get(school_id, 0) / school_students.loc[school_id]) * 100
        for school_id in school_students.index
    ]


# Assemble the per-school overview. Every column is a list with one entry per
# school; school_students_list and the averages follow the order of
# school_students' index, and the percentage columns are built in that same order.
school_summary_df = pd.DataFrame({"School Name" : school_name_list,
                                  "School Type" : school_type_list,
                                  "Total Students" : school_students_list,
                                  "Total School Budget" : school_budget_list,
                                  "Per Student Budget" : per_student_budget,
                                  "Average Math Score" : school_math_avg,
                                  "Average Reading Score" : school_reading_avg,
                                  "% Passing Math" : percent_passing_by_school(school_math_passing),
                                  "% Passing Reading" : percent_passing_by_school(school_reading_passing),
                                  "% Overall Passing" : percent_passing_by_school(school_passing_both),
                                  })
school_summary_df.head()

## Top Performing Schools (By % Overall Passing)

* Sort and display the top five performing schools by % overall passing.

## Bottom Performing Schools (By % Overall Passing)

* Sort and display the five worst-performing schools by % overall passing.

## Math Scores by Grade

* Create a table that lists the average Math Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

  * Create a pandas series for each grade. Hint: use a conditional statement.
  
  * Group each series by school
  
  * Combine the series into a dataframe
  
  * Optional: give the displayed data cleaner formatting

## Reading Score by Grade 

* Perform the same operations as above for reading scores

## Scores by School Spending

* Create a table that breaks down school performances based on average Spending Ranges (Per Student). Use 4 reasonable bins to group school spending. Include in the table each of the following:
  * Average Math Score
  * Average Reading Score
  * % Passing Math
  * % Passing Reading
  * Overall Passing Rate (Average of the above two)

## Scores by School Size

* Perform the same operations as above, based on school size.

## Scores by School Type

* Perform the same operations as above, based on school type