In [143]:
# import pandas from Sichuan
import pandas as pd

# Navigate to the source data
school_data_csv = "Resources/schools_complete.csv"
student_data_csv = "Resources/students_complete.csv"

# read in the CSVs as dataframes, then create 1 "master" dataframe to hold all data
school = pd.read_csv(school_data_csv,index_col = "school_name")
student = pd.read_csv(student_data_csv)
complete_data = pd.merge(student, school, how="left", on=["school_name", "school_name"]) 

In [165]:
student.head(25)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,overallPass,passMath,passRead
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,1,0
1,1,Victor Smith,M,12th,Huang High School,94,61,0,0,1
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,0,1
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,0,0
4,4,Bonnie Ray,F,9th,Huang High School,97,84,1,1,1
5,5,Bryan Miranda,M,9th,Huang High School,94,94,1,1,1
6,6,Sheena Carter,F,11th,Huang High School,82,80,1,1,1
7,7,Nicole Baker,F,12th,Huang High School,96,69,0,0,1
8,8,Michael Roth,M,10th,Huang High School,95,87,1,1,1
9,9,Matthew Greene,M,10th,Huang High School,96,84,1,1,1


### District Summary
***
 - Total schools
 - Total students
 - Total budget
 - Average math score
 - Average reading score
 - % passing math (the percentage of students who passed math)
 - % passing reading (the percentage of students who passed reading)
 - % overall passing (the percentage of students who passed math AND reading)

In [137]:
# This code will create a new dictionary that will store values from the combined complete_data dataframe
# This dictionary will form the basis for a new dataframe we'll create that will answer the "District Summary"
# bullet points from above
dict1 = {"Total Schools":"{:,}".format(len(school)),
         "Total Students":"{:,}".format(school['size'].sum()),
         "Total Budget":"${:,.2f}".format(school['budget'].sum()),
         "Average Math Score":complete_data['math_score'].mean(),
         "Average Reading Score":complete_data['reading_score'].mean(),
         "% Passing Math":(complete_data[complete_data['math_score'] >= 70]['Student ID'].count() / len(complete_data['Student ID']) * 100),
         "% Passing Reading":(complete_data[complete_data['reading_score'] >= 70]['Student ID'].count() / len(complete_data['Student ID']) * 100),
         "% Overall Passing":(complete_data[(complete_data['math_score']>=70) & (complete_data['reading_score']>=70)]['Student ID'].count() / len(complete_data['Student ID']) * 100)           
}

# Now we create a new "Summary" dataframe by reading in the dictionary we just created. Wah-lah!
Summary = pd.DataFrame(dict1,index=[0])

# Here she blows:
Summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,65.172326


### School Summary
***
 - School name
 - School type
 - Total students
 - Total school budget
 - Per student budget
 - Average math score
 - Average reading score
 - % passing math (the percentage of students who passed math)
 - % passing reading (the percentage of students who passed reading)
 - % overall passing (the percentage of students who passed math AND reading)

In [187]:
# Start this dataframe by adding 2 new columns to the Student dataframe 

student['passMath'] = [1 if x >= 70 else 0 for x in student['math_score']]
student['passRead'] = [1 if x >= 70 else 0 for x in student['reading_score']]

# The last column we'll add will be done so using a function that will add a '1' if the reading and math grades
# are both above 70
def overallPass (row):
    if row['reading_score'] >= 70 and row['math_score'] >= 70:
        return 1
    else:
        return 0
student['overallPass'] = student.apply(lambda row: overallPass(row), axis = 1)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,overallPass,passMath,passRead
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,1,0
1,1,Victor Smith,M,12th,Huang High School,94,61,0,0,1
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,0,1
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,0,0
4,4,Bonnie Ray,F,9th,Huang High School,97,84,1,1,1
...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,1,1,1
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,1,1,1
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,1,1,1
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,1,1,1


In [186]:
# Start by creating a unique list of school names that will serve as the index for the dataframe
schoolNames = complete_data['school_name'].unique().tolist()

# Create list for school types
schoolType = []
for x in schoolNames:
    schoolType.append(school.loc[x,'type'])

# Create list for school size
schoolSize = []
for x in schoolNames:
    schoolSize.append("{:,}".format(school.loc[x,'size']))

# Create list for school budget
schoolBud = []
for x in schoolNames:
    schoolBud.append("${:,.2f}".format(school.loc[x,'budget']))

# Create list for per-student budget
psBud = []
for x in schoolNames:
    psBud.append("${:,.2f}".format(school.loc[x,'budget'] / school.loc[x,'size']))

# Create list (using groupby method) for avg. math score. The first step will be to create a new dataframe that
# performs a groupby
df2 = student.groupby('school_name').mean()
amScore = []
for x in schoolNames:
    amScore.append(df2.loc[x,'math_score'])

# Create list (using groupby method) for avg. reading score
arScore = []
for x in schoolNames:
    arScore.append(df2.loc[x,'reading_score'])

# For % passing math, we'll append an empty list by summing up all the '1's, per school, for all students with passing
# grades
df3 = student.groupby('school_name').sum()
perPassMath = []
for x in schoolNames:
    perPassMath.append("{:.6%}".format(df3.loc[x,'passMath'] / school.loc[x,'size']))
    
# Percent passing reading values
perPassRead = []
for x in schoolNames:
    perPassRead.append("{:.6%}".format(df3.loc[x,'passRead'] / school.loc[x,'size']))


dict2 = {"School Type":schoolType,
         "Total Students":schoolSize,
         "Total School Budget":schoolBud,
         "Per Student Budget":psBud,
         "Average Math Score":amScore,
         "Average Reading Score":arScore,
         "% Passing Math":perPassMath,
         "% Passing Reading":85,
         "% Overall Passing":'None'
}

Summary2 = pd.DataFrame(dict2,index=[schoolNames])
Summary2.sort_index()

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.680064%,85,
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477%,85,
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471%,85,
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.309602%,85,
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371%,85,
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.752967%,85,
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,92.505855%,85,
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922%,85,
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.057551%,85,
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595%,85,
