In [1]:
# import pandas from Sichuan
import pandas as pd

# Navigate to the source data
school_data_csv = "Resources/schools_complete.csv"
student_data_csv = "Resources/students_complete.csv"

# read in the CSVs as dataframes, then create 1 "master" dataframe to hold all data
school = pd.read_csv(school_data_csv,index_col = "school_name")
student = pd.read_csv(student_data_csv)
complete_data = pd.merge(student, school, how="left", on=["school_name", "school_name"]) 

### District Summary
***
 - Total schools
 - Total students
 - Total budget
 - Average math score
 - Average reading score
 - % passing math (the percentage of students who passed math)
 - % passing reading (the percentage of students who passed reading)
 - % overall passing (the percentage of students who passed math AND reading)

In [None]:
# This code will create a new dictionary that will store values from the combined complete_data dataframe
# This dictionary will form the basis for a new dataframe we'll create that will answer the "District Summary"
# bullet points from above
dict1 = {"Total Schools":"{:,}".format(len(school)),
         "Total Students":"{:,}".format(school['size'].sum()),
         "Total Budget":"${:,.2f}".format(school['budget'].sum()),
         "Average Math Score":complete_data['math_score'].mean(),
         "Average Reading Score":complete_data['reading_score'].mean(),
         "% Passing Math":(complete_data[complete_data['math_score'] >= 70]['Student ID'].count() / len(complete_data['Student ID']) * 100),
         "% Passing Reading":(complete_data[complete_data['reading_score'] >= 70]['Student ID'].count() / len(complete_data['Student ID']) * 100),
         "% Overall Passing":(complete_data[(complete_data['math_score']>=70) & (complete_data['reading_score']>=70)]['Student ID'].count() / len(complete_data['Student ID']) * 100)           
}

# Now we create a new "Summary" dataframe by reading in the dictionary we just created. Wah-lah!
Summary = pd.DataFrame(dict1,index=[0])

# Here she blows:
Summary

### School Summary
***
 - School name
 - School type
 - Total students
 - Total school budget
 - Per student budget
 - Average math score
 - Average reading score
 - % passing math (the percentage of students who passed math)
 - % passing reading (the percentage of students who passed reading)
 - % overall passing (the percentage of students who passed math AND reading)

In [None]:
# Start this dataframe by adding 2 new columns to the Student dataframe 

student['passMath'] = [1 if x >= 70 else 0 for x in student['math_score']]
student['passRead'] = [1 if x >= 70 else 0 for x in student['reading_score']]

# The last column we'll add will be done so using a function that will add a '1' if the reading and math grades
# are both above 70
def overallPass (row):
    if row['reading_score'] >= 70 and row['math_score'] >= 70:
        return 1
    else:
        return 0
student['overallPass'] = student.apply(lambda row: overallPass(row), axis = 1)

# Create a unique list of school names that will serve as the index for the dataframe
schoolNames = complete_data['school_name'].unique().tolist()

# Create list for school types
schoolType = []
for x in schoolNames:
    schoolType.append(school.loc[x,'type'])

# Create list for school size
schoolSize = []
for x in schoolNames:
    schoolSize.append("{:,}".format(school.loc[x,'size']))

# Create list for school budget
schoolBud = []
for x in schoolNames:
    schoolBud.append("${:,.2f}".format(school.loc[x,'budget']))

# Create list for per-student budget
psBud = []
for x in schoolNames:
    psBud.append("${:,.2f}".format(school.loc[x,'budget'] / school.loc[x,'size']))

# Create list (using groupby method) for avg. math score. The first step will be to create a new dataframe that
# performs a groupby
df2 = student.groupby('school_name').mean()
amScore = []
for x in schoolNames:
    amScore.append(df2.loc[x,'math_score'])

# Create list (using groupby method) for avg. reading score
arScore = []
for x in schoolNames:
    arScore.append(df2.loc[x,'reading_score'])

# For % passing math, we'll append an empty list by summing up all the '1's, per school, for all students with passing
# grades
df3 = student.groupby('school_name').sum()
perPassMath = []
for x in schoolNames:
    perPassMath.append("{:.6%}".format(df3.loc[x,'passMath'] / school.loc[x,'size']))
    
# Percent passing reading values
perPassRead = []
for x in schoolNames:
    perPassRead.append("{:.6%}".format(df3.loc[x,'passRead'] / school.loc[x,'size']))
    
# Percent overall passing
perPassOv = []
for x in schoolNames:
    perPassOv.append("{:.6%}".format(df3.loc[x,'overallPass'] / school.loc[x,'size']))

dict2 = {"School Type":schoolType,
         "Total Students":schoolSize,
         "Total School Budget":schoolBud,
         "Per Student Budget":psBud,
         "Average Math Score":amScore,
         "Average Reading Score":arScore,
         "% Passing Math":perPassMath,
         "% Passing Reading":perPassRead,
         "% Overall Passing":perPassOv
}

Summary2 = pd.DataFrame(dict2,index=[schoolNames])
Summary2.sort_index()

### Highest-Performing Schools (by % Overall Passing)
***
 - Sort and display the top five performing schools by % overall passing

In [None]:
# To get a dataframe of the top 5 performing schools by % overall passing, we sort in descending order the previous
# dataframe we made by the '% Overall Passing' column.
Summary3 = Summary2.sort_values(by=['% Overall Passing'],ascending=False).head(5)
Summary3

### Bottom Performing Schools (by % Overall Passing)
***
 - Sort and display the top five performing schools by % overall passing

In [None]:
# To get a dataframe of the bottom 5 performing schools by % overall passing, we sort in ascending order a previous
# dataframe we made by the '% Overall Passing' column.
Summary4 = Summary2.sort_values(by=['% Overall Passing']).head(5)
Summary4

### Math Scores By Grade
***
Create a DataFrame that lists the average math score for students of each grade level (9th, 10th, 11th, 12th) at each school.

In [7]:
df2 = pd.DataFrame()
df2['9th'] = student.loc[(student.grade == '9th'),['math_score','school_name']].groupby('school_name').mean()
df2['10th'] = student.loc[(student.grade == '10th'),['math_score','school_name']].groupby('school_name').mean()
df2['11th'] = student.loc[(student.grade == '11h'),['math_score','school_name']].groupby('school_name').mean()
df2['12th'] = student.loc[(student.grade == '12th'),['math_score','school_name']].groupby('school_name').mean()
df2.index.name = None
df2

Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.083676,76.996772,,76.492218
Cabrera High School,83.094697,83.154506,,83.277487
Figueroa High School,76.403037,76.539974,,77.151369
Ford High School,77.361345,77.672316,,76.179963
Griffin High School,82.04401,84.229064,,83.356164
Hernandez High School,77.438495,77.337408,,77.186567
Holden High School,83.787402,83.429825,,82.855422
Huang High School,77.027251,75.908735,,77.225641
Johnson High School,77.187857,76.691117,,76.863248
Pena High School,83.625455,83.372,,84.121547
