In [1]:
# Import dependencies 
import pandas as pd

In [2]:
# Set up and read schools files
school_path = "Resources/schools_complete.csv"
school_df = pd.read_csv(school_path)

student_path = "Resources/students_complete.csv"
student_df = pd.read_csv(student_path)

In [3]:
# Combine the data into a single dataset.  
school_data_complete = pd.merge(student_df, school_df, how="left", on=["school_name", "school_name"])
school_data_complete

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [4]:
# Make a district summary table
# Total schools
a = len(school_data_complete["school_name"].unique())
a

15

In [5]:
# Total students
stud = len(school_data_complete["student_name"])
b = ("{:,}".format(stud))
b
# Code below shows the repeat names but they are different people with the same name (organized in the student_name column)
# pd.concat(g for _, g in school_data_complete.groupby("student_name") if len(g) > 1)

'39,170'

In [6]:
# Total budget
bud = school_df["budget"].sum()
c = ("${:,.2f}".format(bud))
c
#I could also groupby school name to then get rid of repeat budgets but then that woud throw off the student names column

'$24,649,428.00'

In [7]:
# Average math score
avg_math = school_data_complete["math_score"].mean()
d = round(avg_math, 2)
d

78.99

In [8]:
# Average reading score
avg_read = school_data_complete["reading_score"].mean()
e = round(avg_read,  2)
e

81.88

In [9]:
# % passing math
math_percent = (len(school_data_complete[school_data_complete.math_score >= 70])/stud)*100
f = round(math_percent, 2)
f

74.98

In [10]:
# % passing reading
read_percent = (len(school_data_complete[school_data_complete.reading_score >= 70])/stud)*100
g = round(read_percent, 2)
g

85.81

In [11]:
# % overall passing
overall_pass = school_data_complete[(school_data_complete['math_score']>= 70) & (school_data_complete['reading_score'] >= 70)]
h = round((len(overall_pass)/stud)*100, 2)
h

65.17

In [12]:
# District summary table
district_summary_df = pd.DataFrame({"Total Schools": [a],
                           "Total Students": [b],
                           "Total Budget": [c],
                           "Average Math Score": [d],
                           "Average Reading Score": [e],
                           "% Passing Math": [f],
                            "% Passing Reading": [g],
                            "% Overall Passing": [h]})

district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.99,81.88,74.98,85.81,65.17


In [13]:
# Make a school summary table
#list of the school names
name = school_data_complete["school_name"].unique()
name

array(['Huang High School', 'Figueroa High School', 'Shelton High School',
       'Hernandez High School', 'Griffin High School',
       'Wilson High School', 'Cabrera High School', 'Bailey High School',
       'Holden High School', 'Pena High School', 'Wright High School',
       'Rodriguez High School', 'Johnson High School', 'Ford High School',
       'Thomas High School'], dtype=object)

In [14]:
#name = school_data_complete["school_name"].unique()
#name
name_type= school_data_complete[["school_name", "type"]].drop_duplicates(subset=['school_name', 'type'])
name = name_type["school_name"].tolist()
types = name_type["type"].tolist()

In [15]:
#Total number of students per school
students = school_data_complete["school_name"].value_counts()
#students.tolist()
students

Bailey High School       4976
Johnson High School      4761
Hernandez High School    4635
Rodriguez High School    3999
Figueroa High School     2949
Huang High School        2917
Ford High School         2739
Wilson High School       2283
Cabrera High School      1858
Wright High School       1800
Shelton High School      1761
Thomas High School       1635
Griffin High School      1468
Pena High School          962
Holden High School        427
Name: school_name, dtype: int64

In [16]:
#beginning summary dataframe
school_summary_df = pd.DataFrame({"Type": (types), #already a list so you use parentheses instead for 'name' 
                                 "Student Pop.": (students)}) 
school_sum_df = school_summary_df.sort_index()


In [17]:
school_sum_df.index.name = "School Name"
school_sum_df

Unnamed: 0_level_0,Type,Student Pop.
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bailey High School,District,4976
Cabrera High School,Charter,1858
Figueroa High School,Charter,2949
Ford High School,Charter,2739
Griffin High School,District,1468
Hernandez High School,Charter,4635
Holden High School,Charter,427
Huang High School,Charter,2917
Johnson High School,District,4761
Pena High School,District,962


In [18]:
#adding in budget per school
budget = school_data_complete["budget"].unique()
school_sum_df["Total Budget"] = budget
school_sum_df

Unnamed: 0_level_0,Type,Student Pop.,Total Budget
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,District,4976,1910635
Cabrera High School,Charter,1858,1884411
Figueroa High School,Charter,2949,1056600
Ford High School,Charter,2739,3022020
Griffin High School,District,1468,917500
Hernandez High School,Charter,4635,1319574
Holden High School,Charter,427,1081356
Huang High School,Charter,2917,3124928
Johnson High School,District,4761,248087
Pena High School,District,962,585858


In [19]:
#add budget per student to summary table
bud_per_studs = round(school_sum_df['Total Budget']/ school_sum_df['Student Pop.'], 2)
school_sum_df["Budget per Student"] = bud_per_studs
school_sum_df

Unnamed: 0_level_0,Type,Student Pop.,Total Budget,Budget per Student
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bailey High School,District,4976,1910635,383.97
Cabrera High School,Charter,1858,1884411,1014.21
Figueroa High School,Charter,2949,1056600,358.29
Ford High School,Charter,2739,3022020,1103.33
Griffin High School,District,1468,917500,625.0
Hernandez High School,Charter,4635,1319574,284.7
Holden High School,Charter,427,1081356,2532.45
Huang High School,Charter,2917,3124928,1071.28
Johnson High School,District,4761,248087,52.11
Pena High School,District,962,585858,609.0


In [20]:
#add average math score per school
avg_math_student = school_data_complete.groupby(['school_name'])['math_score'].agg(lambda x: x.mean())
avg_math_student
school_sum_df["Average Math Score"] = round(avg_math_student, 2)
school_sum_df

Unnamed: 0_level_0,Type,Student Pop.,Total Budget,Budget per Student,Average Math Score
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bailey High School,District,4976,1910635,383.97,77.05
Cabrera High School,Charter,1858,1884411,1014.21,83.06
Figueroa High School,Charter,2949,1056600,358.29,76.71
Ford High School,Charter,2739,3022020,1103.33,77.1
Griffin High School,District,1468,917500,625.0,83.35
Hernandez High School,Charter,4635,1319574,284.7,77.29
Holden High School,Charter,427,1081356,2532.45,83.8
Huang High School,Charter,2917,3124928,1071.28,76.63
Johnson High School,District,4761,248087,52.11,77.07
Pena High School,District,962,585858,609.0,83.84


In [21]:
#add average reading score per school
avg_read_student = school_data_complete.groupby(['school_name'])['reading_score'].agg(lambda x: x.mean())
school_sum_df["Average Reading Score"] = round(avg_read_student, 2)
school_sum_df

Unnamed: 0_level_0,Type,Student Pop.,Total Budget,Budget per Student,Average Math Score,Average Reading Score
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bailey High School,District,4976,1910635,383.97,77.05,81.03
Cabrera High School,Charter,1858,1884411,1014.21,83.06,83.98
Figueroa High School,Charter,2949,1056600,358.29,76.71,81.16
Ford High School,Charter,2739,3022020,1103.33,77.1,80.75
Griffin High School,District,1468,917500,625.0,83.35,83.82
Hernandez High School,Charter,4635,1319574,284.7,77.29,80.93
Holden High School,Charter,427,1081356,2532.45,83.8,83.81
Huang High School,Charter,2917,3124928,1071.28,76.63,81.18
Johnson High School,District,4761,248087,52.11,77.07,80.97
Pena High School,District,962,585858,609.0,83.84,84.04


In [22]:
# add % passing math per school
cond3 = school_data_complete[school_data_complete.math_score >= 70]
#cond3
percent_math = cond3['school_name'].value_counts()
#percent_math
school_sum_df["% Passing Math"] = round((percent_math/school_sum_df["Student Pop."])*100, 2)
school_sum_df

Unnamed: 0_level_0,Type,Student Pop.,Total Budget,Budget per Student,Average Math Score,Average Reading Score,% Passing Math
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Bailey High School,District,4976,1910635,383.97,77.05,81.03,66.68
Cabrera High School,Charter,1858,1884411,1014.21,83.06,83.98,94.13
Figueroa High School,Charter,2949,1056600,358.29,76.71,81.16,65.99
Ford High School,Charter,2739,3022020,1103.33,77.1,80.75,68.31
Griffin High School,District,1468,917500,625.0,83.35,83.82,93.39
Hernandez High School,Charter,4635,1319574,284.7,77.29,80.93,66.75
Holden High School,Charter,427,1081356,2532.45,83.8,83.81,92.51
Huang High School,Charter,2917,3124928,1071.28,76.63,81.18,65.68
Johnson High School,District,4761,248087,52.11,77.07,80.97,66.06
Pena High School,District,962,585858,609.0,83.84,84.04,94.59


In [23]:
# add % passing reading per school
cond4 = school_data_complete[school_data_complete.reading_score >= 70]
#cond4
percent_read = cond4['school_name'].value_counts()
#percent_read
school_sum_df["% Passing Reading"] = round((percent_read/school_sum_df["Student Pop."])*100, 2)
school_sum_df

Unnamed: 0_level_0,Type,Student Pop.,Total Budget,Budget per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Bailey High School,District,4976,1910635,383.97,77.05,81.03,66.68,81.93
Cabrera High School,Charter,1858,1884411,1014.21,83.06,83.98,94.13,97.04
Figueroa High School,Charter,2949,1056600,358.29,76.71,81.16,65.99,80.74
Ford High School,Charter,2739,3022020,1103.33,77.1,80.75,68.31,79.3
Griffin High School,District,1468,917500,625.0,83.35,83.82,93.39,97.14
Hernandez High School,Charter,4635,1319574,284.7,77.29,80.93,66.75,80.86
Holden High School,Charter,427,1081356,2532.45,83.8,83.81,92.51,96.25
Huang High School,Charter,2917,3124928,1071.28,76.63,81.18,65.68,81.32
Johnson High School,District,4761,248087,52.11,77.07,80.97,66.06,81.22
Pena High School,District,962,585858,609.0,83.84,84.04,94.59,95.95


In [24]:
# add % overall passing per school
cond5 = school_data_complete[(school_data_complete.math_score >= 70) & (school_data_complete.reading_score >= 70)]
percent_overall = cond5['school_name'].value_counts()
school_sum_df["% Overall Passing"] = round((percent_overall/school_sum_df["Student Pop."])*100, 2)

In [25]:
# Completed school summary table
school_sum_df

Unnamed: 0_level_0,Type,Student Pop.,Total Budget,Budget per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,1910635,383.97,77.05,81.03,66.68,81.93,54.64
Cabrera High School,Charter,1858,1884411,1014.21,83.06,83.98,94.13,97.04,91.33
Figueroa High School,Charter,2949,1056600,358.29,76.71,81.16,65.99,80.74,53.2
Ford High School,Charter,2739,3022020,1103.33,77.1,80.75,68.31,79.3,54.29
Griffin High School,District,1468,917500,625.0,83.35,83.82,93.39,97.14,90.6
Hernandez High School,Charter,4635,1319574,284.7,77.29,80.93,66.75,80.86,53.53
Holden High School,Charter,427,1081356,2532.45,83.8,83.81,92.51,96.25,89.23
Huang High School,Charter,2917,3124928,1071.28,76.63,81.18,65.68,81.32,53.51
Johnson High School,District,4761,248087,52.11,77.07,80.97,66.06,81.22,53.54
Pena High School,District,962,585858,609.0,83.84,84.04,94.59,95.95,90.54


In [26]:
#top five performing schools by % overall passing
top = school_sum_df.sort_values('% Overall Passing', ascending=False)
top_5 = top.head(5)
top_5

Unnamed: 0_level_0,Type,Student Pop.,Total Budget,Budget per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,1884411,1014.21,83.06,83.98,94.13,97.04,91.33
Thomas High School,District,1635,3094650,1892.75,83.42,83.85,93.27,97.31,90.95
Griffin High School,District,1468,917500,625.0,83.35,83.82,93.39,97.14,90.6
Wilson High School,District,2283,1763916,772.63,83.27,83.99,93.87,96.54,90.58
Pena High School,District,962,585858,609.0,83.84,84.04,94.59,95.95,90.54


In [27]:
#the five worst-performing schools by % overall passing
worst = school_sum_df.sort_values('% Overall Passing')
worst_5 = worst.head(5)
worst_5

Unnamed: 0_level_0,Type,Student Pop.,Total Budget,Budget per Student,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,1049400,262.42,76.84,80.74,66.37,80.22,52.99
Figueroa High School,Charter,2949,1056600,358.29,76.71,81.16,65.99,80.74,53.2
Huang High School,Charter,2917,3124928,1071.28,76.63,81.18,65.68,81.32,53.51
Hernandez High School,Charter,4635,1319574,284.7,77.29,80.93,66.75,80.86,53.53
Johnson High School,District,4761,248087,52.11,77.07,80.97,66.06,81.22,53.54
