# PyCitySchools

### Dependencies and data

In [72]:
# Dependencies
import numpy as np
import pandas as pd

In [73]:
# School data
schools_df = pd.read_csv('data/schools_complete.csv')
print(schools_df.info())
schools_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   School ID    15 non-null     int64 
 1   school_name  15 non-null     object
 2   type         15 non-null     object
 3   size         15 non-null     int64 
 4   budget       15 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 728.0+ bytes
None


Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411


In [74]:
# Student data
students_df = pd.read_csv('data/students_complete.csv')
print(students_df.info())
students_df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39170 entries, 0 to 39169
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Student ID     39170 non-null  int64 
 1   student_name   39170 non-null  object
 2   gender         39170 non-null  object
 3   grade          39170 non-null  object
 4   school_name    39170 non-null  object
 5   reading_score  39170 non-null  int64 
 6   math_score     39170 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 2.1+ MB
None


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61


In [75]:
# Merge data
df = pd.merge(students_df, schools_df, on='school_name', how='left')
df.head(2)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635


### Clean student names

In [76]:
# Prefixes and suffixes
student_names = [name.split() for name in students_df.student_name.values if len(name.split()) != 2]
prefs = list(set([name[0] for name in student_names if len(name[0]) < 5]))
sufs = list(set([name[-1] for name in student_names if len(name[-1]) < 5]))
print(prefs, sufs)

['Anne', 'Miss', 'Dana', 'Ms.', 'Judy', 'Gary', 'Tara', 'Kara', 'Gail', 'Dawn', 'Lori', 'Marc', 'Troy', 'Todd', 'Greg', 'Eric', 'Ian', 'Ruth', 'Chad', 'Mark', 'Jose', 'Kari', 'Mike', 'Joe', 'Seth', 'Mrs.', 'Mary', 'Emma', 'Gina', 'Carl', 'Anna', 'Lynn', 'Lisa', 'Jill', 'Leah', 'Adam', 'Tony', 'Luke', 'Ryan', 'John', 'Sara', 'Erin', 'Cody', 'Noah', 'Kyle', 'Dale', 'Amy', 'Sean', 'Mr.', 'Dr.', 'Jodi', 'Tina', 'Erik', 'Omar', 'Toni', 'Paul', 'Cory', 'Kim', 'Jon'] ['Reed', 'Odom', 'Pham', 'Cox', 'Hill', 'Tran', 'Ford', 'Tate', 'Vang', 'Sosa', 'PhD', 'Ware', 'Roy', 'Cain', 'King', 'York', 'Duke', 'V', 'Mays', 'Rose', 'Hood', 'Jr.', 'Moss', 'DVM', 'Hall', 'West', 'Chen', 'Hale', 'Moon', 'Cobb', 'Cook', 'Love', 'Li', 'Day', 'Wood', 'Diaz', 'Page', 'Dunn', 'Park', 'Kemp', 'Ryan', 'Gray', 'IV', 'Dyer', 'Pace', 'DDS', 'Kim', 'Holt', 'Bell', 'Koch', 'Neal', 'Hays', 'Gill', 'Shea', 'MD', 'III', 'Cole', 'II', 'Webb', 'Ross', 'Levy', 'Lowe', 'Lee', 'Bond']


In [77]:
# Remove inappropriate prefixes and suffixes
students_df.student_name = students_df.student_name.str.replace('Miss ', '')
students_df.student_name = students_df.student_name.str.replace(r'.+\. ', '')
students_df.student_name = students_df.student_name.str.replace(r' [pMD].*[DMS]', '')

### District summary

In [78]:
# District summary
district_summary = pd.DataFrame(schools_df[['size', 'budget']].sum()).T
district_summary.columns = ['Total Students', 'Total Budget']
district_summary.index = ['District']
district_summary['Total Schools'] = schools_df.shape[0]
district_summary = district_summary[['Total Schools', 'Total Students', 'Total Budget']]
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget
District,15,39170,24649428


In [79]:
# Add passing features
df['pass_read'] = (df.reading_score >= 70).astype(int)
df['pass_math'] = (df.math_score >= 70).astype(int)
df['pass_both'] = ((df.pass_read + df.pass_math) / 2).astype(int)
df.head(2)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,pass_read,pass_math,pass_both
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,0,1,0
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,1,0,0


In [80]:
# District scores
district_scores = df[['reading_score', 'math_score', 'pass_read', 'pass_math', 'pass_both']].mean()
score_columns = ['Average Reading Score', 'Average Math Score', '% Passing Reading', '% Passing Math', '% Overall Passing']
district_scores

reading_score    81.877840
math_score       78.985371
pass_read         0.858055
pass_math         0.749809
pass_both         0.651723
dtype: float64

In [81]:
# Add scores to district summary
for i in range(5):
    col = district_scores.index[i]
    val = district_scores[col]
    if col[:4] == 'pass':
        val *= 100
    district_summary[score_columns[i]] = val
    
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Reading Score,Average Math Score,% Passing Reading,% Passing Math,% Overall Passing
District,15,39170,24649428,81.87784,78.985371,85.805463,74.980853,65.172326


In [82]:
# Format columns
for col in district_summary.columns[1:]:
    if col[0] == 'T':
        district_summary[col] = district_summary[col].map('{:,}'.format)
    if col[0] == 'A':
        district_summary[col] = district_summary[col].round(2)
    if col[0] == '%':
        district_summary[col] = district_summary[col].astype(int)
        
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Reading Score,Average Math Score,% Passing Reading,% Passing Math,% Overall Passing
District,15,39170,24649428,81.88,78.99,85,74,65


### School summary

In [83]:
# Add budget per student featuure
df['budget_per_student'] = df['budget'] // df['size']
df.head(2)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,pass_read,pass_math,pass_both,budget_per_student
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,0,1,0,655
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,1,0,0,655


In [84]:
# School summary
school_feats = ['type', 'size', 'budget', 'budget_per_student', 'reading_score', 'math_score', 'pass_read', 'pass_math', 'pass_both']
school_summary = df.groupby('school_name')[school_feats].agg({
    'type': 'max',
    'size': 'max',
    'budget': 'max',
    'budget_per_student': 'max',
    'reading_score': 'mean',
    'math_score': 'mean',
    'pass_read': 'mean',
    'pass_math': 'mean',
    'pass_both': 'mean'
})
school_summary

Unnamed: 0_level_0,type,size,budget,budget_per_student,reading_score,math_score,pass_read,pass_math,pass_both
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,3124928,628,81.033963,77.048432,0.819333,0.666801,0.546423
Cabrera High School,Charter,1858,1081356,582,83.97578,83.061895,0.970398,0.941335,0.913348
Figueroa High School,District,2949,1884411,639,81.15802,76.711767,0.807392,0.659885,0.532045
Ford High School,District,2739,1763916,644,80.746258,77.102592,0.79299,0.683096,0.542899
Griffin High School,Charter,1468,917500,625,83.816757,83.351499,0.97139,0.933924,0.905995
Hernandez High School,District,4635,3022020,652,80.934412,77.289752,0.80863,0.66753,0.535275
Holden High School,Charter,427,248087,581,83.814988,83.803279,0.962529,0.925059,0.892272
Huang High School,District,2917,1910635,655,81.182722,76.629414,0.813164,0.656839,0.535139
Johnson High School,District,4761,3094650,650,80.966394,77.072464,0.812224,0.660576,0.535392
Pena High School,Charter,962,585858,609,84.044699,83.839917,0.959459,0.945946,0.905405


In [85]:
# Format school summary
school_summary.index.name = None
school_summary.columns = ['School Type', 'Total Students', 'Total Budget', 'Budget Per Student',
                          'Average Reading Score', 'Average Math Score', '% Passing Reading', '% Passing Math', '% Overall Passing']

# Format values
for col in school_summary.columns[1:]:
    if col[0] == '%':
        school_summary[col] = (school_summary[col] * 100).round(0).astype(int)
    elif col[0] == 'A':
        school_summary[col] = school_summary[col].round(2)
    else:
        school_summary[col] = school_summary[col].map('{:,}'.format)
        
school_summary

Unnamed: 0,School Type,Total Students,Total Budget,Budget Per Student,Average Reading Score,Average Math Score,% Passing Reading,% Passing Math,% Overall Passing
Bailey High School,District,4976,3124928,628,81.03,77.05,82,67,55
Cabrera High School,Charter,1858,1081356,582,83.98,83.06,97,94,91
Figueroa High School,District,2949,1884411,639,81.16,76.71,81,66,53
Ford High School,District,2739,1763916,644,80.75,77.1,79,68,54
Griffin High School,Charter,1468,917500,625,83.82,83.35,97,93,91
Hernandez High School,District,4635,3022020,652,80.93,77.29,81,67,54
Holden High School,Charter,427,248087,581,83.81,83.8,96,93,89
Huang High School,District,2917,1910635,655,81.18,76.63,81,66,54
Johnson High School,District,4761,3094650,650,80.97,77.07,81,66,54
Pena High School,Charter,962,585858,609,84.04,83.84,96,95,91


### Average scores by grade

In [86]:
# Schools in descending order of pass rate
df.groupby('school_name')['pass_both'].mean().sort_values(ascending=False)

school_name
Cabrera High School      0.913348
Thomas High School       0.909480
Griffin High School      0.905995
Wilson High School       0.905826
Pena High School         0.905405
Wright High School       0.903333
Shelton High School      0.898921
Holden High School       0.892272
Bailey High School       0.546423
Ford High School         0.542899
Johnson High School      0.535392
Hernandez High School    0.535275
Huang High School        0.535139
Figueroa High School     0.532045
Rodriguez High School    0.529882
Name: pass_both, dtype: float64

In [87]:
# Reading scores by grade of each school
grade_reading_scores = pd.pivot_table(data=df, index='school_name', columns='grade', values='reading_score', aggfunc='mean').round(2)
grade_reading_scores.index.name = None
grade_reading_scores.columns.name = None
grade_reading_scores

Unnamed: 0,10th,11th,12th,9th
Bailey High School,80.91,80.95,80.91,81.3
Cabrera High School,84.25,83.79,84.29,83.68
Figueroa High School,81.41,80.64,81.38,81.2
Ford High School,81.26,80.4,80.66,80.63
Griffin High School,83.71,84.29,84.01,83.37
Hernandez High School,80.66,81.4,80.86,80.87
Holden High School,83.32,83.82,84.7,83.68
Huang High School,81.51,81.42,80.31,81.29
Johnson High School,80.77,80.62,81.23,81.26
Pena High School,83.61,84.34,84.59,83.81


In [88]:
# Math scores by grade of each school
grade_math_scores = pd.pivot_table(data=df, index='school_name', columns='grade', values='math_score', aggfunc='mean').round(2)
grade_math_scores.index.name = None
grade_math_scores.columns.name = None
grade_math_scores

Unnamed: 0,10th,11th,12th,9th
Bailey High School,77.0,77.52,76.49,77.08
Cabrera High School,83.15,82.77,83.28,83.09
Figueroa High School,76.54,76.88,77.15,76.4
Ford High School,77.67,76.92,76.18,77.36
Griffin High School,84.23,83.84,83.36,82.04
Hernandez High School,77.34,77.14,77.19,77.44
Holden High School,83.43,85.0,82.86,83.79
Huang High School,75.91,76.45,77.23,77.03
Johnson High School,76.69,77.49,76.86,77.19
Pena High School,83.37,84.33,84.12,83.63


### Scores by budget per student

In [94]:
# Bin budget per student feature
schools_df['budget_per_student'] = schools_df['budget'] // schools_df['size']
schools_df['bps_bin'] = pd.qcut(schools_df['budget_per_student'], 4, labels=[f'Spending Level {n}' for n in range(1, 5)])
schools_df.bps_bin.value_counts()

Spending Level 4    4
Spending Level 2    4
Spending Level 1    4
Spending Level 3    3
Name: bps_bin, dtype: int64

In [95]:
# Bin school size
schools_df['size_bin'] = pd.qcut(schools_df['size'], 3, labels=['Small', 'Medium', 'Large'])
schools_df.size_bin.value_counts()

Large     5
Medium    5
Small     5
Name: size_bin, dtype: int64

In [96]:
# Merge bin features into combined data
df = pd.merge(df, schools_df[['school_name', 'bps_bin', 'size_bin']], on='school_name', how='left')
df.head(2)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,pass_read,pass_math,pass_both,budget_per_student,bps_bin,size_bin
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,0,1,0,655,Spending Level 4,Medium
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,1,0,0,655,Spending Level 4,Medium


In [100]:
# Scores by spending
spending_scores = df.groupby('bps_bin')[['reading_score', 'math_score', 'pass_read', 'pass_math', 'pass_both']].mean().round(2)
for col in spending_scores.columns[2:]:
    spending_scores[col] = (spending_scores[col] * 100).astype(int)
spending_scores

Unnamed: 0_level_0,reading_score,math_score,pass_read,pass_math,pass_both
bps_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Spending Level 1,83.96,83.36,97,94,91
Spending Level 2,82.31,79.98,89,79,71
Spending Level 3,81.48,78.05,84,71,60
Spending Level 4,80.96,77.06,81,67,54


In [101]:
# Formatting
spending_scores.index.name = None
spending_scores.columns = ['Average Reading Score', 'Average Math Score', '% Passing Reading', '% Passing Math', '% Overall Passing']
spending_scores

Unnamed: 0,Average Reading Score,Average Math Score,% Passing Reading,% Passing Math,% Overall Passing
Spending Level 1,83.96,83.36,97,94,91
Spending Level 2,82.31,79.98,89,79,71
Spending Level 3,81.48,78.05,84,71,60
Spending Level 4,80.96,77.06,81,67,54


### Scores by school size

In [103]:
# Scores by spending
size_scores = df.groupby('size_bin')[['reading_score', 'math_score', 'pass_read', 'pass_math', 'pass_both']].mean().round(2)
for col in size_scores.columns[2:]:
    size_scores[col] = (size_scores[col] * 100).astype(int)
size_scores

Unnamed: 0_level_0,reading_score,math_score,pass_read,pass_math,pass_both
size_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small,83.83,83.48,97,94,90
Medium,82.51,80.17,89,81,73
Large,80.96,77.02,81,66,54


In [104]:
# Formatting
size_scores.index.name = None
size_scores.columns = ['Average Reading Score', 'Average Math Score', '% Passing Reading', '% Passing Math', '% Overall Passing']
size_scores

Unnamed: 0,Average Reading Score,Average Math Score,% Passing Reading,% Passing Math,% Overall Passing
Small,83.83,83.48,97,94,90
Medium,82.51,80.17,89,81,73
Large,80.96,77.02,81,66,54


### Scores by school type

In [105]:
# Scores by spending
type_scores = df.groupby('type')[['reading_score', 'math_score', 'pass_read', 'pass_math', 'pass_both']].mean().round(2)
for col in type_scores.columns[2:]:
    type_scores[col] = (type_scores[col] * 100).astype(int)
type_scores

Unnamed: 0_level_0,reading_score,math_score,pass_read,pass_math,pass_both
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.9,83.41,97,94,91
District,80.96,76.99,81,67,54


In [106]:
# Formatting
type_scores.index.name = None
type_scores.columns = ['Average Reading Score', 'Average Math Score', '% Passing Reading', '% Passing Math', '% Overall Passing']
type_scores

Unnamed: 0,Average Reading Score,Average Math Score,% Passing Reading,% Passing Math,% Overall Passing
Charter,83.9,83.41,97,94,91
District,80.96,76.99,81,67,54
