In [21]:
# Build each file path with os.path.join(), then load the file through that path variable
# Use pd.read_csv() to read each CSV straight into a DataFrame (no explicit open() needed)

In [22]:
import os
import pandas as pd

# Resolve the two input CSV paths inside the 'Resources' folder.
school_data_to_load, student_data_to_load = (
    os.path.join('Resources', csv_name)
    for csv_name in ('schools_complete.csv', 'students_complete.csv')
)

# Read each CSV into its own DataFrame (school file first, then student file).
school_data_df, student_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (school_data_to_load, student_data_to_load)
)

In [23]:
# Plain-text previews, one after another: the whole frame (pandas truncates
# long frames when printing), the first 5 rows, and the last 10 rows.
print(
    student_data_df,
    student_data_df.head(),
    student_data_df.tail(10),
    sep='\n',
)

       Student ID       student_name gender grade         school_name  \
0               0       Paul Bradley      M   9th   Huang High School   
1               1       Victor Smith      M  12th   Huang High School   
2               2    Kevin Rodriguez      M  12th   Huang High School   
3               3  Dr. Richard Scott      M  12th   Huang High School   
4               4         Bonnie Ray      F   9th   Huang High School   
...           ...                ...    ...   ...                 ...   
39165       39165       Donna Howard      F  12th  Thomas High School   
39166       39166          Dawn Bell      F  10th  Thomas High School   
39167       39167     Rebecca Tanner      F   9th  Thomas High School   
39168       39168       Desiree Kidd      F  10th  Thomas High School   
39169       39169    Carolyn Jackson      F  11th  Thomas High School   

       reading_score  math_score  
0                 66          79  
1                 94          61  
2                 

In [24]:
# Count the non-null entries in each column of the student data.
student_data_df.count()

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [25]:
# Element-wise mask of the school data: True wherever a value is missing.
school_data_df.isnull()

Unnamed: 0,School ID,school_name,type,size,budget
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [26]:
# Total number of missing values in each column of the student data.
student_data_df.isnull().sum()

Student ID       0
student_name     0
gender           0
grade            0
school_name      0
reading_score    0
math_score       0
dtype: int64

In [27]:
# Total number of non-missing values in each column of the student data.
student_data_df.notnull().sum()

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [28]:
# Missing data can be handled with _df.dropna() (drop the affected rows) or _df.fillna(0) (replace missing values with 0)

In [29]:
# Check the data type of every column in the school data.
# Note: dtypes is an attribute, not a method, so it takes no parentheses.
school_data_df.dtypes

School ID       int64
school_name    object
type           object
size            int64
budget          int64
dtype: object

In [30]:
# Check the data type of every column in the student data.
student_data_df.dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object

In [31]:
# Look up the dtype of a single column, once via attribute access and once
# via bracket access.
# Only the last expression in a cell is echoed, so just the reading_score
# dtype is displayed; the grade lookup's result is discarded.
student_data_df.grade.dtype
student_data_df['reading_score'].dtype

dtype('int64')

In [32]:
# Clean student names by stripping professional prefixes and suffixes.
# Each entry carries its adjoining space so that removing it leaves no stray
# whitespace behind.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

for word in prefixes_suffixes:
    # regex=False forces a literal match. Without it, pandas versions before
    # 2.0 interpret the pattern as a regular expression, where the "." in
    # "Dr. " matches any character, and emit a FutureWarning about the
    # changing default. Being explicit gives the same result on every version.
    student_data_df['student_name'] = student_data_df['student_name'].str.replace(word, '', regex=False)
print(student_data_df.head(10))

   Student ID     student_name gender grade        school_name  reading_score  \
0           0     Paul Bradley      M   9th  Huang High School             66   
1           1     Victor Smith      M  12th  Huang High School             94   
2           2  Kevin Rodriguez      M  12th  Huang High School             90   
3           3    Richard Scott      M  12th  Huang High School             67   
4           4       Bonnie Ray      F   9th  Huang High School             97   
5           5    Bryan Miranda      M   9th  Huang High School             94   
6           6    Sheena Carter      F  11th  Huang High School             82   
7           7     Nicole Baker      F  12th  Huang High School             96   
8           8     Michael Roth      M  10th  Huang High School             95   
9           9   Matthew Greene      M  10th  Huang High School             96   

   math_score  
0          79  
1          61  
2          60  
3          58  
4          84  
5          9

### Merge the two DataFrames


In [33]:
# Inspect the columns before merging: the student column labels, then the
# number of columns in the school data.
print(
    student_data_df.columns,
    len(school_data_df.columns),
    sep='\n',
)


Index(['Student ID', 'student_name', 'gender', 'grade', 'school_name',
       'reading_score', 'math_score'],
      dtype='object')
5


In [34]:
# Merge student_data_df (left) with school_data_df (right) on the shared
# 'school_name' column, attaching each school's details to its students.
# validate='many_to_one' makes pandas raise a MergeError if a school name
# appears more than once in school_data_df, instead of silently duplicating
# student rows and inflating every metric computed from the merged frame.

school_data_complete_df = pd.merge(
    student_data_df,
    school_data_df,
    on=['school_name'],
    validate='many_to_one',
)
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


### Calculate key metrics from the merged DataFrame

In [35]:
# --- Head-counts and totals ------------------------------------------------
# Students: non-null names in the merged frame. Schools: distinct school names.
student_count = school_data_complete_df['student_name'].count()
school_count = len(school_data_complete_df['school_name'].unique())
# Budget is summed from the original school frame, not the merged one, where
# each school's budget is repeated on every one of its students' rows.
total_budget = school_data_df['budget'].sum()

# --- Average scores --------------------------------------------------------
avg_math = school_data_complete_df['math_score'].mean()
avg_reading = school_data_complete_df['reading_score'].mean()

# --- Passing students (a score of 70 or above counts as a pass) -------------
math_pass_mask = school_data_complete_df['math_score'] >= 70
reading_pass_mask = school_data_complete_df['reading_score'] >= 70
pass_math_filter_df = school_data_complete_df[math_pass_mask]
pass_reading_filter_df = school_data_complete_df[reading_pass_mask]
passing_math_count = pass_math_filter_df['student_name'].count()
passing_reading_count = pass_reading_filter_df['student_name'].count()

# --- Pass rates as percentages of all students ------------------------------
passing_math_percentage = passing_math_count / student_count * 100
passing_reading_percentage = passing_reading_count / student_count * 100
# NOTE(review): this is the mean of the two pass rates, not the share of
# students who passed both subjects - confirm this is the intended definition.
overall_passing_percentage = (passing_math_percentage + passing_reading_percentage) / 2

# NOTE(review): 'Readinf' looks like a typo for 'Reading'; the label is left
# unchanged here so the printed text stays identical.
print('Math pass Rate: ', passing_math_percentage)
print('Readinf pass Rate: ', passing_reading_percentage)
print(f'Overall pass Rate:{overall_passing_percentage:.2f}%')

Math pass Rate:  74.9808526933878
Readinf pass Rate:  85.80546336482001
Overall pass Rate:80.39%


### Build a summary DataFrame that collects all the key metrics above

In [36]:
# One-row summary table: each district-level metric becomes a column holding
# a single value.
district_summary_df = pd.DataFrame({
    "Total Schools": [school_count],
    "Total Students": [student_count],
    "Total Budget": [total_budget],
    "Average Math Score": [avg_math],
    "Average Reading Score": [avg_reading],
    "% Passing Math": [passing_math_percentage],
    "% Passing Reading": [passing_reading_percentage],
    "% Overall Passing": [overall_passing_percentage],
})
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,80.393158


### Format the summary DataFrame by chaining `map` with `"{}".format`

In [37]:
# Turn each metric into display text: thousands separators for counts,
# currency for the budget, one decimal for average scores, and whole numbers
# for the pass percentages.
summary_formats = {
    "Total Students": "{:,}",
    "Total Budget": "${:,.2f}",
    "Average Math Score": "{:.1f}",
    "Average Reading Score": "{:.1f}",
    "% Passing Math": "{:.0f}",
    "% Passing Reading": "{:.0f}",
    "% Overall Passing": "{:.0f}",
}

for column, template in summary_formats.items():
    # Formatting replaces the numbers with strings, and the numeric format
    # specs raise ValueError when applied to a string. Only format columns
    # that are still numeric so that re-running this cell is a harmless no-op
    # instead of an error.
    if pd.api.types.is_numeric_dtype(district_summary_df[column]):
        district_summary_df[column] = district_summary_df[column].map(template.format)

district_summary_df


Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,80
