In [None]:
import pandas as pd

In [None]:
# Files to load
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"
# Read the school data file and store it in a Pandas DataFrame.
school_data_df = pd.read_csv(school_data_to_load)
school_data_df

In [None]:
# Read the student data file and store it in a Pandas DataFrame.
student_data_df = pd.read_csv(student_data_to_load)
student_data_df.head()

In [None]:
# Determine if there are any missing values in the school data
school_data_df.count()

In [None]:
# Determine if there are any missing values in the student data.
student_data_df.count()

In [None]:
# Show where values are missing in the school data.
# isnull() returns a DataFrame of booleans with the same shape as the
# original: True where a value is missing, False otherwise.
school_data_df.isnull()

In [None]:
student_data_df.isnull().sum()

In [None]:
# Determine if there are not any missing values in the school data.
school_data_df.notnull().sum()

In [None]:
# Determine if there are not any missing values in the student data.
student_data_df.notnull().sum()

In [None]:
# Add each prefix and suffix to remove to a list.
# Prefixes carry a trailing space and suffixes a leading space so that
# removing them does not leave a stray space in the name.
prefixes_suffixes = ["Dr. ", "Mr. ","Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

In [None]:
# Iterate through the entries in the "prefixes_suffixes" list and remove each
# one from the student names by replacing it with an empty string, "".
# regex=False forces a literal match. Under regex matching the "." in
# "Dr. " would match any character, and the default for `regex` differs
# between pandas versions, so it is stated explicitly here.
for item in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(item, "", regex=False)
student_data_df.head()

## Merge DataFrames

In [None]:
# Combine the student and school data into a single dataset by joining on
# the "school_name" column that both DataFrames share. The key is passed
# once; listing the same column twice adds nothing to the join.
school_data_complete_df = pd.merge(student_data_df, school_data_df, on="school_name")
school_data_complete_df.head()

## Get the Number of Students

In [None]:
# Get the total number of students
# NOTE: count() on the whole DataFrame returns the number of non-null values
# in every column (a Series with one entry per column), not a single total.
student_count = school_data_complete_df.count()
student_count

In [None]:
student_count2 = school_data_complete_df["Student ID"].count()
student_count2

## Get the Number of Schools

In [None]:
# Calculate the total number of schools
school_count = school_data_df["school_name"].count()
school_count

In [None]:
# Get the schools present in the merged DataFrame.
# since there's 39,170 rows/values in the merged DataFrame, it's better to use unique() instead of count()
# NOTE: unique() returns an array of the distinct school names, so despite
# its name "school_count_2" holds the names themselves rather than a number.
school_count_2 = school_data_complete_df["school_name"].unique()
school_count_2

In [None]:
len(school_data_complete_df["school_name"].unique())

## Get the Total Budget

In [None]:
# Calculate the Total Budget using "school_data_complete_df" DataFrame
total_budget = school_data_complete_df["budget"].sum()
total_budget

In [None]:
# Calculate the Total Budget using "school_data_df" DataFrame
total_budget_2 = school_data_df["budget"].sum()
total_budget_2

## Get the Score Averages

In [None]:
# Calculate the average reading score
average_reading_score = school_data_complete_df["reading_score"].mean()
average_reading_score

In [None]:
# Calculate the average math score
average_math_score = school_data_complete_df["math_score"].mean()
average_math_score

## Get the Passing Percentages

In [None]:
# Determine the Passing Grade
passing_math = school_data_complete_df["math_score"] >= 70
passing_reading = school_data_complete_df["reading_score"] >= 70

In [None]:
# make a new dataframe for the students who passed math
passing_math_df = school_data_complete_df[passing_math]
passing_math_df.head(5)

In [None]:
# make a new dataframe for the students who passed reading
passing_reading_df = school_data_complete_df[passing_reading]
passing_reading_df.head(5)

In [None]:
# Calculate the number of students passing math
passing_math_count = passing_math_df["Student ID"].count()
passing_math_count

In [None]:
# Calculate the number of students passing reading
passing_reading_count = passing_reading_df["Student ID"].count()
passing_reading_count

In [None]:
# Get the percentage of students who passed math and reading
passing_math_percentage = passing_math_count / float(student_count2) * 100
passing_reading_percentage = passing_reading_count / float(student_count2) * 100

In [None]:
print(passing_math_percentage)
print(passing_reading_percentage)

In [None]:
# Calculate the percentage of students who passed both math and reading
passing_math_reading = school_data_complete_df[passing_math & passing_reading]
passing_math_reading.head(5)

In [None]:
# Calculate the number of students who passed both math and reading
overall_passing_math_reading_count = passing_math_reading["student_name"].count()
overall_passing_math_reading_count

In [None]:
# Calculate the overall passing percentage
overall_passing_percentage = overall_passing_math_reading_count / student_count2 * 100
overall_passing_percentage

## Create a District Summary DataFrame
#### Maria is eager to see the district summary so that she can pass the information along to stakeholders. You'll need to combine all of the metrics we just calculated and put them in a new DataFrame to provide Maria with a table that contains all the data

In [None]:
# Gather the district-level metrics under their display labels, then wrap
# them in a one-element list so the DataFrame has a single summary row.
district_metrics = {
    "Total Schools": school_count,
    "Total Students": student_count2,
    "Total Budget": total_budget_2,
    "Average Math Score": average_math_score,
    "Average Reading Score": average_reading_score,
    "% Passing Math": passing_math_percentage,
    "% Passing Reading": passing_reading_percentage,
    "% Overall Passing": overall_passing_percentage,
}
district_summary_df = pd.DataFrame([district_metrics])
district_summary_df

## Format Columns

#### Maria is impressed by your summary DataFrame but wants to add some formatting to make the DataFrame look more professional. You'll need to format the budget to two decimal places; format the grade averages to one decimal place and grade percentages to the nearest whole number percent; and add a thousands separator for numbers greater than 1,000.

### A little bit about functions

In [None]:
passing_math_count = 29370
total_student_count = 39170

In [None]:
def passing_math_percent(passing_math_count, total_student_count):
    """Return the percentage of students who passed math.

    Args:
        passing_math_count: Number of students who passed math.
        total_student_count: Total number of students; converted to a
            float before dividing.

    Returns:
        passing_math_count divided by total_student_count, multiplied by 100.
    """
    total_as_float = float(total_student_count)
    passing_fraction = passing_math_count / total_as_float
    return passing_fraction * 100

In [None]:
passing_math_percent(passing_math_count, total_student_count)

In [None]:
district_summary_df

In [None]:
# Format the "Total Students" to have the comma for a thousands separator.
# The values become strings, so this cell is meant to run once on the
# freshly built numeric column.
district_summary_df["Total Students"] = district_summary_df["Total Students"].map("{:,}".format)
district_summary_df["Total Students"]

In [None]:
# Format the "Total Budget" with a dollar sign, a comma thousands separator
# and two decimal places. The values become strings, so this cell is meant
# to run once on the freshly built numeric column.
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map("${:,.2f}".format)
district_summary_df["Total Budget"]

In [None]:
# Format the remaining columns: score averages to one decimal place and
# passing percentages to the nearest whole number. Each column is mapped
# through the format template listed next to it.
column_formats = {
    "Average Math Score": "{:.1f}",
    "Average Reading Score": "{:.1f}",
    "% Passing Math": "{:.0f}",
    "% Passing Reading": "{:.0f}",
    "% Overall Passing": "{:.0f}",
}
for column_name, template in column_formats.items():
    district_summary_df[column_name] = district_summary_df[column_name].map(template.format)

In [None]:
district_summary_df

# 4.8.1 Set the Index to the School Name
### Maria is impressed with the work you have done generating the district summary. Now she would like to generate a similar summary for each school in the district

In [None]:
# "set_index" returns a DataFrame indexed by "school_name"; selecting the
# "type" column from it gives a Series whose index is the school name and
# whose values are the type of school.
per_school_types = school_data_df.set_index(["school_name"])["type"]
per_school_types

In [None]:
# Now we'll create a new DataFrame by converting this Series to a DataFrame as follows:
df = pd.DataFrame(per_school_types)
df.head(5)

# 4.8.2 Get the Student Count Per School
### Next to the "type" of school column, Maria wants you to add the total number of students in each school. However, you have the number of students in two DataFrames: school_data_df and school_data_complete_df. Which one do you use? You'll need to get the number of students from both DataFrames and find which one has "school_name" as the index.

In [None]:
school_data_df.head(5)

In [None]:
per_school_counts = school_data_df.set_index(["school_name"])["size"]
per_school_counts

# 4.8.3 Get the Budget Per Student
### Next to the Total Students column, Maria wants you to add the budget per student for each school. First, you'll need to get the budget for each school and then divide it by the total students per school, which you already calculated as "per_school_counts".

In [None]:
# Calculate the total school budget.
per_school_budget = school_data_df.set_index(["school_name"])["budget"]
per_school_budget

In [None]:
# Calculate the per capita spending.
per_school_capita = per_school_budget / per_school_counts
per_school_capita

## 4.8.4 Get the Score Averages Per School
#### Now you need to calculate the average math score and the average reading score for each school

In [None]:
# Calculate the math scores.
student_school_math = student_data_df.set_index(["school_name"])["math_score"]
student_school_math.head(5)

Using the `set_index()` method on the `"school_name"` column in `student_data_df` is not a good approach because each school name occurs many times (once per student), so the resulting index is not unique and the scores are not summarized per school.

Instead, we use the `groupby()` function, which splits an object (like a DataFrame) into groups, applies a mathematical operation to each group, and combines the results. It is useful for grouping large amounts of data when we want to compute mathematical operations on those groups.

In our case, the mathematical operation we will apply to the `groupby()` function is the `mean()` method. Let's see how this will look when we apply it to `school_data_complete_df` to get the grade averages for each column. 

In [None]:
school_data_complete_df.head(5)

In [None]:
# Calculate the per-school average of every numeric column.
# numeric_only=True leaves out text columns (e.g. the student names), which
# cannot be averaged and make mean() raise a TypeError on recent pandas.
per_school_averages = school_data_complete_df.groupby(["school_name"]).mean(numeric_only=True)
per_school_averages.head(5)

In [None]:
# Calculate the average test scores per school.
# The score column is selected before aggregating, so only that column is
# averaged; averaging the whole frame would also try the text columns.
per_school_math_average = school_data_complete_df.groupby(["school_name"])["math_score"].mean()
per_school_reading_average = school_data_complete_df.groupby(["school_name"])["reading_score"].mean()

In [None]:
per_school_math_average.head(5)

In [None]:
per_school_reading_average.head(5)

## 4.8.5 Get the Passing Percentages Per School
Now Maria would like you to continue gathering key data by calculating the passing percentages for math and reading for each school, as well as get the overall passing percentage for each school. 

In [None]:
# To get the passing percentages, we need to:
# 1. Determine what is the passing grade (>=70)
# 2. Calculate the passing scores by creating a filtered DataFrame
per_school_passing_math = school_data_complete_df[(school_data_complete_df["math_score"] >= 70)]
per_school_passing_reading = school_data_complete_df[(school_data_complete_df["reading_score"] >= 70)]
# 3. Get the number of students who passed math and passed reading by school.

In [None]:
per_school_passing_math.head(5)

In [None]:
per_school_passing_reading.head(5)

In [None]:
# Count, for each school, the students in the passing-math frame and in the
# passing-reading frame, using the "student_name" column as the thing counted.
passing_student_number_per_school_math = per_school_passing_math.groupby(["school_name"])["student_name"].count()
passing_student_number_per_school_reading = per_school_passing_reading.groupby(["school_name"])["student_name"].count()

In [None]:
passing_student_number_per_school_math.head()

In [None]:
passing_student_number_per_school_reading

In [None]:
# Determine the Percentage of Students Passing Math and Reading
percentage_students_passing_math = passing_student_number_per_school_math / per_school_counts * 100
percentage_students_passing_reading = passing_student_number_per_school_reading / per_school_counts * 100

In [None]:
percentage_students_passing_math

In [None]:
percentage_students_passing_reading

In [None]:
# Get the overall passing percentage for all students for each school
# Calculate the students who passed both math and reading
per_passing_math_reading = school_data_complete_df[(school_data_complete_df["math_score"] >= 70) & (school_data_complete_df["reading_score"] >= 70)]
per_passing_math_reading.head()

In [None]:
# Count, per school, the students who passed both math and reading.
# NOTE: this rebinds "per_passing_math_reading" from the filtered DataFrame
# to a Series of per-school counts, so re-running this cell on its own would
# group that Series rather than the filtered DataFrame.
per_passing_math_reading = per_passing_math_reading.groupby(["school_name"]).count()["student_name"]
per_passing_math_reading

In [None]:
# Calculate the overall passing percentage
per_overall_passing_percentage = per_passing_math_reading / per_school_counts * 100
per_overall_passing_percentage