In [5]:
import getpass
import os

import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy
from psycopg2 import sql

In [6]:
# This notebook cleans and performs high-level analysis of school testing data for Tennessee schools for the 2021-2022, 2020-2021, 2017-2018, and 2012-2013 school years.


In [7]:
# Load each school year's district assessment file (xlsx or csv) into its own dataframe
read_dis_sc13_df = pd.read_excel(
    'Data/2013/data_2013_district_base.xlsx'
)
read_dis_sc18_df = pd.read_csv(
    'Data/2018/data_2018_district_base.csv'
)
read_dis_sc21_df = pd.read_csv(
    'Data/2021/district_assessment_file_suppressed_upd422.csv'
)
read_dis_sc22_df = pd.read_excel(
    'Data/2022/district_assessment_file_suppressed_upd32323.xlsx'
)


In [8]:
# Work on independent copies of the raw frames. A plain assignment would only
# create a second name for the same object, so any in-place change made while
# cleaning would also alter the read_dis_* frames loaded from disk.
dis_sc13_df = read_dis_sc13_df.copy()
dis_sc18_df = read_dis_sc18_df.copy()
dis_sc21_df = read_dis_sc21_df.copy()
dis_sc22_df = read_dis_sc22_df.copy()

In [9]:
# Print column dtypes and non-null counts for every year, newest first
for frame in (dis_sc22_df, dis_sc21_df, dis_sc18_df, dis_sc13_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167631 entries, 0 to 167630
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   year                       167631 non-null  int64 
 1   system                     167631 non-null  int64 
 2   system_name                167631 non-null  object
 3   test                       167631 non-null  object
 4   subject                    167631 non-null  object
 5   grade                      167631 non-null  object
 6   student_group              167631 non-null  object
 7   participation_rate         167631 non-null  object
 8   enrolled                   167631 non-null  int64 
 9   tested                     167631 non-null  int64 
 10  valid_tests                167631 non-null  int64 
 11  n_below                    167631 non-null  object
 12  n_approaching              167631 non-null  object
 13  n_met_expectations         167631 non-null  

In [10]:
# Preview the first three rows of every year's frame, oldest first
for frame in (dis_sc13_df, dis_sc18_df, dis_sc21_df, dis_sc22_df):
    display(frame.head(3))


Unnamed: 0,year,system,system_name,school,school_name,subject,grade,subgroup,valid_tests,n_below_bsc,n_bsc,n_prof,n_adv,pct_below_bsc,pct_bsc,pct_prof,pct_adv,pct_bsc_and_below,pct_prof_adv
0,2013,10,Anderson County,0,District,Math,All Grades,All Students,2908,475,1036,888,509,16.4,35.6,30.5,17.5,52.0,48.0
1,2013,10,Anderson County,0,District,Math,All Grades,White,2810,451,1005,861,493,16.1,35.8,30.6,17.5,51.9,48.1
2,2013,10,Anderson County,0,District,Math,All Grades,Hispanic,16,1,4,5,6,6.2,25.0,31.3,37.5,31.2,68.8


Unnamed: 0,year,system,system_name,test,subject,grade,subgroup,valid_tests,n_below,n_approaching,n_on_track,n_mastered,pct_below,pct_approaching,pct_on_track,pct_mastered,pct_on_mastered
0,2018,10,Anderson County,EOC,Algebra I,10,All Students,80.0,**,**,**,**,**,**,**,**,3.8
1,2018,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,1.0,*,*,*,*,*,*,*,*,*
2,2018,10,Anderson County,EOC,Algebra I,10,Black/Hispanic/Native American,1.0,*,*,*,*,*,*,*,*,*


Unnamed: 0,year,system,system_name,test,subject,grade,subgroup,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_on_track,n_mastered,pct_below,pct_approaching,pct_on_track,pct_mastered,pct_on_mastered
0,2021,10,Anderson County,EOC,Algebra I,10,All Students,94,89,84,82,**,**,**,**,**,**,**,**,1.2
1,2021,10,Anderson County,EOC,Algebra I,10,Black or African American,*,4,4,4,*,*,*,*,*,*,*,*,*
2,2021,10,Anderson County,EOC,Algebra I,10,Black/Hispanic/Native American,*,9,8,7,*,*,*,*,*,*,*,*,*


Unnamed: 0,year,system,system_name,test,subject,grade,student_group,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2022,10,Anderson County,EOC,Algebra I,10,All Students,99,79,78,77,**,**,**,**,**,**,**,**,1.3
1,2022,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,*,1,1,1,*,*,*,*,*,*,*,*,*
2,2022,10,Anderson County,EOC,Algebra I,10,Black or African American,*,2,2,2,*,*,*,*,*,*,*,*,*


In [11]:
## Cleaning the Data ##
# Drop the 'school' and 'school_name' columns from the 2013 frame.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
dis_sc13_df = dis_sc13_df.drop(columns=['school', 'school_name'], errors='ignore')
dis_sc13_df.head(3)

Unnamed: 0,year,system,system_name,subject,grade,subgroup,valid_tests,n_below_bsc,n_bsc,n_prof,n_adv,pct_below_bsc,pct_bsc,pct_prof,pct_adv,pct_bsc_and_below,pct_prof_adv
0,2013,10,Anderson County,Math,All Grades,All Students,2908,475,1036,888,509,16.4,35.6,30.5,17.5,52.0,48.0
1,2013,10,Anderson County,Math,All Grades,White,2810,451,1005,861,493,16.1,35.8,30.6,17.5,51.9,48.1
2,2013,10,Anderson County,Math,All Grades,Hispanic,16,1,4,5,6,6.2,25.0,31.3,37.5,31.2,68.8


In [12]:
# Cast the 2018 valid_tests counts from float to int
dis_sc18_df = dis_sc18_df.assign(
    valid_tests=dis_sc18_df['valid_tests'].astype(int)
)
dis_sc18_df.head(3)

Unnamed: 0,year,system,system_name,test,subject,grade,subgroup,valid_tests,n_below,n_approaching,n_on_track,n_mastered,pct_below,pct_approaching,pct_on_track,pct_mastered,pct_on_mastered
0,2018,10,Anderson County,EOC,Algebra I,10,All Students,80,**,**,**,**,**,**,**,**,3.8
1,2018,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,1,*,*,*,*,*,*,*,*,*
2,2018,10,Anderson County,EOC,Algebra I,10,Black/Hispanic/Native American,1,*,*,*,*,*,*,*,*,*


In [13]:
# Rename the 2021 columns so they use the 2022 naming scheme
columns_2021_to_2022 = {
    'n_mastered': 'n_exceeded_expectations',
    'n_on_track': 'n_met_expectations',
    'pct_on_track': 'pct_met_expectations',
    'pct_mastered': 'pct_exceeded_expectations',
    'pct_on_mastered': 'pct_met_exceeded',
    'subgroup': 'student_group',
}
dis_sc21_df = dis_sc21_df.rename(columns=columns_2021_to_2022)
display(dis_sc21_df.head(2), dis_sc22_df.head(2))

Unnamed: 0,year,system,system_name,test,subject,grade,student_group,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2021,10,Anderson County,EOC,Algebra I,10,All Students,94,89,84,82,**,**,**,**,**,**,**,**,1.2
1,2021,10,Anderson County,EOC,Algebra I,10,Black or African American,*,4,4,4,*,*,*,*,*,*,*,*,*


Unnamed: 0,year,system,system_name,test,subject,grade,student_group,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2022,10,Anderson County,EOC,Algebra I,10,All Students,99,79,78,77,**,**,**,**,**,**,**,**,1.3
1,2022,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,*,1,1,1,*,*,*,*,*,*,*,*,*


In [14]:
# Total valid tests per student group, restricted to rows whose grade is 09-12
high_school_grades = ['09', '10', '11', '12']

# For 2021
rows_2021 = dis_sc21_df[dis_sc21_df['grade'].isin(high_school_grades)]
df_2021 = rows_2021.groupby('student_group')['valid_tests'].sum().reset_index()

# For 2022
rows_2022 = dis_sc22_df[dis_sc22_df['grade'].isin(high_school_grades)]
df_2022 = rows_2022.groupby('student_group')['valid_tests'].sum().reset_index()

display(df_2021, df_2022)


Unnamed: 0,student_group,valid_tests
0,All Students,259502
1,American Indian or Alaska Native,1161
2,Asian,5356
3,Black or African American,60130
4,Black/Hispanic/Native American,90777
5,Economically Disadvantaged,72135
6,English Learner Transitional 1-4,4446
7,English Learners,8133
8,English Learners with Transitional 1-4,12569
9,Female,126349


Unnamed: 0,student_group,valid_tests
0,All Students,288033
1,American Indian or Alaska Native,1315
2,Asian,6173
3,Black or African American,67587
4,Black/Hispanic/Native American,106519
5,Economically Disadvantaged,75920
6,English Learner Transitional 1-4,3730
7,English Learners,10735
8,English Learners with Transitional 1-4,14463
9,Female,139779


In [15]:
# Show 2022 and 2018 side by side to compare their column names
display(dis_sc22_df.head(2), dis_sc18_df.head(2))


Unnamed: 0,year,system,system_name,test,subject,grade,student_group,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2022,10,Anderson County,EOC,Algebra I,10,All Students,99,79,78,77,**,**,**,**,**,**,**,**,1.3
1,2022,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,*,1,1,1,*,*,*,*,*,*,*,*,*


Unnamed: 0,year,system,system_name,test,subject,grade,subgroup,valid_tests,n_below,n_approaching,n_on_track,n_mastered,pct_below,pct_approaching,pct_on_track,pct_mastered,pct_on_mastered
0,2018,10,Anderson County,EOC,Algebra I,10,All Students,80,**,**,**,**,**,**,**,**,3.8
1,2018,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,1,*,*,*,*,*,*,*,*,*


In [16]:
# Rename the 2018 columns so they use the 2022 naming scheme
columns_2018_to_2022 = {
    'n_on_track': 'n_met_expectations',
    'n_mastered': 'n_exceeded_expectations',
    'pct_on_track': 'pct_met_expectations',
    'pct_mastered': 'pct_exceeded_expectations',
    'pct_on_mastered': 'pct_met_exceeded',
    'subgroup': 'student_group',
}
dis_sc18_df = dis_sc18_df.rename(columns=columns_2018_to_2022)

display(dis_sc22_df.head(2), dis_sc18_df.head(2))

Unnamed: 0,year,system,system_name,test,subject,grade,student_group,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2022,10,Anderson County,EOC,Algebra I,10,All Students,99,79,78,77,**,**,**,**,**,**,**,**,1.3
1,2022,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,*,1,1,1,*,*,*,*,*,*,*,*,*


Unnamed: 0,year,system,system_name,test,subject,grade,student_group,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2018,10,Anderson County,EOC,Algebra I,10,All Students,80,**,**,**,**,**,**,**,**,3.8
1,2018,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,1,*,*,*,*,*,*,*,*,*


In [17]:
# Show 2022 and 2013 side by side to compare their column names
display(dis_sc22_df.head(2), dis_sc13_df.head(2))

Unnamed: 0,year,system,system_name,test,subject,grade,student_group,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2022,10,Anderson County,EOC,Algebra I,10,All Students,99,79,78,77,**,**,**,**,**,**,**,**,1.3
1,2022,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,*,1,1,1,*,*,*,*,*,*,*,*,*


Unnamed: 0,year,system,system_name,subject,grade,subgroup,valid_tests,n_below_bsc,n_bsc,n_prof,n_adv,pct_below_bsc,pct_bsc,pct_prof,pct_adv,pct_bsc_and_below,pct_prof_adv
0,2013,10,Anderson County,Math,All Grades,All Students,2908,475,1036,888,509,16.4,35.6,30.5,17.5,52.0,48.0
1,2013,10,Anderson County,Math,All Grades,White,2810,451,1005,861,493,16.1,35.8,30.6,17.5,51.9,48.1


In [18]:
# Rename the 2013 columns so they use the 2022 naming scheme.
# 'pct_bsc_and_below' has no 2022 counterpart and becomes 'pct_approaching_and_below'.
columns_2013_to_2022 = {
    'n_below_bsc': 'n_below',
    'n_bsc': 'n_approaching',
    'n_prof': 'n_met_expectations',
    'n_adv': 'n_exceeded_expectations',
    'pct_below_bsc': 'pct_below',
    'pct_bsc': 'pct_approaching',
    'pct_prof': 'pct_met_expectations',
    'pct_adv': 'pct_exceeded_expectations',
    'pct_prof_adv': 'pct_met_exceeded',
    'pct_bsc_and_below': 'pct_approaching_and_below',
    'subgroup': 'student_group',
}
dis_sc13_df = dis_sc13_df.rename(columns=columns_2013_to_2022)

display(dis_sc22_df.head(2), dis_sc13_df.head(2))

Unnamed: 0,year,system,system_name,test,subject,grade,student_group,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2022,10,Anderson County,EOC,Algebra I,10,All Students,99,79,78,77,**,**,**,**,**,**,**,**,1.3
1,2022,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,*,1,1,1,*,*,*,*,*,*,*,*,*


Unnamed: 0,year,system,system_name,subject,grade,student_group,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_approaching_and_below,pct_met_exceeded
0,2013,10,Anderson County,Math,All Grades,All Students,2908,475,1036,888,509,16.4,35.6,30.5,17.5,52.0,48.0
1,2013,10,Anderson County,Math,All Grades,White,2810,451,1005,861,493,16.1,35.8,30.6,17.5,51.9,48.1


In [19]:
# Print column dtypes and non-null counts again now that the columns are renamed
for frame in (dis_sc22_df, dis_sc21_df, dis_sc18_df, dis_sc13_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167631 entries, 0 to 167630
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   year                       167631 non-null  int64 
 1   system                     167631 non-null  int64 
 2   system_name                167631 non-null  object
 3   test                       167631 non-null  object
 4   subject                    167631 non-null  object
 5   grade                      167631 non-null  object
 6   student_group              167631 non-null  object
 7   participation_rate         167631 non-null  object
 8   enrolled                   167631 non-null  int64 
 9   tested                     167631 non-null  int64 
 10  valid_tests                167631 non-null  int64 
 11  n_below                    167631 non-null  object
 12  n_approaching              167631 non-null  object
 13  n_met_expectations         167631 non-null  

In [20]:
# Count columns (n_*) that should end up with an integer dtype
int_columns = [
    'n_below',
    'n_approaching',
    'n_met_expectations',
    'n_exceeded_expectations',
]
# Percentage columns (pct_*) that should end up with a float dtype
float_columns = [
    'pct_below',
    'pct_approaching',
    'pct_met_expectations',
    'pct_exceeded_expectations',
    'pct_approaching_and_below',
    'pct_met_exceeded',
]

# If the column exists in the dataframe then convert it to the specified data type
def convert_columns_to_int_float(df, int_columns, float_columns, fill_value=0):
    """Return a copy of ``df`` with the listed columns converted to numeric dtypes.

    Values that cannot be parsed as numbers (for example the ``*`` / ``**``
    suppression markers in the assessment files) are coerced to missing and
    then handled according to ``fill_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is not modified; the conversion runs on a copy.
    int_columns : iterable of str
        Columns to convert to integers. Names absent from ``df`` are skipped.
    float_columns : iterable of str
        Columns to convert to floats. Names absent from ``df`` are skipped.
    fill_value : number or None, default 0
        Replacement for unparseable/missing values. The default of 0 keeps the
        historical behaviour, which makes a suppressed value indistinguishable
        from a real zero. Pass ``None`` to keep such values missing instead:
        integer columns then use the nullable ``Int64`` dtype and float columns
        hold NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame with the converted columns.
    """
    # Copy first so the caller's frame (and any other name bound to it) is untouched
    converted = df.copy()

    for col in int_columns:
        if col not in converted.columns:
            continue
        numeric = pd.to_numeric(converted[col], errors='coerce')
        if fill_value is None:
            # Nullable integer dtype can represent missing counts
            converted[col] = numeric.astype('Int64')
        else:
            # Plain `int` resolves to the platform integer width
            converted[col] = numeric.fillna(fill_value).astype(int)

    for col in float_columns:
        if col not in converted.columns:
            continue
        numeric = pd.to_numeric(converted[col], errors='coerce')
        if fill_value is not None:
            numeric = numeric.fillna(fill_value)
        converted[col] = numeric.astype(float)

    return converted

# Run the numeric conversion on every year's frame, rebinding each name to the result
dis_sc22_df, dis_sc21_df, dis_sc18_df, dis_sc13_df = [
    convert_columns_to_int_float(frame, int_columns, float_columns)
    for frame in (dis_sc22_df, dis_sc21_df, dis_sc18_df, dis_sc13_df)
]

# Confirm the resulting dtypes
for frame in (dis_sc22_df, dis_sc21_df, dis_sc18_df, dis_sc13_df):
    frame.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167631 entries, 0 to 167630
Data columns (total 20 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   year                       167631 non-null  int64  
 1   system                     167631 non-null  int64  
 2   system_name                167631 non-null  object 
 3   test                       167631 non-null  object 
 4   subject                    167631 non-null  object 
 5   grade                      167631 non-null  object 
 6   student_group              167631 non-null  object 
 7   participation_rate         167631 non-null  object 
 8   enrolled                   167631 non-null  int64  
 9   tested                     167631 non-null  int64  
 10  valid_tests                167631 non-null  int64  
 11  n_below                    167631 non-null  int32  
 12  n_approaching              167631 non-null  int32  
 13  n_met_expectations         16

In [21]:
# Preview the first three rows of every converted frame, oldest first
for frame in (dis_sc13_df, dis_sc18_df, dis_sc21_df, dis_sc22_df):
    display(frame.head(3))

Unnamed: 0,year,system,system_name,subject,grade,student_group,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_approaching_and_below,pct_met_exceeded
0,2013,10,Anderson County,Math,All Grades,All Students,2908,475,1036,888,509,16.4,35.6,30.5,17.5,52.0,48.0
1,2013,10,Anderson County,Math,All Grades,White,2810,451,1005,861,493,16.1,35.8,30.6,17.5,51.9,48.1
2,2013,10,Anderson County,Math,All Grades,Hispanic,16,1,4,5,6,6.2,25.0,31.3,37.5,31.2,68.8


Unnamed: 0,year,system,system_name,test,subject,grade,student_group,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2018,10,Anderson County,EOC,Algebra I,10,All Students,80,0,0,0,0,0.0,0.0,0.0,0.0,3.8
1,2018,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0
2,2018,10,Anderson County,EOC,Algebra I,10,Black/Hispanic/Native American,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,year,system,system_name,test,subject,grade,student_group,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2021,10,Anderson County,EOC,Algebra I,10,All Students,94,89,84,82,0,0,0,0,0.0,0.0,0.0,0.0,1.2
1,2021,10,Anderson County,EOC,Algebra I,10,Black or African American,*,4,4,4,0,0,0,0,0.0,0.0,0.0,0.0,0.0
2,2021,10,Anderson County,EOC,Algebra I,10,Black/Hispanic/Native American,*,9,8,7,0,0,0,0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,year,system,system_name,test,subject,grade,student_group,participation_rate,enrolled,tested,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_met_exceeded
0,2022,10,Anderson County,EOC,Algebra I,10,All Students,99,79,78,77,0,0,0,0,0.0,0.0,0.0,0.0,1.3
1,2022,10,Anderson County,EOC,Algebra I,10,American Indian or Alaska Native,*,1,1,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0
2,2022,10,Anderson County,EOC,Algebra I,10,Black or African American,*,2,2,2,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [22]:
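# NOTE: the CSV exports below are currently disabled. The Postgres import cell
# further down reads these exact file names, so the files must already exist on
# disk; uncomment these lines to regenerate them from the cleaned frames.
# dis_sc22_df.to_csv('district_scores_2022.csv', index=False)
# dis_sc21_df.to_csv('district_scores_2021.csv', index=False)
# dis_sc18_df.to_csv('district_scores_2018.csv', index=False)
# dis_sc13_df.to_csv('district_scores_2013.csv', index=False)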


PermissionError: [Errno 13] Permission denied: 'district_scores_2022.csv'

In [None]:
def import_csv_to_postgres(df, csv_file, table_name, host, port, user, password, dbname):
    """Recreate ``table_name`` in Postgres from ``df``'s schema and bulk-load ``csv_file``.

    The table is dropped if it exists, created with one column per column of
    ``df`` (type chosen from the column's pandas dtype), and then filled with
    ``COPY ... FROM STDIN WITH CSV HEADER``. ``df`` is used only to derive the
    schema; the rows come from ``csv_file``, whose first line is skipped as a
    header and whose columns must be in the same order as ``df.columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column names and dtypes define the table schema.
    csv_file : str
        Path of the CSV file to load (read as UTF-8).
    table_name : str
        Target table; a dotted ``schema.table`` name is supported.
    host, port, user, password, dbname
        Connection settings passed to ``psycopg2.connect``.

    Notes
    -----
    Identifiers are quoted via ``psycopg2.sql``, so table and column names are
    matched exactly (case-sensitively) and may contain characters that would be
    invalid unquoted. Everything runs in one transaction: on any error nothing
    is committed and the connection is still closed.
    """
    # pandas dtype name -> PostgreSQL column type. int64 maps to bigint because
    # PostgreSQL's `integer` is only 32 bits wide. Dtypes not listed fall back to text.
    data_types = {
        'int32': 'integer',
        'int64': 'bigint',
        'float64': 'double precision',
        'object': 'text'
    }

    # Quote each dotted part separately so "schema.table" keeps working
    table_ident = sql.SQL('.').join(
        sql.Identifier(part) for part in str(table_name).split('.')
    )
    column_defs = sql.SQL(', ').join(
        sql.SQL('{} {}').format(
            sql.Identifier(str(col)),
            sql.SQL(data_types.get(str(df[col].dtype), 'text')),
        )
        for col in df.columns
    )

    # Connect to the PostgreSQL database
    conn = psycopg2.connect(
        host=host,
        port=port,
        user=user,
        password=password,
        dbname=dbname
    )
    try:
        with conn.cursor() as cur:
            # Create a table with the appropriate data types based on the DataFrame
            cur.execute(sql.SQL('DROP TABLE IF EXISTS {};').format(table_ident))
            cur.execute(
                sql.SQL('CREATE TABLE {} ({});').format(table_ident, column_defs)
            )

            # Import the CSV data into the newly created table
            copy_query = sql.SQL('COPY {} FROM STDIN WITH CSV HEADER').format(table_ident)
            with open(csv_file, 'r', encoding='utf-8') as f:
                cur.copy_expert(copy_query.as_string(cur), f)

        # Commit only after the drop, create and copy have all succeeded
        conn.commit()
    finally:
        # Always release the connection; closing discards any uncommitted work
        conn.close()


In [None]:
# Connection settings shared by every import. The password is taken from the
# PGPASSWORD environment variable, or prompted for when it is not set, so that
# it is never stored in the notebook.
# NOTE(review): a password was previously committed in this cell in plain text;
# it should be treated as exposed and rotated.
pg_settings = {
    'host': 'localhost',
    'port': 5433,
    'user': 'postgres',
    'password': os.environ.get('PGPASSWORD') or getpass.getpass('Postgres password: '),
    'dbname': 'Tennessee Schools Standardized Scores',
}

# Load each year's frame/CSV pair into its own district_scores_<year> table
for frame, year in (
    (dis_sc22_df, 2022),
    (dis_sc21_df, 2021),
    (dis_sc18_df, 2018),
    (dis_sc13_df, 2013),
):
    import_csv_to_postgres(
        df=frame,
        csv_file=f'district_scores_{year}.csv',
        table_name=f'district_scores_{year}',
        **pg_settings,
    )


In [None]:
# Keep only the 2022 rows for Davidson County's Algebra I results
is_davidson = dis_sc22_df['system_name'] == 'Davidson County'
is_algebra_1 = dis_sc22_df['subject'] == 'Algebra I'
davidson_algebra_df = dis_sc22_df[is_davidson & is_algebra_1]

davidson_algebra_df.head(20)

In [None]:
# Show the distinct student_group labels found in the 2013, 2018, 2021 and 2022 frames
# Pair every dataframe with the year it covers
dfs = [
    (dis_sc13_df, 2013),
    (dis_sc18_df, 2018),
    (dis_sc21_df, 2021),
    (dis_sc22_df, 2022),
]

for df, year in dfs:
    unique_student_groups = df['student_group'].unique()

    # Single-column frame whose header names the year it belongs to
    temp_df = pd.DataFrame({f'Unique Student Groups {year}': unique_student_groups})

    # Display at most the first 15 labels
    display(temp_df.head(15))







In [None]:
# Try to determine how the ethnic groups are broken down. I think "black or African American", "Hispanic", and "Non-Black/Hispanic/Native American" make for a single choice category.

def test_hypothesis(df, year, subject='Algebra I'):
    unique_grades = df[df['subject'] == subject]['grade'].unique()
    print(f"Testing hypothesis for {year}")
    
    for grade in unique_grades:
        grade_data = df[(df['subject'] == subject) & (df['grade'] == grade)]
        all_students = grade_data[grade_data['student_group'] == 'All Students']['tested'].sum()
        black_students = grade_data[grade_data['student_group'] == 'Black or African American']['tested'].sum()
        hispanic_students = grade_data[grade_data['student_group'] == 'Hispanic']['tested'].sum()
        non_bhh_students = grade_data[grade_data['student_group'] == 'Non-Black/Hispanic/Native American']['tested'].sum()

        approx_total = black_students + hispanic_students + non_bhh_students
        
        print(f"Grade {grade}:")
        print(f"All Students: {all_students}")
        print(f"Black + Hispanic + Non-Black/Hispanic/Native American: {approx_total}")
        print(f"Difference: {abs(all_students - approx_total)}")
        print()

# Apply the test_hypothesis function to dis_sc21_df and dis_sc22_df
test_hypothesis(dis_sc21_df, 2021)
test_hypothesis(dis_sc22_df, 2022)


In [None]:
# Check if asian students are missing in the sum for the choice

def test_hypothesis_asian(df, year, subject='Algebra I'):
    unique_grades = df[df['subject'] == subject]['grade'].unique()
    print(f"Testing hypothesis for {year}")
    
    for grade in unique_grades:
        grade_data = df[(df['subject'] == subject) & (df['grade'] == grade)]
        all_students = grade_data[grade_data['student_group'] == 'All Students']['tested'].sum()
        black_students = grade_data[grade_data['student_group'] == 'Black or African American']['tested'].sum()
        hispanic_students = grade_data[grade_data['student_group'] == 'Hispanic']['tested'].sum()
        non_bhh_students = grade_data[grade_data['student_group'] == 'Non-Black/Hispanic/Native American']['tested'].sum()
        asian_students = grade_data[grade_data['student_group'] == 'Asian']['tested'].sum()

        approx_total = black_students + hispanic_students + non_bhh_students + asian_students 
        
        print(f"Grade {grade}:")
        print(f"All Students: {all_students}")
        print(f"Black + Hispanic + Non-Black/Hispanic/Native American + Asian: {approx_total}")
        print(f"Difference: {abs(all_students - approx_total)}")
        print()

# Apply the test_hypothesis_asian function to dis_sc21_df and dis_sc22_df
test_hypothesis_asian(dis_sc21_df, 2021)
test_hypothesis_asian(dis_sc22_df, 2022)

In [None]:


def test_hypothesis_english_learners(df, year, subject='Algebra I'):
    unique_grades = df[df['subject'] == subject]['grade'].unique()
    print(f"Testing hypothesis for English Learners in {year}")
    
    for grade in unique_grades:
        grade_data = df[(df['subject'] == subject) & (df['grade'] == grade)]
        all_students = grade_data[grade_data['student_group'] == 'All Students']['tested'].sum()
        english_learners = grade_data[grade_data['student_group'] == 'English Learners']['tested'].sum()
        transitional_learners = grade_data[grade_data['student_group'] == 'English Learners with Transitional 1-4']['tested'].sum()
        non_english_learners = grade_data[grade_data['student_group'] == 'Non-English Learners/Transitional 1-4']['tested'].sum()

        approx_total = english_learners + transitional_learners + non_english_learners
        
        print(f"Grade {grade}:")
        print(f"All Students: {all_students}")
        print(f"English Learners + Transitional Learners + Non-English Learners: {approx_total}")
        print(f"Difference: {abs(all_students - approx_total)}")
        print()

# Apply the test_hypothesis_english_learners function to dis_sc21_df and dis_sc22_df
test_hypothesis_english_learners(dis_sc21_df, 2021)
test_hypothesis_english_learners(dis_sc22_df, 2022)


In [None]:
# Test for the disability category

def test_hypothesis_disabilities(df, year, subject='Algebra I'):
    unique_grades = df[df['subject'] == subject]['grade'].unique()
    print(f"Testing hypothesis for Disabilities in {year}")
    
    for grade in unique_grades:
        grade_data = df[(df['subject'] == subject) & (df['grade'] == grade)]
        all_students = grade_data[grade_data['student_group'] == 'All Students']['tested'].sum()
        students_with_disabilities = grade_data[grade_data['student_group'] == 'Students with Disabilities']['tested'].sum()
        non_students_with_disabilities = grade_data[grade_data['student_group'] == 'Non-Students with Disabilities']['tested'].sum()

        approx_total = students_with_disabilities + non_students_with_disabilities
        
        print(f"Grade {grade}:")
        print(f"All Students: {all_students}")
        print(f"Students with Disabilities + Non-Students with Disabilities: {approx_total}")
        print(f"Difference: {abs(all_students - approx_total)}")
        print()

# Run the disabilities check on the 2021 and 2022 district frames
test_hypothesis_disabilities(df=dis_sc21_df, year=2021)
test_hypothesis_disabilities(df=dis_sc22_df, year=2022)


In [None]:
# Davidson, Rutherford, Williamson, Sumner, Wilson, Maury, Robertson, Dickson*, Cheatham*, Smith, and Cannon [omit Trousdale, Smith, Cannon for inconsistent and/or negligible reporting] (Nashville-Davidson-Murfreesboro-Franklin_Metro).
# Shelby, Tipton, and Fayette [omit Fayette for inconsistent and/or negligible reporting] (Memphis_Metro).
# Sullivan, Washington, and Carter [omit Hawkins] (Johnson_City-Kingsport-Bristol).
# Anderson*, Blount, Campbell*, Grainger, Knox, Loudon*, Morgan*, Roane, and Union [omit Union for inconsistent and/or negligible reporting] (Knoxville-Morristown-Sevierville-La_Follette).
# Hamilton, Marion, and Sequatchie [omit Sequatchie for inconsistent and/or negligible reporting] (Chattanooga-Cleveland-Athens).
# Montgomery and Stewart (Clarksville).

In [34]:
# Keep only the high-school rows (grades 9 through 12) in every year's frame

grades_to_include = ['9', '10', '11', '12']

# Apply the same grade filter to the 2022, 2021, 2018 and 2013 frames in one pass
dis_sc22_df, dis_sc21_df, dis_sc18_df, dis_sc13_df = (
    frame.loc[frame['grade'].isin(grades_to_include)]
    for frame in (dis_sc22_df, dis_sc21_df, dis_sc18_df, dis_sc13_df)
)

# Summarise what is left, oldest school year first
dis_sc13_df.info()
dis_sc18_df.info()
dis_sc21_df.info()
dis_sc22_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31668 entries, 360 to 92159
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   year                       31668 non-null  int64  
 1   system                     31668 non-null  int64  
 2   system_name                31668 non-null  object 
 3   subject                    31668 non-null  object 
 4   grade                      31668 non-null  object 
 5   student_group              31668 non-null  object 
 6   valid_tests                17814 non-null  object 
 7   n_below                    31668 non-null  int32  
 8   n_approaching              31668 non-null  int32  
 9   n_met_expectations         31668 non-null  int32  
 10  n_exceeded_expectations    31668 non-null  int32  
 11  pct_below                  31668 non-null  float64
 12  pct_approaching            31668 non-null  float64
 13  pct_met_expectations       31668 non-null  float6

In [37]:
# Filter the dataset for the major metropolitan areas of Tennessee.
# Counties by metro area ("omit" = excluded for inconsistent and/or negligible reporting):
#   Nashville-Davidson-Murfreesboro-Franklin_Metro: Davidson, Rutherford, Williamson, Sumner,
#       Wilson, Maury, Robertson, Dickson*, Cheatham*, Smith, Cannon [omit Trousdale, Smith, Cannon]
#   Memphis_Metro: Shelby, Tipton, Fayette [omit Fayette]
#   Johnson_City-Kingsport-Bristol: Sullivan, Washington, Carter [omit Hawkins]
#   Knoxville-Morristown-Sevierville-La_Follette: Anderson*, Blount, Campbell*, Grainger, Knox,
#       Loudon*, Morgan*, Roane, Union [omit Union]
#   Chattanooga-Cleveland-Athens: Hamilton, Marion, Sequatchie [omit Sequatchie]
#   Clarksville: Montgomery, Stewart
# NOTE(review): Smith, Cannon, Fayette and Union are flagged "omit" above but are still in the
# list below; a later cell lists Fayette and Union for removal -- confirm Smith and Cannon.
counties_to_include = ['Davidson County', 'Rutherford County', 'Williamson County', 'Sumner County', 
                       'Wilson County', 'Maury County', 'Robertson County', 'Dickson County', 
                       'Cheatham County', 'Smith County', 'Cannon County', 'Shelby County', 
                       'Tipton County', 'Fayette County', 'Sullivan County', 'Washington County', 
                       'Carter County', 'Anderson County', 'Blount County', 'Campbell County', 
                       'Grainger County', 'Knox County', 'Loudon County', 'Morgan County', 
                       'Roane County', 'Union County', 'Hamilton County', 'Marion County', 
                       'Montgomery County', 'Stewart County']

# Later cells assign columns on these frames ('valid_tests', 'metro_area'). Taking an
# explicit .copy() of each boolean-filtered result makes it an independent frame, so those
# assignments do not raise pandas' SettingWithCopyWarning.

# Filter for 2022 data
dis_sc22_df = dis_sc22_df[dis_sc22_df['system_name'].isin(counties_to_include)].copy()

# Filter for 2021 data
dis_sc21_df = dis_sc21_df[dis_sc21_df['system_name'].isin(counties_to_include)].copy()

# Filter for 2018 data
dis_sc18_df = dis_sc18_df[dis_sc18_df['system_name'].isin(counties_to_include)].copy()

# Filter for 2013 data
dis_sc13_df = dis_sc13_df[dis_sc13_df['system_name'].isin(counties_to_include)].copy()


In [40]:
# List the distinct raw 'valid_tests' entries of the 2013 frame that do not parse as numbers
non_numeric = pd.to_numeric(dis_sc13_df['valid_tests'], errors='coerce').isnull()
print(dis_sc13_df['valid_tests'][non_numeric].unique())


['*' nan]


In [42]:
# Convert 'valid_tests' in the 2013 frame to int32. Entries that do not parse as numbers
# (the '*' marker and missing values reported by the check in the previous cell) are coerced
# to NaN and then stored as 0, so 0 can mean either "no valid tests" or "not reported".
# assign() builds a new frame instead of writing into the boolean-filtered slice, which
# avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
dis_sc13_df = dis_sc13_df.assign(
    valid_tests=pd.to_numeric(dis_sc13_df['valid_tests'], errors='coerce')
    .fillna(0)
    .astype('int32')
)
dis_sc13_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8724 entries, 360 to 89687
Data columns (total 17 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   year                       8724 non-null   int64  
 1   system                     8724 non-null   int64  
 2   system_name                8724 non-null   object 
 3   subject                    8724 non-null   object 
 4   grade                      8724 non-null   object 
 5   student_group              8724 non-null   object 
 6   valid_tests                8724 non-null   int32  
 7   n_below                    8724 non-null   int32  
 8   n_approaching              8724 non-null   int32  
 9   n_met_expectations         8724 non-null   int32  
 10  n_exceeded_expectations    8724 non-null   int32  
 11  pct_below                  8724 non-null   float64
 12  pct_approaching            8724 non-null   float64
 13  pct_met_expectations       8724 non-null   float64

In [43]:
# Remove underreporting counties
counties_to_remove = ['Fayette County', 'Union County', 'Sequatchie County']

# Each frame is reassigned by name: inside a `for df in [...]` loop, `df = df[...]` only
# rebinds the loop variable and leaves the four frames unfiltered. .copy() makes every
# result an independent frame so later column assignments do not warn.
dis_sc22_df = dis_sc22_df[~dis_sc22_df['system_name'].isin(counties_to_remove)].copy()
dis_sc21_df = dis_sc21_df[~dis_sc21_df['system_name'].isin(counties_to_remove)].copy()
dis_sc18_df = dis_sc18_df[~dis_sc18_df['system_name'].isin(counties_to_remove)].copy()
dis_sc13_df = dis_sc13_df[~dis_sc13_df['system_name'].isin(counties_to_remove)].copy()


In [44]:
# County -> metropolitan area lookup, written grouped by metro area and flattened
# into one dictionary keyed by county name
metro_areas = {
    county: metro
    for metro, counties in (
        ('Nashville-Davidson-Murfreesboro-Franklin', (
            'Davidson County', 'Rutherford County', 'Williamson County', 'Sumner County',
            'Wilson County', 'Maury County', 'Robertson County', 'Dickson County',
            'Cheatham County', 'Smith County', 'Cannon County',
        )),
        ('Memphis', (
            'Shelby County', 'Tipton County',
        )),
        ('Johnson_City-Kingsport-Bristol', (
            'Sullivan County', 'Washington County', 'Carter County',
        )),
        ('Knoxville-Morristown-Sevierville-La_Follette', (
            'Anderson County', 'Blount County', 'Campbell County', 'Grainger County',
            'Knox County', 'Loudon County', 'Morgan County', 'Roane County',
        )),
        ('Chattanooga-Cleveland-Athens', (
            'Hamilton County', 'Marion County',
        )),
        ('Clarksville', (
            'Montgomery County', 'Stewart County',
        )),
    )
    for county in counties
}

# Tag every row of each year's frame with the metro area looked up from its 'system_name'
for df in (dis_sc22_df, dis_sc21_df, dis_sc18_df, dis_sc13_df):
    df['metro_area'] = df['system_name'].map(metro_areas)


# Preview the first rows of the 2013 frame with the new column
dis_sc13_df.head(n=5)

Unnamed: 0,year,system,system_name,subject,grade,student_group,valid_tests,n_below,n_approaching,n_met_expectations,n_exceeded_expectations,pct_below,pct_approaching,pct_met_expectations,pct_exceeded_expectations,pct_approaching_and_below,pct_met_exceeded,metro_area
360,2013,10,Anderson County,Algebra I,9,All Students,394,10,51,140,193,2.6,12.9,35.5,49.0,15.5,84.5,Knoxville-Morristown-Sevierville-La_Follette
361,2013,10,Anderson County,Algebra I,9,White,380,10,48,133,189,2.7,12.6,35.0,49.7,15.3,84.7,Knoxville-Morristown-Sevierville-La_Follette
362,2013,10,Anderson County,Algebra I,9,Hispanic,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,Knoxville-Morristown-Sevierville-La_Follette
363,2013,10,Anderson County,Algebra I,9,Black,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,Knoxville-Morristown-Sevierville-La_Follette
364,2013,10,Anderson County,Algebra I,9,Asian,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,Knoxville-Morristown-Sevierville-La_Follette


In [45]:
# Remove the 'metro_area' column again from every frame that currently has it
for df in (dis_sc22_df, dis_sc21_df, dis_sc18_df, dis_sc13_df):
    if 'metro_area' not in df.columns:
        continue
    df.drop(columns='metro_area', inplace=True)


In [47]:
# Lookup table of school systems: one row per distinct (system number, system name)
# pair from the 2022 frame, with the metro area mapped from the system name
metro_system_df = (
    dis_sc22_df.loc[:, ['system', 'system_name']]
    .drop_duplicates()
    .assign(metro_area=lambda systems: systems['system_name'].map(metro_areas))
)

metro_system_df.head(n=50)

 

Unnamed: 0,system,system_name,metro_area
0,10,Anderson County,Knoxville-Morristown-Sevierville-La_Follette
6751,50,Blount County,Knoxville-Morristown-Sevierville-La_Follette
13503,70,Campbell County,Knoxville-Morristown-Sevierville-La_Follette
14715,80,Cannon County,Nashville-Davidson-Murfreesboro-Franklin
19803,100,Carter County,Johnson_City-Kingsport-Bristol
22226,110,Cheatham County,Nashville-Davidson-Murfreesboro-Franklin
35103,190,Davidson County,Nashville-Davidson-Murfreesboro-Franklin
39436,220,Dickson County,Nashville-Davidson-Murfreesboro-Franklin
53181,290,Grainger County,Knoxville-Morristown-Sevierville-La_Follette
59622,330,Hamilton County,Chattanooga-Cleveland-Athens


<bound method DataFrame.dropna of         system        system_name   
0           10    Anderson County  \
6751        50      Blount County   
13503       70    Campbell County   
14715       80      Cannon County   
19803      100      Carter County   
22226      110    Cheatham County   
35103      190    Davidson County   
39436      220     Dickson County   
53181      290    Grainger County   
59622      330    Hamilton County   
78523      470        Knox County   
87060      530      Loudon County   
96250      580      Marion County   
99405      600       Maury County   
103862     630  Montgomery County   
106379     650      Morgan County   
116936     730       Roane County   
118271     740   Robertson County   
119913     750  Rutherford County   
126972     792      Shelby County   
136659     800       Smith County   
137679     810     Stewart County   
138767     820    Sullivan County   
142864     830      Sumner County   
144560     840      Tipton County   
1481

In [None]:
# ##Compress the dfs
# # Set the multi-level index for both dataframes
# dis_sc21_df.set_index(['year', 'system', 'system_name', 'subject', 'grade', 'student_group'], inplace=True)
# dis_sc22_df.set_index(['year', 'system', 'system_name', 'subject', 'grade', 'student_group'], inplace=True)

# # Concatenate the compressed dataframes
# compressed_df = pd.concat([dis_sc21_df, dis_sc22_df])

# # Reorder the index levels for better readability (optional)
# compressed_df = compressed_df.reorder_levels(['year', 'system', 'system_name', 'subject', 'grade', 'student_group']).sort_index()
