### Question 3:
Are arts program types and hours correlated with school performance (math and ELA test scores)?

Author: Thomas

Updates:
* Last updated on March 24, 2020
* Updated on March 23, 2020
* Updated on March 21, 2020


In [1]:
# import dependencies
import os
import pandas as pd
import numpy as np
import json

In [2]:
# CSV Files
# math and ELA results files (file names cover 2013-2019)
math_results, ela_results = (
    'data/2013-2019_school_math_results.csv',
    'data/2013-2019_school_ela_results.csv',
)
# arts survey files; each variable is named after the later year in its file name
arts_2017, arts_2018, arts_2019 = (
    'data/2016-2017_Arts_Survey_Data.csv',
    'data/2017-2018_Arts_Survey_Data.csv',
    'data/2018-2019_Arts_Survey_Data.csv',
)

In [3]:
# read the math results CSV, then report its shape, column names and the
# distinct values of the `Year` column
math_df = pd.read_csv(math_results, encoding='utf-8', low_memory=False)
print(f'Math Results Rows: {math_df.shape}')
print(list(math_df.columns))
print()
print(math_df['Year'].unique())

Math Results Rows: (32606, 18)
['Unnamed: 0', 'DBN', 'School Name', 'Grade', 'Year', 'Category', 'Number Tested', 'Mean Scale Score', '# Level 1', '% Level 1', '# Level 2', '% Level 2', '# Level 3', '% Level 3', '# Level 4', '% Level 4', '# Level 3+4', '% Level 3+4']

[2013 2014 2015 2016 2017 2018 2019]


In [4]:
# drop unneeded columns
# errors='ignore' keeps this cell re-runnable: once the column is gone,
# running the cell again is a no-op instead of a KeyError
math_df = math_df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# preview the first row of the math results
math_df.head(1)

Unnamed: 0,DBN,School Name,Grade,Year,Category,Number Tested,Mean Scale Score,# Level 1,% Level 1,# Level 2,% Level 2,# Level 3,% Level 3,# Level 4,% Level 4,# Level 3+4,% Level 3+4
0,01M015,P.S. 015 ROBERTO CLEMENTE,3,2013,All Students,27,277.777771,16,59.25925827,11,40.74074173,0,0,0,0,0,0


In [6]:
# read the ELA results CSV, then report its shape, column names and the
# distinct values of the `Year` column
ela_df = pd.read_csv(ela_results, encoding='utf-8', low_memory=False)
print(f'ELA Results Rows: {ela_df.shape}')
print(list(ela_df.columns))
print()
print(ela_df['Year'].unique())

ELA Results Rows: (32826, 18)
['Unnamed: 0', 'DBN', 'School Name', 'Grade', 'Year', 'Category', 'Number Tested', 'Mean Scale Score', '# Level 1', '% Level 1', '# Level 2', '% Level 2', '# Level 3', '% Level 3', '# Level 4', '% Level 4', '# Level 3+4', '% Level 3+4']

[2013 2014 2015 2016 2017 2018 2019]


In [7]:
# drop unneeded columns
# errors='ignore' keeps this cell re-runnable: once the column is gone,
# running the cell again is a no-op instead of a KeyError
ela_df = ela_df.drop(columns='Unnamed: 0', errors='ignore')

In [8]:
# preview the first row of the ELA results
ela_df.head(1)

Unnamed: 0,DBN,School Name,Grade,Year,Category,Number Tested,Mean Scale Score,# Level 1,% Level 1,# Level 2,% Level 2,# Level 3,% Level 3,# Level 4,% Level 4,# Level 3+4,% Level 3+4
0,01M015,P.S. 015 ROBERTO CLEMENTE,3,2013,All Students,27,289.2962952,14,51.85185242,11,40.74074173,2,7.407407284,0,0,2,7.407407284


In [9]:
# load each year's arts survey CSV into a dictionary keyed by year (as a string)
arts_dfs = {
    survey_year: pd.read_csv(survey_path, encoding='utf-8', low_memory=False)
    for (survey_year, survey_path) in (
        ('2017', arts_2017),
        ('2018', arts_2018),
        ('2019', arts_2019),
    )
}

In [10]:
# clean up arts data
# iterate over a snapshot of the keys so each entry can be replaced in place
for year in list(arts_dfs):
    arts_dfs[year] = (
        arts_dfs[year]
        # rename DBN column to match the test scores data
        .rename(columns={ 'Q0_DBN': 'DBN' })
        # add missing data - `Year` (the dictionary key converted to int)
        .assign(Year=int(year))
    )



In [13]:
# calculate all hours by art type for each school
#
# For every yearly arts survey frame, total the survey columns belonging to
# each art (per row) and stack the yearly tables into `hours_df`, which has
# the columns listed in `df_columns`.
df_columns = ['DBN','Year','Dance','Music','Theater','Visual Arts']

# survey question code -> grade label; only the keys are used for column
# matching below, the labels are descriptive
grades = {
    'Q12': 'GK',
    'Q13': 'G1',
    'Q14': 'G2',
    'Q15': 'G3',
    'Q16': 'G4',
    'Q17': 'G5',
    'Q19': 'G6'
}
# art -> code that identifies that art inside a survey column name
types = {
    'Dance': 'R1',
    'Music': 'R2',
    'Theater': 'R3',
    'Visual Arts': 'R4'
}

# one table per survey year; concatenated once after the loop instead of
# growing `hours_df` with a concat per iteration
yearly_hours = []

for (year, df) in arts_dfs.items():

    # columns to total for each art: the name must contain one of the grade
    # question codes and the art's code. `any` lists each column at most
    # once per art, so no column can be counted twice.
    # NOTE(review): this is a substring match - 'R1' would also match a name
    # containing e.g. 'R10'; confirm against the survey column naming
    cols = {
        art: [
            col for col in df.columns
            if code in col and any(question in col for question in grades)
        ]
        for (art, code) in types.items()
    }

    # start from the identifying columns so the row index matches `df`
    temp_df = df[['DBN','Year']].copy()

    for (art, values) in cols.items():
        # only the matched survey columns are summed (the string `DBN` column
        # is not part of the total); numeric_only=True skips any non-numeric
        # column instead of raising on newer pandas
        temp_df[art] = df[values].fillna(0).sum(axis=1, numeric_only=True)

    yearly_hours.append(temp_df[df_columns])

# keep `hours_df` defined (empty, with the expected columns) when there are
# no surveys, since pd.concat raises on an empty list
if yearly_hours:
    hours_df = pd.concat(yearly_hours)
else:
    hours_df = pd.DataFrame(columns=df_columns)

hours_df.head()

Unnamed: 0,DBN,Year,Dance,Music,Theater,Visual Arts
0,01M015,2017,162.0,252.0,145.0,120.0
1,01M019,2017,682.0,222.0,0.0,190.0
2,01M020,2017,24.0,24.0,4.0,25.0
3,01M034,2017,0.0,20.0,2.0,6.0
4,01M515,2017,0.0,0.0,0.0,0.0


In [15]:
# (rows, columns) of the combined hours table
hours_df.shape

(4513, 6)