In [None]:
import sys
sys.path.append('..')

import copy
import os
from datetime import datetime

import geopandas as gpd
import numpy as np
import pandas as pd
#import pyodbc

## database or file operator
import psycopg2

from sshtunnel import SSHTunnelForwarder
import yaml
import sqlalchemy
from sqlalchemy import *
from sqlalchemy.types import *
import sqlalchemy.types as sql_types

## plotting
import plotnine
from plotnine import *

## merging assistance
import re

## output
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Show every column and row when displaying DataFrames and never truncate cell
# contents. `None` means "no limit"; the old `-1` sentinel for max_colwidth was
# deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## background
# Shared plotnine theme added to every figure below: blank panel/legend
# backgrounds, no grid lines or axis ticks, black axis and legend text.
standard_background = theme(
    panel_background=element_blank(),
    panel_grid_major_y=element_blank(),
    axis_text_x=element_text(color="black", hjust=1, size=12),
    axis_text_y=element_text(color="black", size=12),
    legend_text=element_text(color='black', size=10),
    legend_title=element_text(color='black', size=12),
    axis_title=element_text(size=12),
    strip_text_x=element_text(size=12),
    legend_background=element_blank(),
    legend_key=element_blank(),
    panel_grid_major=element_blank(),
    panel_grid_minor=element_blank(),
    axis_ticks=element_blank(),
)


# 0. Define Postgres functions

For now, these Postgres helper functions are simply copied from another script; they should eventually be moved into a shared module.

In [None]:
def load_creds_file(name_and_path_creds_file):
    """Read a YAML credentials file and return its parsed contents.

    Parameters
    ----------
    name_and_path_creds_file : str
        Path (including file name) of the YAML file to read.

    Returns
    -------
    object
        The parsed YAML document; callers in this notebook index it like a
        nested dict (e.g. ``creds['osse_database']['user']``).
    """
    with open(name_and_path_creds_file, 'r') as stream:
        # safe_load builds only plain Python objects. A bare yaml.load(stream)
        # without a Loader warns on PyYAML 5.1+ and raises TypeError on 6.0+.
        creds_dict = yaml.safe_load(stream)

    return creds_dict


def start_sshtunnel(creds, port_number, ssh_host='10.56.6.66', ssh_port=22):
    """Build (but do not start) an SSH tunnel to the database host.

    Parameters
    ----------
    creds : dict
        Credentials mapping; reads ``creds['osse_ssh_tunnel']['ssh_username']``
        and ``creds['osse_ssh_tunnel']['pathname_privatekey']``.
    port_number : int
        Local port the tunnel binds to on ``localhost``.
    ssh_host : str, optional
        Address of the SSH server. Defaults to the previously hard-coded host.
    ssh_port : int, optional
        Port of the SSH server. Defaults to 22.

    Returns
    -------
    SSHTunnelForwarder
        The configured tunnel; the caller is responsible for calling
        ``.start()`` (and ``.stop()`` when finished).
    """
    ssh_creds = creds['osse_ssh_tunnel']
    tunnel = SSHTunnelForwarder(
        (ssh_host, ssh_port),
        ssh_username=ssh_creds['ssh_username'],
        ssh_private_key=ssh_creds['pathname_privatekey'],
        remote_bind_address=('localhost', 5432),
        local_bind_address=('localhost', port_number),  # could be any available port
    )
    return tunnel
    

## Functions for working with postgres
def startengine_psy(creds, tunnel):
    """Open a psycopg2 connection to the database through the SSH tunnel.

    Reads ``dbname``, ``user`` and ``password`` from ``creds['osse_database']``
    and connects to the tunnel's local bind host/port.
    """
    db_creds = creds['osse_database']
    return psycopg2.connect(
        dbname=db_creds['dbname'],
        user=db_creds['user'],
        password=db_creds['password'],
        host=tunnel.local_bind_host,
        port=tunnel.local_bind_port,
    )


## function for starting an alchemy connection
def startengine_alchemy(creds, tunnel):
    """Open a SQLAlchemy connection to the database through the SSH tunnel.

    Parameters
    ----------
    creds : dict
        Credentials mapping; reads ``user``, ``password`` and ``dbname`` from
        ``creds['osse_database']``.
    tunnel : SSHTunnelForwarder
        Tunnel whose ``local_bind_host`` / ``local_bind_port`` are used as the
        database host and port.

    Returns
    -------
    sqlalchemy connection
        The result of ``engine.connect()`` on a freshly created engine.
    """
    from urllib.parse import quote

    db_creds = creds['osse_database']
    # Percent-encode the user and password: characters such as '@', ':' or '/'
    # would otherwise be parsed as URL structure and break the connection string.
    engine_string = "postgresql://{user}:{password}@{host}:{port}/{db}".format(
        user=quote(str(db_creds['user']), safe=''),
        password=quote(str(db_creds['password']), safe=''),
        host=tunnel.local_bind_host,
        port=tunnel.local_bind_port,
        db=db_creds['dbname'])
    engine = create_engine(engine_string)
    alchemy_connection = engine.connect()
    return alchemy_connection

# 1. Open connection with database

In [None]:
# NOTE(review): this path is the same CSV that the last cell of this notebook writes,
# so the file may not exist yet on a first run; `attend_data` is also not referenced
# again in the cells visible here — confirm whether this read is still needed.
attend_data = pd.read_csv("../data/attendance_indiv_data.csv")


In [None]:
## load creds and start alchemy connection
## NOTE(review): absolute, machine-specific path; consider making it configurable
creds = load_creds_file("/home/jovyan/work/creds/creds_osse.yaml")
## start an ssh tunnel (local port 7015 is forwarded to the database port)
tunnel = start_sshtunnel(creds, port_number = 7015)
## the tunnel is never stopped in the visible cells; call tunnel.stop() when finished
tunnel.start()


In [None]:
## create alchemy connection (reused by every pd.read_sql_query call below)
alchemy_connection = startengine_alchemy(creds, tunnel)

# 2. Read in relevant attendance data (at student-level)

In [None]:
# Attendance summary for the three DCPS study schools, 2017-10-01 to 2018-01-20.
# One row per (usi, school, name, DOB, demographics, grade) group with:
#   present_days              - records with status code 'PF'
#   membership_days           - all records except status code 'NSD'
#   total_excusedorunexcused  - records described as 'Absent Excused Suspension'
#                               or 'Absent Fully Unexcused'
# NOTE(review): the charter query below counts a wider set of present codes and
# absence descriptions — confirm the two definitions are meant to differ.
dcps_attend_query = """
select usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, 
gender, race, "LEPIndicator", "GradeLevel",
"HomelessIndicatorOSSE", "AtRiskIndicator", "FarmsStatusSISdesc",
sum(case when "AttendanceStatusCode" ='PF' then 1 else 0 end) as present_days,
count("AttendanceStatusCode") as membership_days,
sum(case when "Attendance_Status_Desc" in ('Absent Excused Suspension', 
'Absent Fully Unexcused') then 1 else 0 end) as total_excusedorunexcused
from dcps_sy1718
where  "AttendanceStatusCode" !='NSD'
and "Enr_SchoolName" in ('Anacostia High School', 
                'Columbia Heights Education Campus', 
            'Dunbar High School')
and "AttendanceDate" between '2017-10-01' and '2018-01-20'
and dateofbirth between '1998-10-1 00:00:00' AND '2011-09-30 23:59:59'
group by usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, gender, race, 
"LEPIndicator", "GradeLevel", "HomelessIndicatorOSSE", "AtRiskIndicator", "FarmsStatusSISdesc";
"""

In [None]:
# Run the DCPS query over the tunnelled connection; result is one row per grouped student record.
dcps_attend_df = pd.read_sql_query(dcps_attend_query, alchemy_connection)


In [None]:
# Attendance summary for the three charter (PCS) study schools, 2017-10-01 to 2018-01-20.
# One row per (usi, school, name, DOB, demographics, grade) group with:
#   present_days              - records with status code 'PF', 'PPE', 'PIS' or 'PPU'
#   membership_days           - all records except status code 'NSD'
#   total_excusedorunexcused  - fully/partially excused or unexcused absences
pcs_attend_query = """
select usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, 
gender, race, "LEPIndicator", "GradeLevel",
"HomelessIndicatorOSSE", "AtRiskIndicator", "FarmsStatusSISdesc",
sum(case when "AttendanceStatusCode" ='PF' or "AttendanceStatusCode" ='PPE' or "AttendanceStatusCode" ='PIS'or "AttendanceStatusCode" ='PPU' 
		then 1 else 0 end) as present_days,
count("AttendanceStatusCode") as membership_days,
sum(case when "Attendance_Status_Desc" in ('Absent Fully Unexcused',
'Absent Partial Unexcused', 
'Absent Fully Excused',
'Absent Partial Excused') then 1 else 0 end)  as total_excusedorunexcused
from charter_sy1718
where  "AttendanceStatusCode" !='NSD'
and "Enr_SchoolName" in ('Friendship PCS - Collegiate Academy', 
                        'Paul PCS - International High School',
            'Paul PCS - Middle School')
and "AttendanceDate" between '2017-10-01' and '2018-01-20'
and dateofbirth between '1998-10-1 00:00:00' AND '2011-09-30 23:59:59'
group by usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, gender, race, 
"LEPIndicator", "GradeLevel", "HomelessIndicatorOSSE", "AtRiskIndicator", "FarmsStatusSISdesc"
"""

In [None]:
# Run the charter-school query over the same connection.
pcs_attend_df = pd.read_sql_query(pcs_attend_query, alchemy_connection)

In [None]:
# Stack the DCPS and charter extracts. ignore_index gives the combined frame a fresh,
# unique RangeIndex instead of repeating each source frame's own 0..n-1 row labels,
# which would otherwise leave duplicated labels for any later label-based operation.
attend_df = pd.concat([dcps_attend_df, pcs_attend_df], ignore_index = True)


In [None]:
# Preview the combined extract. The queries select firstname, lastname and dateofbirth,
# so this output shows identifiable student records — clear it before sharing the notebook.
attend_df.head()

# 3. Rate across sample and by school

In [None]:

# In-seat attendance (ISA) per row: share of membership days the student was present.
attend_df['studentlevel_isa'] = attend_df['present_days'].div(attend_df['membership_days'])


In [None]:
# Chronic-absence flag: 1 when counted absences are at least 10% of membership days, else 0.
absence_share = attend_df['total_excusedorunexcused'] / attend_df['membership_days']
attend_df['studentlevel_ca'] = np.where(absence_share >= 0.1, 1, 0)

In [None]:
# Number of rows in the combined extract per enrolled school.
attend_df.Enr_SchoolName.value_counts()

In [None]:
## subset to relevant grades
## Dunbar only 9th grade
## Friendship only 9th and 10th
## Grade codes are zero-padded strings everywhere else in this notebook ('06' ... '12'),
## so the Friendship filter has to accept "09"; the unpadded "9" is still accepted so
## that any row that matched before keeps matching.
attend_df_analytic = attend_df.loc[(~attend_df.Enr_SchoolName.isin(["Dunbar High School", "Friendship PCS - Collegiate Academy"])) |
                                ((attend_df.Enr_SchoolName == "Dunbar High School") & 
                                (attend_df.GradeLevel == "09")) |
                                ((attend_df.Enr_SchoolName == "Friendship PCS - Collegiate Academy") & 
                                (attend_df.GradeLevel.isin(["9", "09", "10"])))].copy()



In [None]:
## create aggregation for beau
## for each school, sum: 1) present days, 2) membership days
## to create ISA; total absences over membership days is used for 
## chronic absenteeism
## String aggregator names are used instead of np.sum / a lambda: the result is the
## same, and passing the numpy callable raises a FutureWarning on recent pandas.
## Columns are renamed through an explicit mapping rather than by position so the
## labels cannot drift if the aggregation spec is reordered.
attend_df_analytic_byschool = (attend_df_analytic
                               .groupby('Enr_SchoolName')
                               .agg({'present_days': 'sum',
                                     'membership_days': 'sum',
                                     'total_excusedorunexcused': 'sum',
                                     'usi': 'nunique'})
                               .reset_index()
                               .rename(columns = {'Enr_SchoolName': 'school',
                                                  'present_days': 'schooldays_present',
                                                  'membership_days': 'schooldays_total',
                                                  'total_excusedorunexcused': 'schooldays_absent',
                                                  'usi': 'n_students'}))
attend_df_analytic_byschool.head()

attend_df_analytic_byschool.to_csv("../data/attendance_aggregated_schoolevel.csv",
                                  index = False)

In [None]:
# NOTE(review): the extract queries in this notebook request six named schools, none of
# which is "District of Columbia International School", so this selection is expected
# to be empty here — confirm whether the check is still needed.
attend_df_analytic[attend_df_analytic.Enr_SchoolName == "District of Columbia International School"].describe()

In [None]:
# Row counts per grade code in the analytic subset (shows which codes actually occur).
attend_df_analytic.GradeLevel.value_counts()

In [None]:
## Split Columbia Heights Education Campus into a middle-school and a high-school
## entry; every other school keeps its enrolled-school name.
is_chec = attend_df_analytic.Enr_SchoolName == "Columbia Heights Education Campus"
in_ms_grades = attend_df_analytic.GradeLevel.isin(['06', '07', '08'])
in_hs_grades = attend_df_analytic.GradeLevel.isin(['09', '10', '11', '12'])
attend_df_analytic['schoolname_toanonymize'] = np.where(
    is_chec & in_ms_grades,
    'CHEC MS',
    np.where(is_chec & in_hs_grades,
             'CHEC HS',
             attend_df_analytic.Enr_SchoolName))

## check: CHEC rows that fell into neither grade band keep the full campus name
attend_df_analytic.loc[attend_df_analytic.schoolname_toanonymize == "Columbia Heights Education Campus"].head()

## exclude DCI and ungraded students
in_study_grades = attend_df_analytic.GradeLevel.isin(['06', '07', '08', '09', '10', '11', '12'])
not_dci = attend_df_analytic.Enr_SchoolName != "District of Columbia International School"
attend_df_analytic_final = attend_df_analytic.loc[in_study_grades & not_dci].copy()

In [None]:
attend_df_analytic_final.describe()

## anonymize schools: map each (split) school name to its anonymized study label
school_anon_dict = {
    "Anacostia High School": "School A (DCPS; 9-12)",
    "CHEC MS": "School B (DCPS; 6-8)",
    "CHEC HS": "School B (DCPS; 9-12)",
    "Dunbar High School": "School C (DCPS; 9)",
    "Friendship PCS - Collegiate Academy": "School D (PCS; 9-10)",
    "Paul PCS - Middle School": "School E (PCS; 6-8)",
    "Paul PCS - International High School": "School E (PCS; 9-12)",
}

attend_df_analytic_final['school_anon'] = (
    attend_df_analytic_final['schoolname_toanonymize'].replace(school_anon_dict)
)

## distinct student ids per anonymized school (indexed by school_anon, column 'usi')
n_perschool = attend_df_analytic_final.groupby('school_anon').agg({'usi': 'nunique'})
n_perschool

### Demographics for PAA abstract

In [None]:
def efficient_crosstabs(attribute_varname, grouping_varname, data):
    """Column-normalized crosstab of one attribute by group, returned transposed.

    The crosstab has the attribute's values as rows and the groups as columns,
    with each group's column summing to 1. A ``variable_name`` column holding
    ``attribute_varname`` is appended before transposing, so the returned frame
    has one row per group plus a final ``variable_name`` row, and one column
    per attribute value.
    """
    proportions = pd.crosstab(data[attribute_varname],
                              data[grouping_varname],
                              normalize="columns")
    labelled = proportions.assign(variable_name=attribute_varname)
    return labelled.T
    

In [None]:
# List the columns available for the demographic crosstabs below.
attend_df_analytic_final.columns

In [None]:
# Collapse the raw race codes into Black / Hispanic / White, with everything else as 'Other'.
race_remap_dictionary = {
    'B': "Black", 'BL': "Black",
    'HI': 'Hispanic', 'H': 'Hispanic',
    'WH': 'White', 'W': 'White',
}
# strip surrounding whitespace first so padded codes still match the dictionary keys
attend_df_analytic_final['race_nows'] = attend_df_analytic_final['race'].str.strip()
attend_df_analytic_final['race_broadercat_init'] = (
    attend_df_analytic_final['race_nows'].replace(race_remap_dictionary)
)
is_named_category = attend_df_analytic_final['race_broadercat_init'].isin(['White', 'Black', 'Hispanic'])
attend_df_analytic_final['race_broadercat'] = np.where(
    is_named_category,
    attend_df_analytic_final['race_broadercat_init'],
    'Other')

In [None]:
# Build one transposed, column-normalized crosstab per demographic variable, each
# broken out by anonymized school (see efficient_crosstabs).
vars_tocross = ['AtRiskIndicator', 'HomelessIndicatorOSSE', 'LEPIndicator', 'FarmsStatusSISdesc', 'race_broadercat']
att_crosstabs = [efficient_crosstabs(var, 'school_anon', attend_df_analytic_final) for 
                var in vars_tocross]

att_crosstabs

In [None]:
## bind the per-variable crosstabs side by side: rows are the schools plus one
## trailing 'variable_name' label row, columns are the attribute values
att_crosstabs_one = pd.concat(att_crosstabs, axis = 1)
## last row holds, for every column, the name of the variable it came from
att_crosstabs_one_columns = att_crosstabs_one.iloc[att_crosstabs_one.shape[0]-1]
## keep every school row and drop only that trailing label row. The previous slice
## stopped at shape[0]-2 and therefore also dropped the last school. Copy so the
## assignments below work on an independent frame rather than a slice.
att_crosstabs_one_toadd = att_crosstabs_one.iloc[0:att_crosstabs_one.shape[0]-1].copy()
att_crosstabs_one_cols = [str(col).lower() for col in att_crosstabs_one.columns]
## column labels become "<variable>: <lower-cased value>", e.g. 'LEPIndicator: true'
new_colnames = [a + ": " + b for a, b in zip(att_crosstabs_one_columns, att_crosstabs_one_cols)]
att_crosstabs_one_toadd.columns = new_colnames
## the label row made the frame object-typed; convert the proportions back to numbers
att_crosstabs_one_toadd[new_colnames] = att_crosstabs_one_toadd[new_colnames].apply(pd.to_numeric, errors = 'coerce')
att_crosstabs_one_toadd['school'] = att_crosstabs_one_toadd.index
att_crosstabs_one.head()

In [None]:
## attach the number of distinct students per school ('usi' column of n_perschool)
att_forgraph = att_crosstabs_one_toadd.merge(
    n_perschool,
    how = 'left',
    left_on = 'school',
    right_on = 'school_anon',
)
att_forgraph

In [None]:
## choose a few cols
cols_toshow = ['AtRiskIndicator: true', 'HomelessIndicatorOSSE: yes',
              'race_broadercat: black',
              'race_broadercat: hispanic',
              'LEPIndicator: true',
              'school', 'usi']
## long format: one row per school x attribute, keeping the at-risk share as the x variable
att_forgraph_long = pd.melt(att_forgraph[cols_toshow],
                           id_vars = ['school', 'usi', 'AtRiskIndicator: true'])
att_forgraph_long.head()
att_forgraph_long['var_descriptive'] = att_forgraph_long.variable.replace({'HomelessIndicatorOSSE: yes': 'Experiencing Homelessness',
                                                                 'race_broadercat: black': 'Black',
                                                                 'race_broadercat: hispanic': 'Hispanic',
                                                                 'LEPIndicator: true': 'Limited English Proficiency'})

## `scales` must be a keyword argument of facet_wrap; it was previously embedded in
## the formula string, where it is not a valid facet specification
att_graph = (ggplot(att_forgraph_long, aes(x = 'AtRiskIndicator: true', 
                              y = 'value', 
                              fill = 'factor(school)')) +
geom_point(size = 4, color = 'black') +
facet_wrap('~var_descriptive', scales = 'free') +
standard_background +
xlab('Proportion of students with at-risk poverty indicator') +
ylab('Proportion of students with other attribute') +
labs(fill = "School") +
scale_fill_brewer(palette = 'Reds'))

## plotnine's save takes `format`, not ggplot2's `device`; an unknown keyword is
## forwarded to matplotlib's savefig
att_graph.save(filename = "../output/attributes_comparison.pdf",
               format = "pdf",
               width = 12,
               height = 8)

In [None]:
# Scatter of homelessness share against at-risk share, one point per school,
# with point size showing the number of students.
(
    ggplot(att_forgraph,
           aes(x='AtRiskIndicator: true',
               y='HomelessIndicatorOSSE: yes',
               color='AtRiskIndicator: true'))
    + geom_point(aes(size='usi'))
    + xlim(0, 1)
    + ylim(0, 0.15)
    + xlab("Proportion of students\nwith at-risk poverty indicator")
    + ylab("Proportion of students\nexperiencing homelessness")
    + standard_background
    + guides(color=False)
    + labs(size="N students\nin school")
    + theme(legend_position=(0.3, 0.7))
)

## save as pdf

In [None]:
# Long format of all crosstab proportions: one row per school x "<variable>: <value>" column.
att_crosstabs_long = att_crosstabs_one_toadd.melt(id_vars='school')
att_crosstabs_long

In [None]:
# Stand-alone example of a single crosstab (homelessness by school), kept in
# untransposed form with the attribute as an ordinary first column.
one_crosstab = (
    pd.crosstab(attend_df_analytic_final['HomelessIndicatorOSSE'],
                attend_df_analytic_final['school_anon'],
                normalize="columns")
    .reset_index()
)

# after reset_index the first column is the attribute itself, so its label is the variable name
one_crosstab['variable_name'] = one_crosstab.columns[0]
one_crosstab.head()

In [None]:
## group by school: mean student-level ISA and chronic-absence flag per anonymized school.
## The rate columns are selected with a list: the old tuple-style selection
## (['a', 'b', 'c'] directly after groupby) is rejected by current pandas, and the
## grouping key itself must not be part of the columns being averaged.
attend_byschool = (attend_df_analytic_final
                   .groupby(['school_anon'])[['studentlevel_isa', 'studentlevel_ca']]
                   .mean()
                   .reset_index())

ca_order = attend_byschool.sort_values(by = 'studentlevel_ca', ascending = False)
isa_order = attend_byschool.sort_values(by = 'studentlevel_isa', ascending = True)


## print the rest of the schools
cols_toround = [col for col in attend_byschool.columns if col not in ["Enr_SchoolName", "school_anon"]]
cols_tomultiply = [col for col in attend_byschool.columns if col not in ["Enr_SchoolName", "school_anon"]]
cols_toround_name = [col + "_rounded" for col in cols_toround]
cols_tomultiply_name = [col + '_multiplied' for col in cols_tomultiply]
## proportions -> percentages
attend_byschool[cols_tomultiply_name] = attend_byschool[cols_toround]*100
## NOTE(review): round(-1) rounds to the nearest ten (e.g. 47.3 -> 50), not to a whole
## percent — confirm this coarsening of the bar labels is intended
attend_byschool[cols_toround_name] = attend_byschool[cols_tomultiply_name].round(-1).astype(int)

In [None]:
# Check the per-school means together with their percentage and rounded label columns.
attend_byschool.head()

In [None]:
## Horizontal bar chart of percent chronically absent, schools ordered from highest to lowest
attend_byschool['school_ordered_ca'] = (
    attend_byschool['school_anon']
    .astype('category')
    .cat.reorder_categories(ca_order.school_anon)
)

(
    ggplot(attend_byschool,
           aes(x='factor(school_ordered_ca)', y='studentlevel_ca_multiplied'))
    + geom_bar(stat="identity", fill="#2B4888", color="black")
    + xlab("")
    + standard_background
    # NOTE(review): no fill aesthetic is mapped in this plot, so the fill scale/guide
    # and legend position below presumably have no visible effect
    + scale_fill_gradient(low="darkgreen", high="firebrick")
    + ylab("Percent chronically absent\nTime frame: 10/1/2017-01/20/2018\nSource: OSSE")
    + geom_text(aes(x='factor(school_ordered_ca)',
                    y='studentlevel_ca_multiplied',
                    label='studentlevel_ca_rounded'),
                nudge_y=5)
    + coord_flip()
    + guides(fill=False)
    + theme(legend_position=(0.8, 0.8))
)

In [None]:
## repeat for ISA: schools ordered from lowest to highest in-seat attendance
attend_byschool['school_ordered_isa'] = (
    attend_byschool['school_anon']
    .astype('category')
    .cat.reorder_categories(isa_order.school_anon)
)
(
    ggplot(attend_byschool,
           aes(x='factor(school_ordered_isa)', y='studentlevel_isa_multiplied'))
    + geom_bar(stat="identity", fill="#2B4888", color="black")
    + xlab("")
    + standard_background
    + scale_fill_gradientn(colors=("firebrick", "gray", "darkgreen"), values=(0, 0.89, 1))
    + ylab("In-seat attendance rate\nTime frame: 10/1/2017-01/20/2018\nSource: OSSE")
    + geom_text(aes(x='factor(school_ordered_isa)',
                    y='studentlevel_isa_multiplied',
                    label='studentlevel_isa_rounded'),
                nudge_y=5)
    + coord_flip()
    + guides(fill=False)
    + theme(legend_position=(0.8, 0.8))
)

## Show relative to distribution across schools in district

In [None]:
# Same attendance summary as the DCPS study-school query, but for every DCPS school
# (no school filter) and restricted to grades '06'-'08'; used for the district-wide
# middle-school comparison distribution.
dcps_ms_query = """
select usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, 
gender, race, "LEPIndicator", "GradeLevel",
sum(case when "AttendanceStatusCode" ='PF' then 1 else 0 end) as present_days,
count("AttendanceStatusCode") as membership_days,
sum(case when "Attendance_Status_Desc" in ('Absent Excused Suspension', 
'Absent Fully Unexcused') then 1 else 0 end) as total_excusedorunexcused
from dcps_sy1718
where  "AttendanceStatusCode" !='NSD'
and "AttendanceDate" between '2017-10-01' and '2018-01-20'
and dateofbirth between '1998-10-1 00:00:00' AND '2011-09-30 23:59:59'
and "GradeLevel" in ('06', '07', '08')
group by usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, gender, race, 
"LEPIndicator", "GradeLevel";
"""

In [None]:
# Pull grade 6-8 attendance for all DCPS schools. The preview includes name and
# date-of-birth columns — clear the output before sharing the notebook.
dcps_ms_allschools = pd.read_sql_query(dcps_ms_query, alchemy_connection)
dcps_ms_allschools.head()

In [None]:
# Same attendance summary as the charter study-school query, but for every charter
# school (no school filter) and restricted to grades '06'-'08'.
pcs_ms_query = """
select usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, 
gender, race, "LEPIndicator", "GradeLevel",
sum(case when "AttendanceStatusCode" ='PF' or "AttendanceStatusCode" ='PPE' or "AttendanceStatusCode" ='PIS'or "AttendanceStatusCode" ='PPU' 
		then 1 else 0 end) as present_days,
count("AttendanceStatusCode") as membership_days,
sum(case when "Attendance_Status_Desc" in ('Absent Fully Unexcused',
'Absent Partial Unexcused', 
'Absent Fully Excused',
'Absent Partial Excused') then 1 else 0 end)  as total_excusedorunexcused
from charter_sy1718
where  "AttendanceStatusCode" !='NSD'
and "AttendanceDate" between '2017-10-01' and '2018-01-20'
and dateofbirth between '1998-10-1 00:00:00' AND '2011-09-30 23:59:59'
and "GradeLevel" in ('06', '07', '08')
group by usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, gender, race, 
"LEPIndicator", "GradeLevel"
"""

In [None]:
# Pull grade 6-8 attendance for all charter schools. The preview includes name and
# date-of-birth columns — clear the output before sharing the notebook.
pcs_ms_allschools = pd.read_sql_query(pcs_ms_query, alchemy_connection)
pcs_ms_allschools.head()

In [None]:
# Combine DCPS and charter middle-school rows; ignore_index avoids duplicated row
# labels from the two source frames.
ms_both_allschools = pd.concat([dcps_ms_allschools, pcs_ms_allschools], ignore_index = True)
# in-seat attendance: present days as a share of membership days
ms_both_allschools['studentlevel_isa'] = ms_both_allschools.present_days/ms_both_allschools.membership_days
# chronic-absence flag: counted absences are at least 10% of membership days
ms_both_allschools['studentlevel_ca'] = np.where(ms_both_allschools.total_excusedorunexcused/ms_both_allschools.membership_days >= 0.1, 1, 0)

In [None]:
# Mean student-level ISA and chronic-absence flag per school (grades 6-8).
# The rate columns are selected with a list; tuple-style selection after groupby is
# rejected by current pandas, and the grouping key must not be among the averaged columns.
ms_attend_byschool = (ms_both_allschools
                      .groupby(['Enr_SchoolName'])[['studentlevel_isa', 'studentlevel_ca']]
                      .mean()
                      .reset_index())

In [None]:
# Schools with the highest share of chronically absent 6th-8th graders.
ms_attend_byschool.sort_values(by = 'studentlevel_ca', ascending = False).head()

In [None]:
# Re-display the study-school rates (presumably the source of the hand-entered
# vertical-line positions in the density plots below — TODO confirm).
attend_byschool

In [None]:
# Density of school-level chronic absenteeism (grades 6-8) across all schools, with
# dashed reference lines at two hand-entered values.
(
    ggplot(ms_attend_byschool, aes(x='studentlevel_ca'))
    + geom_density(fill="gray", alpha=0.3)
    + standard_background
    + scale_x_continuous(breaks=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8))
    + xlab("Chronic absenteeism by school\n6th-8th graders")
    + geom_vline(xintercept=(0.20, 0.29),
                 linetype="dashed",
                 color="#2B4888",
                 size=2)
)

#annotate("text", x = (0.1, 0.4),
 #       y = (3, 3),
  #      label = ("School E\n(PCS; 6-8)",
   #             "School B\n(DPCS; 6-8)"),
    #    color = "#2B4888"))



In [None]:
# Density of school-level in-seat attendance (grades 6-8) across all schools, with
# dashed reference lines at two hand-entered values.
(
    ggplot(ms_attend_byschool, aes(x='studentlevel_isa'))
    + geom_density(fill="gray", alpha=0.3)
    + standard_background
    + scale_x_continuous(breaks=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8))
    + xlab("In-seat attendance by school\n6th-8th graders")
    + geom_vline(xintercept=(0.9, 0.89),
                 linetype="dashed",
                 color="#2B4888",
                 size=0.5)
)
#annotate("text", x = (0.8, 0.95),
 #       y = (10, 10),
  #      label = ("School E\n(PCS; 6-8)",
   #             "School B\n(DPCS; 6-8)"),
    #    color = "#2B4888"))

In [None]:
# Same attendance summary as the DCPS study-school query, but for every DCPS school
# (no school filter) and restricted to grades '09'-'12'.
dcps_hs_query = """
select usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, 
gender, race, "LEPIndicator", "GradeLevel",
sum(case when "AttendanceStatusCode" ='PF' then 1 else 0 end) as present_days,
count("AttendanceStatusCode") as membership_days,
sum(case when "Attendance_Status_Desc" in ('Absent Excused Suspension', 
'Absent Fully Unexcused') then 1 else 0 end) as total_excusedorunexcused
from dcps_sy1718
where  "AttendanceStatusCode" !='NSD'
and "AttendanceDate" between '2017-10-01' and '2018-01-20'
and dateofbirth between '1998-10-1 00:00:00' AND '2011-09-30 23:59:59'
and "GradeLevel" in ('09', '10', '11', '12')
group by usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, gender, race, 
"LEPIndicator", "GradeLevel";
"""

In [None]:
# Same attendance summary as the charter study-school query, but for every charter
# school (no school filter) and restricted to grades '09'-'12'.
pcs_hs_query = """
select usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, 
gender, race, "LEPIndicator", "GradeLevel",
sum(case when "AttendanceStatusCode" ='PF' or "AttendanceStatusCode" ='PPE' or "AttendanceStatusCode" ='PIS'or "AttendanceStatusCode" ='PPU' 
		then 1 else 0 end) as present_days,
count("AttendanceStatusCode") as membership_days,
sum(case when "Attendance_Status_Desc" in ('Absent Fully Unexcused',
'Absent Partial Unexcused', 
'Absent Fully Excused',
'Absent Partial Excused') then 1 else 0 end)  as total_excusedorunexcused
from charter_sy1718
where  "AttendanceStatusCode" !='NSD'
and "AttendanceDate" between '2017-10-01' and '2018-01-20'
and dateofbirth between '1998-10-1 00:00:00' AND '2011-09-30 23:59:59'
and "GradeLevel" in ('09', '10', '11', '12')
group by usi, "Enr_SchoolName",
firstname, lastname, dateofbirth, gender, race, 
"LEPIndicator", "GradeLevel"
"""

In [None]:
# Pull grade 9-12 attendance for all charter and all DCPS schools.
pcs_hs_allschools = pd.read_sql_query(pcs_hs_query, alchemy_connection)
dcps_hs_allschools = pd.read_sql_query(dcps_hs_query, alchemy_connection)
# ignore_index avoids duplicated row labels from the two source frames
hs_both_allschools = pd.concat([dcps_hs_allschools, pcs_hs_allschools], ignore_index = True)
# in-seat attendance: present days as a share of membership days
hs_both_allschools['studentlevel_isa'] = hs_both_allschools.present_days/hs_both_allschools.membership_days
# chronic-absence flag: counted absences are at least 10% of membership days
hs_both_allschools['studentlevel_ca'] = np.where(hs_both_allschools.total_excusedorunexcused/hs_both_allschools.membership_days >= 0.1, 1, 0)

In [None]:
# Mean student-level ISA and chronic-absence flag per school (grades 9-12).
# The rate columns are selected with a list; tuple-style selection after groupby is
# rejected by current pandas, and the grouping key must not be among the averaged columns.
hs_attend_byschool = (hs_both_allschools
                      .groupby(['Enr_SchoolName'])[['studentlevel_isa', 'studentlevel_ca']]
                      .mean()
                      .reset_index())

#hs_attend_byschool

# Density of school-level chronic absenteeism with dashed reference lines at
# hand-entered values.
(ggplot(hs_attend_byschool, aes(x = 'studentlevel_ca')) +
geom_density(fill = "gray", alpha = 0.3) +
standard_background +
scale_x_continuous(breaks = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8)) +
xlab("Chronic absenteeism by school\n9th-12th graders") +
geom_vline(xintercept = (0.89, 0.5, 0.74, 0.48, 0.33), linetype = "dashed", 
           color = "#2B4888",size = 2))

In [None]:
# Re-display the study-school rates for reference next to the high-school density plots.
attend_byschool

In [None]:
# Density of school-level in-seat attendance (grades 9-12) across all schools, with
# dashed reference lines at five hand-entered values.
(
    ggplot(hs_attend_byschool, aes(x='studentlevel_isa'))
    + geom_density(fill="gray", alpha=0.3)
    + standard_background
    + scale_x_continuous(breaks=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8))
    + xlab("In-seat attendance by school\n9th-12th graders")
    + geom_vline(xintercept=(0.56, 0.85, 0.73, 0.85, 0.89),
                 linetype="dashed",
                 color="#2B4888",
                 size=2)
)

In [None]:
# Summary statistics of the numeric columns for the middle- and high-school extracts
# (both are displayed because ast_node_interactivity is set to "all").
ms_both_allschools.describe()
hs_both_allschools.describe()

In [None]:
# All-grade, all-school frame; ignore_index avoids duplicated row labels from the
# middle- and high-school frames.
attend_all_schools = pd.concat([ms_both_allschools, hs_both_allschools], ignore_index = True)
attend_all_schools.describe()

### Graphs by grade (created, but not included in the pre-analysis plan)

In [None]:
## by grade level
## drop the ungraded codes C1..C5
ungraded = ["C" + str(val) for val in np.arange(1, 6)]
attend_df_graded = attend_df_analytic.loc[~attend_df_analytic.GradeLevel.isin(ungraded)].copy()
## `school_anon` was only ever added to attend_df_analytic_final; attend_df_analytic
## carries `schoolname_toanonymize` instead, so derive the anonymized label here.
## Without this the groupby below fails with a KeyError on a clean run.
attend_df_graded['school_anon'] = attend_df_graded.schoolname_toanonymize.replace(school_anon_dict)

## calculate mean by school and by grade (rate columns selected with a list; the
## tuple-style selection after groupby is rejected by current pandas)
attend_byschool_bygrade = (attend_df_graded
                           .groupby(['school_anon', 'GradeLevel'])[['studentlevel_isa', 'studentlevel_ca']]
                           .mean()
                           .reset_index())
attend_byschool_bygrade_valid = attend_byschool_bygrade[attend_byschool_bygrade.studentlevel_ca > 0].copy() # gets rid of 
## erroneous eight grader at anacostia
(ggplot(attend_byschool_bygrade_valid, aes(x = 'factor(GradeLevel)', y = 'studentlevel_ca', fill = 'studentlevel_ca')) +
geom_bar(stat = 'identity') +
facet_wrap('~school_anon', ncol = 2) +
xlab("Grade") +
standard_background +
guides(fill = False) +
scale_fill_gradient(low = "darkgreen", high = "firebrick") +
ylab("Chronic absenteeism rate\n(end of first semester; SY1718; OSSE data)") +
theme(strip_text_x = element_text(size = 8)))

In [None]:
# In-seat attendance by grade, one facet per anonymized school; bars are coloured by
# the ISA value itself.
(
    ggplot(attend_byschool_bygrade_valid,
           aes(x='factor(GradeLevel)',
               y='studentlevel_isa',
               fill='studentlevel_isa'))
    + geom_bar(stat='identity')
    + facet_wrap('~school_anon', ncol=2)
    + xlab("Grade")
    + standard_background
    + guides(fill=False)
    + scale_fill_gradientn(colors=("firebrick", "gray", "darkgreen"), values=(0, 0.89, 1))
    + ylab("In-seat attendance rate\n(end of first semester; SY1718; OSSE data)")
    + theme(strip_text_x=element_text(size=8))
)

# 4. Write to csv

In [None]:
# Persist the combined student-level frame. NOTE(review): `attend_df` still holds the
# firstname, lastname and dateofbirth columns selected by the queries, so this CSV
# contains identifiable student records — confirm it is stored and shared appropriately.
attend_df.to_csv("../data/attendance_indiv_data.csv", index = False)