In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math as math
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from datetime import date
from datetime import datetime
from pathlib import Path 


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import tree

from sklearn import ensemble

# Table of Contents

## 1. [Helper Functions](#section_1)
Miscellaneous helper functions that need to be initialized for use later in the notebook.
## 2. [Import Data](#section_2)
Convert data from the attendance and demographics datasets into pandas DataFrame objects.
## 3. [Aggregation by Students](#section_3)
We create two aggregate dataframes in this section:
* __(A) STUDENT:__ Each row is a unique student. The centers the student attended are stored as a list of strings, with several entries if the student attended more than one center.
* __(B) STUDENT/CENTER:__ Each row is a unique combination of a student and a center they have attended. For example, if student A has attended two centers, there are two rows for student A, one per student/center combination.

## 4. [Aggregation by Date](#section_4)
We create an aggregated dataframe where each row is a unique date and center combination. For example, for a given date there should be N rows where N is the number of centers open on that day.


<a id="section_1"></a>
### Helper Functions

In [25]:
def agg_string_to_arr(arr):
    """Return the distinct items of ``arr`` as a list, in first-seen order.

    Membership is checked with ``in`` against the growing result list, so
    items only need to support equality comparison (they do not have to be
    hashable).
    """
    distinct = []
    for item in arr:
        if item in distinct:
            # Already recorded; keep only the first occurrence.
            continue
        distinct.append(item)
    return distinct

def export_to_csv(df,name):
    """Write ``df`` to ``OUTPUTCSV/<name>.csv``.

    The parent directory of the target file is created first if it does
    not exist yet. ``df`` is written with ``DataFrame.to_csv`` defaults.
    """
    relative_target = 'OUTPUTCSV/'+name+'.csv'
    target = Path(relative_target)
    # Make sure the output folder exists before writing into it.
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target)

<a id="section_2"></a>
### Import Data

In [26]:
# Attendance master CSV -> DataFrame.
att_18_19_file = 'INPUTCSVS/18-19 attendance master.csv'
att_18_19_df = pd.read_csv(att_18_19_file)

# Child demographics CSV -> DataFrame.
demo_file = 'INPUTCSVS/PvH-demographicsdash_childdata.csv'
demo_df = pd.read_csv(demo_file)

<a id="section_3"></a>
### Aggregation by Students

#### (A) STUDENT

In [27]:
def agg_by_student(attendance_df, demog_df, as_of=None):
    """Build one row per student: demographics joined with attendance totals.

    Parameters
    ----------
    attendance_df : pandas.DataFrame
        Attendance records. Must contain the columns ``ChildID``,
        ``Attendance``, ``Session`` and ``Center``. The ``Attendance`` values
        counted are 'Best Interest', 'In Attendance', 'Other', 'Sick' and
        'Unexcused Absence'; the ``Session`` values counted are 'FD' and 'PD'.
        Rows with any other value contribute to none of the counts.
    demog_df : pandas.DataFrame
        Demographic records. Must contain ``ChildID``, ``ChildEthnicity``,
        ``ChildRace``, ``ChildLanguage``, ``ChildFlags`` and
        ``ChildDateofBirth`` (parsed as ``MM/DD/YYYY``).
    as_of : datetime.date or datetime.datetime, optional
        Reference date used to compute each child's age. Defaults to the
        current date when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per ``ChildID`` present in both inputs (inner join), with the
        demographic columns, the per-status and per-session counts, the list
        of distinct centers, and the derived columns ``Total Attendance``,
        ``Total Absence``, ``Total Days`` and ``Attendance Rate``.

    Raises
    ------
    ValueError
        If a non-missing ``ChildDateofBirth`` does not match ``MM/DD/YYYY``.
    """
    #ATTENDANCE
    #----------
    status_cols = ['Best Interest', 'In Attendance', 'Other', 'Sick', 'Unexcused Absence']
    session_cols = ['FD', 'PD']

    def one_hot(column, expected):
        # get_dummies only emits a column for values that actually occur, so
        # add an all-zero column for every expected value that is absent;
        # otherwise the aggregation below fails with a KeyError.
        flags = pd.get_dummies(attendance_df[column])
        for label in expected:
            if label not in flags.columns:
                flags[label] = 0
        return flags[expected]

    attendance_clean = attendance_df.join(one_hot('Attendance', status_cols))
    attendance_clean = attendance_clean.join(one_hot('Session', session_cols))

    #Combine rows by ChildID: sum the indicator columns, collect distinct centers
    aggregations = {col: 'sum' for col in status_cols + session_cols}
    aggregations['Center'] = agg_string_to_arr
    attendance_clean = attendance_clean.groupby(['ChildID'], as_index=False).agg(aggregations)

    #Add aggregate columns: Total Attendance, Absence, and Days AND Attendance Rate
    attendance_clean['Total Attendance'] = attendance_clean['In Attendance']
    attendance_clean['Total Absence'] = (
        attendance_clean['Other']
        + attendance_clean['Sick']
        + attendance_clean['Unexcused Absence']
        + attendance_clean['Best Interest']
    )
    attendance_clean['Total Days'] = attendance_clean['Total Attendance'] + attendance_clean['Total Absence']
    attendance_clean['Attendance Rate'] = attendance_clean['Total Attendance'] / attendance_clean['Total Days']

    #DEMOGRAPHICS
    #------------
    demog_cols = ['ChildID', 'ChildEthnicity', 'ChildRace', 'ChildLanguage', 'ChildFlags', 'ChildDateofBirth']

    #Keep only rows with a date of birth; copy so the column assignments below
    #write to an independent frame instead of a slice of demog_df
    demog_clean = demog_df.loc[demog_df['ChildDateofBirth'].notna(), demog_cols].copy()

    #Add current age column
    today = datetime.today() if as_of is None else as_of

    def get_age(bday_str):
        bday = datetime.strptime(str(bday_str), "%m/%d/%Y")
        # Calendar-based age: dividing elapsed days by 365 ignores leap days
        # and can report a birthday a few days early.
        birthday_pending = (today.month, today.day) < (bday.month, bday.day)
        return today.year - bday.year - int(birthday_pending)

    demog_clean['Age (as of current day)'] = demog_clean['ChildDateofBirth'].apply(get_age)

    #Make ID# an int
    demog_clean['ChildID'] = demog_clean['ChildID'].astype(int)

    #Aggregate rows to combine duplicate ChildIDs:
    #race/flags become lists of distinct values, every other column keeps the
    #value from the child's first row
    list_cols = ['ChildRace', 'ChildFlags']
    combined_str = (
        demog_clean
        .groupby('ChildID')
        .agg({col: agg_string_to_arr for col in list_cols})
        .reset_index()
    )
    first_rows = demog_clean.drop_duplicates(subset='ChildID').drop(columns=list_cols)
    demog_clean_final = combined_str.merge(first_rows, on='ChildID', how='left')
    demog_clean_final = demog_clean_final[
        ['ChildID', 'ChildEthnicity', 'ChildLanguage', 'ChildRace', 'ChildFlags',
         'ChildDateofBirth', 'Age (as of current day)']
    ]

    #JOIN DFs for MASTER
    master = pd.merge(demog_clean_final, attendance_clean, how='inner', on='ChildID')

    return master

#### (B) STUDENT/CENTER

In [31]:
def agg_by_student_center(attendance_df, demog_df, as_of=None):
    """Build one row per (student, center): demographics plus attendance totals.

    Parameters
    ----------
    attendance_df : pandas.DataFrame
        Attendance records. Must contain the columns ``ChildID``,
        ``Attendance``, ``Session`` and ``Center``. The ``Attendance`` values
        counted are 'Best Interest', 'In Attendance', 'Other', 'Sick' and
        'Unexcused Absence'; the ``Session`` values counted are 'FD' and 'PD'.
        Rows with any other value contribute to none of the counts.
    demog_df : pandas.DataFrame
        Demographic records. Must contain ``ChildID``, ``ChildEthnicity``,
        ``ChildRace``, ``ChildLanguage``, ``ChildFlags`` and
        ``ChildDateofBirth`` (parsed as ``MM/DD/YYYY``).
    as_of : datetime.date or datetime.datetime, optional
        Reference date used to compute each child's age. Defaults to the
        current date when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per (``ChildID``, ``Center``) pair whose ``ChildID`` is
        present in both inputs (inner join), with the demographic columns,
        the per-status and per-session counts, and the derived columns
        ``Total Attendance``, ``Total Absence``, ``Total Days`` and
        ``Attendance Rate``.

    Raises
    ------
    ValueError
        If a non-missing ``ChildDateofBirth`` does not match ``MM/DD/YYYY``.
    """
    #ATTENDANCE
    #----------
    status_cols = ['Best Interest', 'In Attendance', 'Other', 'Sick', 'Unexcused Absence']
    session_cols = ['FD', 'PD']

    def one_hot(column, expected):
        # get_dummies only emits a column for values that actually occur, so
        # add an all-zero column for every expected value that is absent;
        # otherwise the aggregation below fails with a KeyError.
        flags = pd.get_dummies(attendance_df[column])
        for label in expected:
            if label not in flags.columns:
                flags[label] = 0
        return flags[expected]

    attendance_clean = attendance_df.join(one_hot('Attendance', status_cols))
    attendance_clean = attendance_clean.join(one_hot('Session', session_cols))

    #Combine rows by ChildID and Center, summing the indicator columns
    aggregations = {col: 'sum' for col in status_cols + session_cols}
    attendance_clean = attendance_clean.groupby(['ChildID', 'Center'], as_index=False).agg(aggregations)

    #Add aggregate columns: Total Attendance, Absence, and Days AND Attendance Rate
    attendance_clean['Total Attendance'] = attendance_clean['In Attendance']
    attendance_clean['Total Absence'] = (
        attendance_clean['Other']
        + attendance_clean['Sick']
        + attendance_clean['Unexcused Absence']
        + attendance_clean['Best Interest']
    )
    attendance_clean['Total Days'] = attendance_clean['Total Attendance'] + attendance_clean['Total Absence']
    attendance_clean['Attendance Rate'] = attendance_clean['Total Attendance'] / attendance_clean['Total Days']

    #DEMOGRAPHICS
    #------------
    demog_cols = ['ChildID', 'ChildEthnicity', 'ChildRace', 'ChildLanguage', 'ChildFlags', 'ChildDateofBirth']

    #Keep only rows with a date of birth; copy so the column assignments below
    #write to an independent frame instead of a slice of demog_df
    demog_clean = demog_df.loc[demog_df['ChildDateofBirth'].notna(), demog_cols].copy()

    #Add current age column
    today = datetime.today() if as_of is None else as_of

    def get_age(bday_str):
        bday = datetime.strptime(str(bday_str), "%m/%d/%Y")
        # Calendar-based age: dividing elapsed days by 365 ignores leap days
        # and can report a birthday a few days early.
        birthday_pending = (today.month, today.day) < (bday.month, bday.day)
        return today.year - bday.year - int(birthday_pending)

    demog_clean['Age (as of current day)'] = demog_clean['ChildDateofBirth'].apply(get_age)

    #Make ID# an int
    demog_clean['ChildID'] = demog_clean['ChildID'].astype(int)

    #Aggregate rows to combine duplicate ChildIDs:
    #race/flags become lists of distinct values, every other column keeps the
    #value from the child's first row
    list_cols = ['ChildRace', 'ChildFlags']
    combined_str = (
        demog_clean
        .groupby('ChildID')
        .agg({col: agg_string_to_arr for col in list_cols})
        .reset_index()
    )
    first_rows = demog_clean.drop_duplicates(subset='ChildID').drop(columns=list_cols)
    demog_clean_final = combined_str.merge(first_rows, on='ChildID', how='left')
    demog_clean_final = demog_clean_final[
        ['ChildID', 'ChildEthnicity', 'ChildLanguage', 'ChildRace', 'ChildFlags',
         'ChildDateofBirth', 'Age (as of current day)']
    ]

    #JOIN DFs for MASTER
    master = pd.merge(demog_clean_final, attendance_clean, how='inner', on='ChildID')

    return master


In [34]:
# (A) one row per student
df = agg_by_student(attendance_df=att_18_19_df, demog_df=demo_df)
# (B) one row per student/center combination
df1 = agg_by_student_center(attendance_df=att_18_19_df, demog_df=demo_df)


In [36]:
# Show the student/center table; the notebook renders a truncated preview.
df1

Unnamed: 0,ChildID,ChildEthnicity,ChildLanguage,ChildRace,ChildFlags,ChildDateofBirth,Age (as of current day),Center,Best Interest,In Attendance,Other,Sick,Unexcused Absence,FD,PD,Total Attendance,Total Absence,Total Days,Attendance Rate
0,18578,Hispanic or Latino,01 -- Spanish,[Caucasian],[Preschool],12/28/2004,17,FH,0,102.0,0,0,0,102.0,0,102.0,0,102.0,1.000000
1,22717,Hispanic or Latino,01 -- Spanish,[Caucasian],[Infant/Toddler],01/13/2008,14,FH,0,245.0,2,1,0,248.0,0,245.0,3,248.0,0.987903
2,25173,Hispanic or Latino,00 -- English,[Caucasian],[Preschool],01/06/2007,15,FH,0,242.0,2,4,0,248.0,0,242.0,6,248.0,0.975806
3,30829,Hispanic or Latino,01 -- Spanish,[Caucasian],[Infant/Toddler],11/19/2010,12,FH,0,246.0,2,0,0,248.0,0,246.0,2,248.0,0.991935
4,31583,Hispanic or Latino,00 -- English,[Caucasian],[Emergency Care],05/22/2010,12,Linda Vista,3,232.0,1,10,0,246.0,0,232.0,14,246.0,0.943089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4967,65271,Not Hispanic or Latino,00 -- English,[Caucasian],"[Admissions Agreement #15 (check box if Yes), ...",03/06/2015,7,Emerald Vista,0,2.0,0,0,0,0.0,2,2.0,0,2.0,1.000000
4968,65289,Not Hispanic or Latino,00 -- English,[Caucasian],[nan],08/15/2016,6,Esther Hobbs,0,1.0,0,0,0,1.0,0,1.0,0,1.0,1.000000
4969,65290,Hispanic or Latino,00 -- English,[nan],"[Shelter-In-Place, IFSP]",11/14/2017,5,Esther Hobbs,0,3.0,0,0,0,3.0,0,3.0,0,3.0,1.000000
4970,65301,Hispanic or Latino,01 -- Spanish,[Caucasian],"[IEP, Admissions Agreement #16 (check box if Y...",05/12/2016,6,Castlemont,0,1.0,0,0,0,1.0,0,1.0,0,1.0,1.000000


<a id="section_4"></a>
### Aggregation by Date