# Data Cleaning
---

In [1]:
import numpy as np
import pandas as pd

### Importing FY 2020 Senior Center .csv files as dataframes

In [2]:
# Client data: read the raw CSV and keep it untouched; work on a copy
sccd_raw = pd.read_csv('raw_csv/senior_center_client_data_fy2020.csv')
sccd = sccd_raw.copy()

# Provider data: same pattern -- the raw frame is referenced again later
# (e.g. to look up addresses), so only the copy is ever modified
scpd_raw = pd.read_csv('raw_csv/senior_center_provider_data_fy2020.csv')
scpd = scpd_raw.copy()

---
### Cleaning Senior Center Provider Data

In [3]:
# Check for duplicated rows: `duplicated()` flags every row that is an exact
# repeat of an earlier row, and `sum()` counts those flags
print(f'There are {scpd.duplicated().sum()} duplicated rows in the scpd dataframe.')

There are 0 duplicated rows in the scpd dataframe.


In [4]:
print(f'The scpd dataframe currently has this shape: {scpd.shape}')

# Columns that are not needed for the rest of the cleaning/analysis.
# The names are spelled exactly as they appear in the raw file.
unused_columns = [
    'Sponsor Name', 'Program Address', 'Program Address1', 'Postcode',
    'Community Board', 'Council D istrict',
    'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
    'Fiscal Year Amount', 'FY 20 Actual Meals', 'Meal Prep', 'Meals Prep for Others',
    'Borough', 'Census Tract', 'BIN', 'BBL', 'NTA',
]
scpd = scpd.drop(columns=unused_columns)

print(f'The scpd dataframe now has this shape: {scpd.shape}')

The scpd dataframe currently has this shape: (294, 49)
The scpd dataframe now has this shape: (294, 27)


In [5]:
# Map raw column names to more intuitive ones (based on the data dictionary descriptions).
# Keys are the names as they appear in the raw file, including its own misspellings.
column_renames = {
    'Provider Name': 'Senior Center Name',
    'Borough/City': 'Borough',
    '# of Full-time Staff': 'Total FTEs',
    '# of Part-time Staff': 'Total PTEs',
    'Average Daily Participants': 'Average Daily Clients',
    'Kosher Raw Food/ Disposable': 'Kosher Meal Budget',
    'Non-Kosher Raw Food/ Disposable': 'Non-Kosher Meal Budget',
    'Total Raw Food/ Disposable': 'Total Meal Budget',
    'Expenditures per Meal for Food and Disposable': 'Average Meal Expenditure Per Client',
    'Meal Prep1': 'Meal Prep Type',
    'Prep for Others1': 'Meal Prep For Other Centers',
    'Annual Expenditures for Information and Assistance, Education and Recreation, Health Promotion': 'Total AIB-SCE-HPP Expenditures',
    'Annual Expenditures Per Client for Information and Assistance, Education and Recreation, Health Promotion': 'Average AIB-SCE-HPP Expenditures Per Client',
    'Ultilization': 'Percent Utilization',
}
scpd = scpd.rename(columns=column_renames)

print(f'The columns are now named:\n{scpd.columns}')

The columns are now named:
Index(['DFTA ID', 'Senior Center Name', 'Site Type', 'Borough',
       'Contract From Date', 'Contract To Date', 'FY 20 Budget',
       'FY 20 Reimbursement', 'Total FTEs', 'Total PTEs', 'Personnel Budget',
       'Months in HHS', 'Total FY20 Budget', 'Total FY20 Personnel Budget',
       'Total FY20 Reimbursement', 'Average Daily Clients',
       'Kosher Meal Budget', 'Non-Kosher Meal Budget', 'Total Meal Budget',
       'Average Meal Expenditure Per Client', 'Meal Prep Type',
       'Meal Prep For Other Centers', 'Total AIB-SCE-HPP Expenditures',
       'Average AIB-SCE-HPP Expenditures Per Client', 'Percent Utilization',
       'Latitude', 'Longitude'],
      dtype='object')


In [6]:
# Count the NaNs in every column, then show only the columns that have at least one
nan_counts = scpd.isna().sum()
nan_counts[nan_counts > 0]

Total FTEs                                       1
Months in HHS                                   18
Total FY20 Budget                               18
Total FY20 Personnel Budget                     18
Total FY20 Reimbursement                        18
Average Daily Clients                          120
Meal Prep Type                                  27
Meal Prep For Other Centers                     27
Average AIB-SCE-HPP Expenditures Per Client    120
Latitude                                        11
Longitude                                       11
dtype: int64

In [7]:
# Rows where 'Total FY20 Budget' is missing, shown next to the three non-"Total"
# budget/reimbursement columns and the three "Total FY20" columns for comparison
budget_cols = [
    'FY 20 Budget', 'Personnel Budget', 'FY 20 Reimbursement',
    'Total FY20 Budget', 'Total FY20 Personnel Budget', 'Total FY20 Reimbursement',
]
missing_total_budget = scpd['Total FY20 Budget'].isna()
budg_reim_nans = scpd.loc[missing_total_budget, budget_cols]
budg_reim_nans

Unnamed: 0,FY 20 Budget,Personnel Budget,FY 20 Reimbursement,Total FY20 Budget,Total FY20 Personnel Budget,Total FY20 Reimbursement
3,1289365.0,735227.66,1218745.04,,,
105,187556.0,119205.0,154187.28,,,
110,1191748.85,831404.13,1156048.0,,,
111,858483.19,587697.4,755918.89,,,
112,846919.0,362411.09,579071.58,,,
146,1043359.0,639726.27,604745.91,,,
209,1417124.0,755912.38,1306070.22,,,
210,446362.0,204597.79,420444.92,,,
211,626000.0,193153.5,411426.77,,,
250,1300429.0,796655.84,1300429.0,,,


In [8]:
# Each "Total FY20 ..." column (key) falls back to its non-"Total" counterpart (value)
# wherever the total is NaN; rows that already have a total are left untouched.
fill_dict = {
    'Total FY20 Budget': 'FY 20 Budget',
    'Total FY20 Personnel Budget': 'Personnel Budget',
    'Total FY20 Reimbursement': 'FY 20 Reimbursement'
}
fill_cols = list(fill_dict)

for total_col, fallback_col in fill_dict.items():
    # fillna aligns on the row index, so each missing total takes the value from its own row
    scpd[total_col] = scpd[total_col].fillna(scpd[fallback_col])

# Re-display the rows inspected earlier to confirm they were filled appropriately
scpd.loc[budg_reim_nans.index, list(fill_dict.values()) + fill_cols]

Unnamed: 0,FY 20 Budget,Personnel Budget,FY 20 Reimbursement,Total FY20 Budget,Total FY20 Personnel Budget,Total FY20 Reimbursement
3,1289365.0,735227.66,1218745.04,1289365.0,735227.66,1218745.04
105,187556.0,119205.0,154187.28,187556.0,119205.0,154187.28
110,1191748.85,831404.13,1156048.0,1191748.85,831404.13,1156048.0
111,858483.19,587697.4,755918.89,858483.19,587697.4,755918.89
112,846919.0,362411.09,579071.58,846919.0,362411.09,579071.58
146,1043359.0,639726.27,604745.91,1043359.0,639726.27,604745.91
209,1417124.0,755912.38,1306070.22,1417124.0,755912.38,1306070.22
210,446362.0,204597.79,420444.92,446362.0,204597.79,420444.92
211,626000.0,193153.5,411426.77,626000.0,193153.5,411426.77
250,1300429.0,796655.84,1300429.0,1300429.0,796655.84,1300429.0


In [9]:
# The non-"Total" budget/reimbursement columns have been folded into the
# "Total FY20" columns, so they are now redundant
redundant_cols = ['FY 20 Budget', 'Personnel Budget', 'FY 20 Reimbursement']
scpd = scpd.drop(columns=redundant_cols)

print(f'The scpd dataframe now has this shape: {scpd.shape}')

# Shorten the remaining names to be more intuitive (based on data dictionary descriptions)
budget_renames = {
    'Total FY20 Budget': 'Total Budget',
    'Total FY20 Personnel Budget': 'Total Personnel Budget',
    'Total FY20 Reimbursement': 'Total Expenses',
}
scpd = scpd.rename(columns=budget_renames)

The scpd dataframe now has this shape: (294, 24)


In [10]:
# Rows of the raw dataframe where BOTH 'Latitude' and 'Longitude' are NaN, together
# with the provider name and program address columns (which were dropped from scpd)
address_cols = ['Provider Name', 'Program Address', 'Program Address1', 'Latitude', 'Longitude']
missing_coords = scpd_raw['Latitude'].isna() & scpd_raw['Longitude'].isna()
lat_long_nans = scpd_raw.loc[missing_coords, address_cols]
lat_long_nans

Unnamed: 0,Provider Name,Program Address,Program Address1,Latitude,Longitude
102,ARROCHAR NEIGHBORHOOD SENIOR CENTER,44 BIONA AVENUE,,,
126,MORRISANIA AIR RIGHTS SOCIAL CLUB,3135 PARK AVENUE EAST,,,
139,RAIN MT CARMEL NEIGHBORHOOD SENIOR CTR,2405 SOUTHERN BOLULEVARD,,,
169,HUGH GILROY NEIGHBORHOOD SENIOR CENTER,447 KINGSBOROUGH,,,
179,HOMECREST BENSONHURST NEIGHBORHOOD SENIOR CENTER,BROOKLYN,,,
192,CCNS ST CHARLES NEIGHBORHOOD SENIOR CENT,55 PIERREPORT STREET,,,
214,POLO GROUNDS,2965 FREDERICK DOUGLASS BOULEVARD,,,
228,JEFFERSON HOUSES NSC,2205 FIST AVENUE,,,
231,FOOD BANK NEIGHBORHOOD SENIOR CENTER,WEST 116TH STREET,,,
260,HANAC RAVENSWOOD NSC,34-35A 12TH STREET,12TH STREET,,


In [11]:
# Hard-coded [latitude, longitude] pairs keyed by row index label.
# NOTE(review): presumably looked up by hand from the program addresses shown above -- confirm.
fill_dict = {
    102: [40.596868706020445, -74.07555607642975],
    126: [40.82651347755042, -73.91661122662934],
    139: [40.85396935386981, -73.88190264660648],
    169: [40.67530765760249, -73.92446824134545],
    179: [40.602035938136076, -73.95715926297987],
    192: [40.696071010269755, -73.99506336199069],
    214: [40.830937975634, -73.93751073154422],
    228: [40.79462754058644, -73.93706847434424],
    231: [40.80412527380681, -73.95489674318942],
    260: [40.76252319677368, -73.93704370456038],
    273: [40.72522541418266, -73.76483607547621]
}
fill_cols = ['Latitude', 'Longitude']

# One row per index label, one column per coordinate
fill_df = pd.DataFrame.from_dict(fill_dict, orient='index', columns=fill_cols)

for coord in fill_cols:
    # fillna aligns on the row index, so only NaN cells whose label is in fill_df are filled
    scpd[coord] = scpd[coord].fillna(fill_df[coord])

In [12]:
# Show the meal budget & expenditure columns for rows whose total meal budget is $0.00
meal_cols = ['Kosher Meal Budget', 'Non-Kosher Meal Budget', 'Total Meal Budget', 'Average Meal Expenditure Per Client']
zero_meal_budget = scpd['Total Meal Budget'].eq(0)
scpd.loc[zero_meal_budget, meal_cols]

Unnamed: 0,Kosher Meal Budget,Non-Kosher Meal Budget,Total Meal Budget,Average Meal Expenditure Per Client
5,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0
25,0.0,0.0,0.0,0.0
31,0.0,0.0,0.0,0.0
56,0.0,0.0,0.0,0.0
66,0.0,0.0,0.0,0.0
83,0.0,0.0,0.0,0.0
126,0.0,0.0,0.0,0.0
165,0.0,0.0,0.0,0.0
167,0.0,0.0,0.0,0.0


In [13]:
# Replace zeroes in the meal budget/expenditure columns and in
# 'Total AIB-SCE-HPP Expenditures' with NaN, so a $0.00 entry is treated as
# missing rather than as a real amount.
#
# The replacement is applied to the columns themselves. The earlier version
# assigned into the zero rows from a selection of the rows that were already
# NaN; the zeroes only became NaN because the two selections share no index
# labels, and its `.replace(0, np.nan)` never saw a zero.
fill_cols = ['Kosher Meal Budget', 'Non-Kosher Meal Budget', 'Total Meal Budget', 'Average Meal Expenditure Per Client', 'Total AIB-SCE-HPP Expenditures']

scpd[fill_cols] = scpd[fill_cols].replace(0, np.nan)

In [14]:
# Treat a missing 'Total FTEs' or 'Months in HHS' value as 0
# (a per-column fill value leaves every other column untouched)
scpd = scpd.fillna({'Total FTEs': 0, 'Months in HHS': 0})

In [15]:
# Title-case the text columns; NaN entries stay NaN
text_columns = ['Senior Center Name', 'Site Type', 'Borough', 'Meal Prep Type']
scpd[text_columns] = scpd[text_columns].apply(lambda column: column.str.title())

In [16]:
# A missing 'Meal Prep Type' is recorded as 'Not Applicable'
scpd['Meal Prep Type'] = scpd['Meal Prep Type'].fillna('Not Applicable')

In [17]:
# A missing 'Meal Prep For Other Centers' is recorded as 'No'
scpd['Meal Prep For Other Centers'] = scpd['Meal Prep For Other Centers'].fillna('No')

In [18]:
# Both columns had their NaNs filled with 0 above, so they can be cast to integer
scpd = scpd.astype({'Total FTEs': int, 'Months in HHS': int})

In [19]:
# Re-count the NaNs per column after cleaning and show only the columns that still have any
remaining_nans = scpd.isna().sum()
remaining_nans[remaining_nans > 0]

Average Daily Clients                          120
Kosher Meal Budget                             273
Non-Kosher Meal Budget                          39
Total Meal Budget                               18
Average Meal Expenditure Per Client             20
Total AIB-SCE-HPP Expenditures                   2
Average AIB-SCE-HPP Expenditures Per Client    120
dtype: int64

### Derive and add columns

In [20]:
# Add five derived columns to the scpd dataframe.
# The three "% Budget ..." columns are stored as fractions of 'Total Budget'
# (e.g. 0.60), not as values multiplied by 100.
total_budget = scpd['Total Budget']

# Headcount and a simple flag for whether a center has any part-time staff
scpd['Total Employees'] = scpd['Total FTEs'].add(scpd['Total PTEs'])
scpd['PTE Status'] = scpd['Total PTEs'].eq(0).map({True: 'Has No PTEs', False: 'Has PTEs'})

# Share of the total budget going to personnel, meals, and AIB/SCE/HPP services
scpd['% Budget Allocated for Personnel'] = scpd['Total Personnel Budget'].div(total_budget)
scpd['% Budget Allocated for Meals'] = scpd['Total Meal Budget'].div(total_budget)
scpd['% Budget Used for AIB, SCE, & HPP Services'] = scpd['Total AIB-SCE-HPP Expenditures'].div(total_budget)

### Check the final cleaned dataframe

In [21]:
# Report the final shape and column names of the cleaned provider dataframe
print(f'The cleaned scpd dataframe now has this shape: {scpd.shape}\n')
print(f'The columns are now named:\n{scpd.columns}')

The cleaned scpd dataframe now has this shape: (294, 29)

The columns are now named:
Index(['DFTA ID', 'Senior Center Name', 'Site Type', 'Borough',
       'Contract From Date', 'Contract To Date', 'Total FTEs', 'Total PTEs',
       'Months in HHS', 'Total Budget', 'Total Personnel Budget',
       'Total Expenses', 'Average Daily Clients', 'Kosher Meal Budget',
       'Non-Kosher Meal Budget', 'Total Meal Budget',
       'Average Meal Expenditure Per Client', 'Meal Prep Type',
       'Meal Prep For Other Centers', 'Total AIB-SCE-HPP Expenditures',
       'Average AIB-SCE-HPP Expenditures Per Client', 'Percent Utilization',
       'Latitude', 'Longitude', 'Total Employees', 'PTE Status',
       '% Budget Allocated for Personnel', '% Budget Allocated for Meals',
       '% Budget Used for AIB, SCE, & HPP Services'],
      dtype='object')


In [22]:
# Summary statistics for the cleaned dataframe, as a sanity check on value ranges
scpd.describe()

Unnamed: 0,Total FTEs,Total PTEs,Months in HHS,Total Budget,Total Personnel Budget,Total Expenses,Average Daily Clients,Kosher Meal Budget,Non-Kosher Meal Budget,Total Meal Budget,Average Meal Expenditure Per Client,Total AIB-SCE-HPP Expenditures,Average AIB-SCE-HPP Expenditures Per Client,Latitude,Longitude,Total Employees,% Budget Allocated for Personnel,% Budget Allocated for Meals,"% Budget Used for AIB, SCE, & HPP Services"
count,294.0,294.0,294.0,294.0,294.0,294.0,174.0,21.0,255.0,276.0,274.0,292.0,174.0,294.0,294.0,294.0,294.0,276.0,292.0
mean,6.829932,2.744898,6.758503,600119.9,352200.7,473044.2,84.718391,253030.688095,269520.634745,268265.964891,9.817591,199473.516233,3164.909713,40.732432,-73.926244,9.57483,0.60061,0.441526,0.353668
std,4.822267,3.760704,1.998168,268010.5,161420.2,245570.9,55.133322,100401.115498,137590.941155,135047.807388,5.804748,111439.382187,3915.332542,0.086868,0.076484,4.599248,0.130942,0.196045,0.178088
min,0.0,0.0,0.0,20665.71,-161007.8,0.0,1.0,73303.12,9168.31,9168.31,1.41,11057.28,621.47,40.509084,-74.22093,0.0,-0.21755,0.037176,0.058955
25%,3.0,0.0,7.0,462423.9,267327.5,343671.0,52.25,177761.01,176885.02,177261.79,7.06,128422.4075,1741.8475,40.669448,-73.970864,7.0,0.529189,0.339183,0.252292
50%,7.0,1.0,7.0,576771.4,341735.7,461874.8,73.5,245861.79,252957.56,250572.07,8.96,183894.435,2479.845,40.723679,-73.935417,9.0,0.611498,0.436065,0.310457
75%,10.0,4.0,7.0,716751.0,428916.3,587511.8,101.75,311860.81,337383.225,334809.3225,11.4675,242834.3225,3348.0675,40.811483,-73.885033,11.0,0.67066,0.517308,0.409316
max,24.0,16.0,12.0,1648998.0,1035410.0,1419843.0,319.0,485140.23,769929.36,769929.36,83.46,856978.66,48361.29,40.905009,-73.723122,31.0,1.039392,2.569692,1.224834


In [23]:
# Take a final look at the cleaned scpd dataframe
# (the notebook display truncates the middle rows and columns)
scpd

Unnamed: 0,DFTA ID,Senior Center Name,Site Type,Borough,Contract From Date,Contract To Date,Total FTEs,Total PTEs,Months in HHS,Total Budget,...,Total AIB-SCE-HPP Expenditures,Average AIB-SCE-HPP Expenditures Per Client,Percent Utilization,Latitude,Longitude,Total Employees,PTE Status,% Budget Allocated for Personnel,% Budget Allocated for Meals,"% Budget Used for AIB, SCE, & HPP Services"
0,12E01,Mosholu Montefiore Neighborhood Sr Ct,Neighborhood Senior Centers,Bronx,12/01/2019,06/30/2020,7,0,7,552097.71,...,107854.43,1609.77,100%+,40.881486,-73.881226,7,Has No PTEs,0.578655,0.284006,0.195354
1,15E01,The Neighborhood Self-Help By Older Persons Pr...,Senior Center,Bronx,07/01/2019,06/30/2020,4,12,12,967332.00,...,395854.81,3534.42,100%+,40.834198,-73.890157,16,Has PTEs,0.713406,0.336692,0.409223
2,22E01,Young Israel Sr Services Nbh Sr Ctr,Neighborhood Senior Centers,Brooklyn,12/01/2019,06/30/2020,0,10,7,644322.86,...,172093.41,1422.26,100%+,40.620447,-73.955405,10,Has PTEs,0.555704,0.491929,0.267092
3,35E01,The Educational Alliance Inc,Innovative Senior Centers,Manhattan,07/01/2019,06/30/2020,9,12,0,1289365.00,...,547321.42,4024.42,100%+,40.730657,-73.985041,21,Has PTEs,0.570225,0.502602,0.424489
4,12E02,Fort Independence Social Club,Neighborhood Senior Centers,Bronx,12/01/2019,06/30/2020,8,0,7,319129.71,...,56926.81,1674.32,100%+,40.880093,-73.900742,8,Has No PTEs,0.655196,0.280142,0.178381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,4A201,Pomonok Neighborhood Senior Center,Neighborhood Senior Centers,Queens,12/01/2019,06/30/2020,11,0,7,583805.14,...,138094.35,1569.25,100%+,40.735033,-73.814726,11,Has No PTEs,0.579582,0.516301,0.236542
290,50A01,Todt Hill Neighborhood Senior Center,Neighborhood Senior Centers,Staten Island,12/01/2019,06/30/2020,12,3,7,467674.35,...,93979.28,2473.14,100%+,40.609058,-74.119053,15,Has PTEs,0.829860,0.582007,0.200950
291,50B01,Mount Loretto Neighborhood Senior Cen,Neighborhood Senior Centers,Staten Island,12/01/2019,06/30/2020,13,4,7,773648.57,...,159727.36,1879.15,56%,40.509084,-74.220930,17,Has PTEs,0.605659,0.624975,0.206460
292,50R01,Cassidy Coles Neighborhood Senior Cent,Neighborhood Senior Centers,Staten Island,12/01/2019,06/30/2020,5,7,7,668334.86,...,187382.39,2204.50,100%+,40.641888,-74.096489,12,Has PTEs,0.559307,0.485323,0.280372
