# Data Cleaning
---

In [1]:
import pandas as pd

### Importing FY 2020 Senior Center .csv files as dataframes

In [2]:
# Read the FY 2020 client and provider exports; the *_raw frames are kept as loaded
sccd_raw = pd.read_csv('raw_csv/senior_center_client_data_fy2020.csv')
scpd_raw = pd.read_csv('raw_csv/senior_center_provider_data_fy2020.csv')

# Working copies for cleaning, so the raw frames stay available for comparison
sccd, scpd = sccd_raw.copy(), scpd_raw.copy()

---
### Cleaning Senior Center Provider Data

In [3]:
# Count rows that exactly repeat an earlier row across every column
duplicate_row_count = scpd.duplicated().sum()
print(f'There are {duplicate_row_count} duplicated rows in the scpd dataframe.')

There are 0 duplicated rows in the scpd dataframe.


In [4]:
# Columns that will not be used. The raw 'Borough' column is dropped here; the
# rename cell that follows turns 'Borough/City' into the 'Borough' column.
# NOTE(review): 'Council D istrict' is written with the embedded space on purpose.
# The recorded run drops all 24 names (49 -> 25 columns), so it appears to match
# the raw header; confirm against the CSV before "correcting" the spelling.
scpd_unused_cols = [
    'Sponsor Name', 'Program Address', 'Program Address1', 'Postcode',
    'Community Board', 'Council D istrict',
    'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
    'Fiscal Year Amount', 'FY 20 Actual Meals', 'Meal Prep', 'Meals Prep for Others',
    'Borough', 'Latitude', 'Longitude', 'Census Tract', 'BIN', 'BBL', 'NTA',
]

# Report the shape on both sides of the drop so the column count can be checked
print(f'The scpd dataframe currently has this shape: {scpd.shape}')
scpd.drop(columns=scpd_unused_cols, inplace=True)
print(f'The scpd dataframe now has this shape: {scpd.shape}')

The scpd dataframe currently has this shape: (294, 49)
The scpd dataframe now has this shape: (294, 25)


In [5]:
# More intuitive column names, based on the data dictionary descriptions.
# Keys have to match the existing headers character for character (the recorded
# output shows 'Ultilization' being renamed, so that spelling is the header's own);
# rename() skips keys it cannot find instead of raising, so a mistyped key would
# silently leave its column unrenamed.
scpd_column_renames = {
    'Provider Name': 'Senior Center Name',
    'Borough/City': 'Borough',
    '# of Full-time Staff': 'Total FTEs',
    '# of Part-time Staff': 'Total PTEs',
    'Average Daily Participants': 'Average Daily Clients',
    'Kosher Raw Food/ Disposable': 'Kosher Meal Budget',
    'Non-Kosher Raw Food/ Disposable': 'Non-Kosher Meal Budget',
    'Total Raw Food/ Disposable': 'Total Meal Budget',
    'Expenditures per Meal for Food and Disposable': 'Average Meal Expenditure Per Client',
    'Meal Prep1': 'Meal Prep Type',
    'Prep for Others1': 'Meal Prep For Other Centers',
    'Annual Expenditures for Information and Assistance, Education and Recreation, Health Promotion': 'Total AIB-SCE-HPP Expenditures',
    'Annual Expenditures Per Client for Information and Assistance, Education and Recreation, Health Promotion': 'Average AIB-SCE-HPP Expenditures Per Client',
    'Ultilization': 'Percent Utilization',
}
scpd.rename(columns=scpd_column_renames, inplace=True)

print(f'The columns are now named:\n{scpd.columns}')

The columns are now named:
Index(['DFTA ID', 'Senior Center Name', 'Site Type', 'Borough',
       'Contract From Date', 'Contract To Date', 'FY 20 Budget',
       'FY 20 Reimbursement', 'Total FTEs', 'Total PTEs', 'Personnel Budget',
       'Months in HHS', 'Total FY20 Budget', 'Total FY20 Personnel Budget',
       'Total FY20 Reimbursement', 'Average Daily Clients',
       'Kosher Meal Budget', 'Non-Kosher Meal Budget', 'Total Meal Budget',
       'Average Meal Expenditure Per Client', 'Meal Prep Type',
       'Meal Prep For Other Centers', 'Total AIB-SCE-HPP Expenditures',
       'Average AIB-SCE-HPP Expenditures Per Client', 'Percent Utilization'],
      dtype='object')


In [6]:
# Total NaNs per column, showing only the columns that contain at least one
nan_counts = scpd.isna().sum()
nan_counts.loc[nan_counts > 0]

Total FTEs                                       1
Months in HHS                                   18
Total FY20 Budget                               18
Total FY20 Personnel Budget                     18
Total FY20 Reimbursement                        18
Average Daily Clients                          120
Meal Prep Type                                  27
Meal Prep For Other Centers                     27
Average AIB-SCE-HPP Expenditures Per Client    120
dtype: int64

In [7]:
# Pull the rows that have a NaN in any of 'Total FY20 Budget', 'Total FY20 Personnel Budget',
# or 'Total FY20 Reimbursement', shown next to the columns that can be used to fill them.
# The mask looks at all three totals rather than 'Total FY20 Budget' alone, so the index
# kept in budg_reim_nans covers every row that needs a fill in any of these columns.
fy20_source_cols = ['FY 20 Budget', 'Personnel Budget', 'FY 20 Reimbursement']
fy20_total_cols = ['Total FY20 Budget', 'Total FY20 Personnel Budget', 'Total FY20 Reimbursement']

budg_reim_nans = scpd.loc[scpd[fy20_total_cols].isna().any(axis=1), fy20_source_cols + fy20_total_cols]
budg_reim_nans

Unnamed: 0,FY 20 Budget,Personnel Budget,FY 20 Reimbursement,Total FY20 Budget,Total FY20 Personnel Budget,Total FY20 Reimbursement
3,1289365.0,735227.66,1218745.04,,,
105,187556.0,119205.0,154187.28,,,
110,1191748.85,831404.13,1156048.0,,,
111,858483.19,587697.4,755918.89,,,
112,846919.0,362411.09,579071.58,,,
146,1043359.0,639726.27,604745.91,,,
209,1417124.0,755912.38,1306070.22,,,
210,446362.0,204597.79,420444.92,,,
211,626000.0,193153.5,411426.77,,,
250,1300429.0,796655.84,1300429.0,,,


In [8]:
# Fill NaNs in 'Total FY20 Budget', 'Total FY20 Personnel Budget', and 'Total FY20 Reimbursement'
# with the value from the same row of 'FY 20 Budget', 'Personnel Budget', and
# 'FY 20 Reimbursement' respectively (target column -> column to fill from).
fill_dict = {
    'Total FY20 Budget': 'FY 20 Budget',
    'Total FY20 Personnel Budget': 'Personnel Budget',
    'Total FY20 Reimbursement': 'FY 20 Reimbursement'
}
# Derived from the mapping so the list of targets cannot drift away from it
fill_cols = list(fill_dict)

# fillna with a Series aligns on the row index and only replaces NaN entries,
# so no explicit NaN mask is needed and already-populated values are left alone
for total_col, source_col in fill_dict.items():
    scpd[total_col] = scpd[total_col].fillna(scpd[source_col])

# Check to see that those specific rows have been filled appropriately
# (source columns first, then the filled totals)
scpd.loc[budg_reim_nans.index, list(fill_dict.values()) + fill_cols]

Unnamed: 0,FY 20 Budget,Personnel Budget,FY 20 Reimbursement,Total FY20 Budget,Total FY20 Personnel Budget,Total FY20 Reimbursement
3,1289365.0,735227.66,1218745.04,1289365.0,735227.66,1218745.04
105,187556.0,119205.0,154187.28,187556.0,119205.0,154187.28
110,1191748.85,831404.13,1156048.0,1191748.85,831404.13,1156048.0
111,858483.19,587697.4,755918.89,858483.19,587697.4,755918.89
112,846919.0,362411.09,579071.58,846919.0,362411.09,579071.58
146,1043359.0,639726.27,604745.91,1043359.0,639726.27,604745.91
209,1417124.0,755912.38,1306070.22,1417124.0,755912.38,1306070.22
210,446362.0,204597.79,420444.92,446362.0,204597.79,420444.92
211,626000.0,193153.5,411426.77,626000.0,193153.5,411426.77
250,1300429.0,796655.84,1300429.0,1300429.0,796655.84,1300429.0


In [9]:
# Drop the budget and reimbursement columns that are treated as redundant now that
# the NaNs in the 'Total FY20 ...' columns have been filled from them
redundant_budget_cols = ['FY 20 Budget', 'Personnel Budget', 'FY 20 Reimbursement']
scpd.drop(columns=redundant_budget_cols, inplace=True)

print(f'The scpd dataframe now has this shape: {scpd.shape}')

# Rename the remaining totals to be more intuitive (based on data dictionary descriptions)
total_col_renames = {
    'Total FY20 Budget': 'Total Budget',
    'Total FY20 Personnel Budget': 'Total Personnel Budget',
    'Total FY20 Reimbursement': 'Total Expenses',
}
scpd.rename(columns=total_col_renames, inplace=True)

The scpd dataframe now has this shape: (294, 22)


In [10]:
# Replace NaNs in 'Total FTEs' and 'Months in HHS' with 0, both columns in one assignment
# NOTE(review): the 0 stands in for a missing entry rather than a reported value —
# confirm that is the intended reading for 'Months in HHS'
zero_fill_cols = ['Total FTEs', 'Months in HHS']
scpd[zero_fill_cols] = scpd[zero_fill_cols].fillna(0)

In [11]:
# Title-case the string columns so their capitalisation is consistent
text_cols = ['Senior Center Name', 'Site Type', 'Borough', 'Meal Prep Type']
scpd[text_cols] = scpd[text_cols].apply(lambda column: column.str.title())

In [12]:
# Re-check which columns still contain NaNs, with the NaN total for each
scpd.isna().sum().loc[lambda counts: counts > 0]

Average Daily Clients                          120
Meal Prep Type                                  27
Meal Prep For Other Centers                     27
Average AIB-SCE-HPP Expenditures Per Client    120
dtype: int64

### Derive and add columns

In [13]:
# Staff headcount per row: the full-time count ('Total FTEs') plus the part-time count ('Total PTEs')
scpd['Total Employees'] = scpd['Total FTEs'].add(scpd['Total PTEs'])

In [14]:
# Take a final look at the cleaned scpd dataframe (shown as this cell's output)

scpd

# Optional spot check, left disabled: print the value counts of every column in turn
# for col in scpd.columns:
#     print(f'{scpd.value_counts(col)}\n')

Unnamed: 0,DFTA ID,Senior Center Name,Site Type,Borough,Contract From Date,Contract To Date,Total FTEs,Total PTEs,Months in HHS,Total Budget,...,Kosher Meal Budget,Non-Kosher Meal Budget,Total Meal Budget,Average Meal Expenditure Per Client,Meal Prep Type,Meal Prep For Other Centers,Total AIB-SCE-HPP Expenditures,Average AIB-SCE-HPP Expenditures Per Client,Percent Utilization,Total Employees
0,12E01,Mosholu Montefiore Neighborhood Sr Ct,Neighborhood Senior Centers,Bronx,12/01/2019,06/30/2020,7.0,0,7.0,552097.71,...,0.0,156799.09,156799.09,7.38,Meal Prep On-Site,Yes,107854.43,1609.77,100%+,7.0
1,15E01,The Neighborhood Self-Help By Older Persons Pr...,Senior Center,Bronx,07/01/2019,06/30/2020,4.0,12,12.0,967332.00,...,0.0,325693.31,325693.31,8.12,Meal Prep On-Site,No,395854.81,3534.42,100%+,16.0
2,22E01,Young Israel Sr Services Nbh Sr Ctr,Neighborhood Senior Centers,Brooklyn,12/01/2019,06/30/2020,0.0,10,7.0,644322.86,...,0.0,316960.92,316960.92,10.13,Catered,No,172093.41,1422.26,100%+,10.0
3,35E01,The Educational Alliance Inc,Innovative Senior Centers,Manhattan,07/01/2019,06/30/2020,9.0,12,0.0,1289365.00,...,0.0,648037.47,648037.47,8.91,Meal Prep On-Site,No,547321.42,4024.42,100%+,21.0
4,12E02,Fort Independence Social Club,Neighborhood Senior Centers,Bronx,12/01/2019,06/30/2020,8.0,0,7.0,319129.71,...,0.0,89401.71,89401.71,7.15,Catered,No,56926.81,1674.32,100%+,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,4A201,Pomonok Neighborhood Senior Center,Neighborhood Senior Centers,Queens,12/01/2019,06/30/2020,11.0,0,7.0,583805.14,...,0.0,301419.41,301419.41,12.95,Meal Prep On-Site,No,138094.35,1569.25,100%+,11.0
290,50A01,Todt Hill Neighborhood Senior Center,Neighborhood Senior Centers,Staten Island,12/01/2019,06/30/2020,12.0,3,7.0,467674.35,...,0.0,272189.76,272189.76,11.46,Meal Prep On-Site,No,93979.28,2473.14,100%+,15.0
291,50B01,Mount Loretto Neighborhood Senior Cen,Neighborhood Senior Centers,Staten Island,12/01/2019,06/30/2020,13.0,4,7.0,773648.57,...,0.0,483511.23,483511.23,9.92,Meal Prep On-Site,No,159727.36,1879.15,56%,17.0
292,50R01,Cassidy Coles Neighborhood Senior Cent,Neighborhood Senior Centers,Staten Island,12/01/2019,06/30/2020,5.0,7,7.0,668334.86,...,0.0,324358.18,324358.18,8.95,Meal Prep On-Site,No,187382.39,2204.50,100%+,12.0
