# Data Cleaning
---

In [1]:
import pandas as pd

### Importing FY 2020 Senior Center .csv files as dataframes

In [2]:
# Load the FY 2020 senior center client and provider CSV exports.
sccd_raw = pd.read_csv('raw_csv/senior_center_client_data_fy2020.csv')
scpd_raw = pd.read_csv('raw_csv/senior_center_provider_data_fy2020.csv')

# Clean working copies so the *_raw frames stay exactly as they were read.
sccd, scpd = sccd_raw.copy(), scpd_raw.copy()

---
### Cleaning Senior Center Provider Data

In [3]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
duplicate_row_count = scpd.duplicated().sum()
print(f'There are {duplicate_row_count} duplicated rows in the scpd dataframe.')

There are 0 duplicated rows in the scpd dataframe.


In [4]:
# Report the shape first so the effect of the drop is visible in the output.
print(f'The scpd dataframe currently has this shape: {scpd.shape}')

# Columns that will not be used in the rest of the cleaning.
# NOTE(review): 'Council D istrict' has an embedded space. The recorded output
# (49 -> 25 columns for 24 labels) indicates it matched a real header -- confirm
# against the CSV before "correcting" it.
unused_columns = [
    'Sponsor Name', 'Program Address', 'Program Address1', 'Postcode',
    'Community Board', 'Council D istrict',
    'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
    'Fiscal Year Amount', 'FY 20 Actual Meals', 'Meal Prep', 'Meals Prep for Others',
    'Borough', 'Latitude', 'Longitude', 'Census Tract', 'BIN', 'BBL', 'NTA',
]

# The drop mutates scpd in place; running this cell a second time raises a
# KeyError because the labels are already gone.
scpd.drop(columns=unused_columns, inplace=True)

print(f'The scpd dataframe now has this shape: {scpd.shape}')

The scpd dataframe currently has this shape: (294, 49)
The scpd dataframe now has this shape: (294, 25)


In [5]:
# Rename columns to be more intuitive (based on data dictionary descriptions)
scpd.rename(columns = {
    'Provider Name': 'Senior Center Name',
    'Borough/City': 'Borough',
    '# of Full-time Staff': 'Total FTEs',
    '# of Part-time Staff': 'Total PTEs',
    'Average Daily Participants': 'Average Daily Clients',
    'Kosher Raw Food/ Disposable': 'Kosher Meal Budget',
    'Non-Kosher Raw Food/ Disposable': 'Non-Kosher Meal Budget',
    'Total Raw Food/ Disposable': 'Total Meal Budget',
    'Expenditures per Meal for Food and Disposable': 'Average Meal Expenditure Per Client',
    'Meal Prep1': 'Meal Prep Type',
    'Prep for Others1': 'Meal Prep For Other Centers',
    'Annual Expenditures for Information and Assistance, Education and Recreation, Health Promotion': 'Total AIB-SCE-HPP Expenditures',
    'Annual Expenditures Per Client for Information and Assistance, Education and Recreation, Health Promotion': 'Average AIB-SCE-HPP Expenditures Per Client',
    'Ultilization': 'Percent Utilization'
}, inplace = True)

print(f'The columns are now named:\n{scpd.columns}')

The columns are now named:
Index(['DFTA ID', 'Senior Center Name', 'Site Type', 'Borough',
       'Contract From Date', 'Contract To Date', 'FY 20 Budget',
       'FY 20 Reimbursement', 'Total FTEs', 'Total PTEs', 'Personnel Budget',
       'Months in HHS', 'Total FY20 Budget', 'Total FY20 Personnel Budget',
       'Total FY20 Reimbursement', 'Average Daily Clients',
       'Kosher Meal Budget', 'Non-Kosher Meal Budget', 'Total Meal Budget',
       'Average Meal Expenditure Per Client', 'Meal Prep Type',
       'Meal Prep For Other Centers', 'Total AIB-SCE-HPP Expenditures',
       'Average AIB-SCE-HPP Expenditures Per Client', 'Percent Utilization'],
      dtype='object')


In [6]:
# Check which columns have NaNs, and count the total NaNs for that column
# Per-column NaN counts, keeping only the columns that have at least one NaN.
nan_counts = scpd.isna().sum()
nan_counts.loc[nan_counts > 0]

Total FTEs                                       1
Months in HHS                                   18
Total FY20 Budget                               18
Total FY20 Personnel Budget                     18
Total FY20 Reimbursement                        18
Average Daily Clients                          120
Meal Prep Type                                  27
Meal Prep For Other Centers                     27
Average AIB-SCE-HPP Expenditures Per Client    120
dtype: int64

In [10]:
# Pull only the rows that have NaNs in 'Total FY20 Budget', 'Total FY20 Personnel Budget', and 'Total FY20 Reimbursement'
scpd.loc[scpd['Total FY20 Budget'].isna(), ['FY 20 Budget', 'FY 20 Reimbursement', 'Personnel Budget', 'Total FY20 Budget',
                                       'Total FY20 Personnel Budget', 'Total FY20 Reimbursement']]

Unnamed: 0,FY 20 Budget,FY 20 Reimbursement,Personnel Budget,Total FY20 Budget,Total FY20 Personnel Budget,Total FY20 Reimbursement
3,1289365.0,1218745.04,735227.66,,,
105,187556.0,154187.28,119205.0,,,
110,1191748.85,1156048.0,831404.13,,,
111,858483.19,755918.89,587697.4,,,
112,846919.0,579071.58,362411.09,,,
146,1043359.0,604745.91,639726.27,,,
209,1417124.0,1306070.22,755912.38,,,
210,446362.0,420444.92,204597.79,,,
211,626000.0,411426.77,193153.5,,,
250,1300429.0,1300429.0,796655.84,,,


In [17]:

# The three 'Total FY20 ...' columns for rows where 'Total FY20 Budget' is NaN.
rows_missing_total = scpd['Total FY20 Budget'].isna()
total_columns = ['Total FY20 Budget', 'Total FY20 Personnel Budget', 'Total FY20 Reimbursement']
scpd.loc[rows_missing_total, total_columns]

Unnamed: 0,Total FY20 Budget,Total FY20 Personnel Budget,Total FY20 Reimbursement
3,,,
105,,,
110,,,
111,,,
112,,,
146,,,
209,,,
210,,,
211,,,
250,,,


In [18]:
# The populated 'FY 20 ...' / 'Personnel Budget' figures for the same rows
# (those where 'Total FY20 Budget' is NaN).
rows_without_total = scpd['Total FY20 Budget'].isna()
source_columns = ['FY 20 Budget', 'Personnel Budget', 'FY 20 Reimbursement']
scpd.loc[rows_without_total, source_columns]

Unnamed: 0,FY 20 Budget,Personnel Budget,FY 20 Reimbursement
3,1289365.0,735227.66,1218745.04
105,187556.0,119205.0,154187.28
110,1191748.85,831404.13,1156048.0
111,858483.19,587697.4,755918.89
112,846919.0,362411.09,579071.58
146,1043359.0,639726.27,604745.91
209,1417124.0,755912.38,1306070.22
210,446362.0,204597.79,420444.92
211,626000.0,193153.5,411426.77
250,1300429.0,796655.84,1300429.0


In [25]:
# TODO (unfinished draft, not executed): the plan was to fill the NaNs in
# 'Total FY20 Budget', 'Total FY20 Personnel Budget' and 'Total FY20 Reimbursement'
# from 'FY 20 Budget', 'Personnel Budget' and 'FY 20 Reimbursement' for the same rows.
# The draft called fillna on `nan_df`, which is not defined in the visible cells,
# and it was left commented out, so this cell does not modify scpd.

# For now, only display the boolean mask of rows whose 'Total FY20 Budget' is NaN.
total_budget_is_missing = scpd['Total FY20 Budget'].isna()
total_budget_is_missing

0      False
1      False
2      False
3       True
4      False
       ...  
289    False
290    False
291    False
292    False
293    False
Name: Total FY20 Budget, Length: 294, dtype: bool

In [22]:
# Re-check which columns still contain NaNs, and how many.
remaining_nan_counts = scpd.isna().sum()
remaining_nan_counts.loc[remaining_nan_counts > 0]

Total FTEs                                       1
Months in HHS                                   18
Total FY20 Budget                               18
Total FY20 Personnel Budget                     18
Total FY20 Reimbursement                        18
Average Daily Clients                          120
Meal Prep Type                                  27
Meal Prep For Other Centers                     27
Average AIB-SCE-HPP Expenditures Per Client    120
dtype: int64