In [1]:
import pandas as pd
import numpy as np
import os

os.chdir('waterloo_datathon_datasets')
pd.set_option('max_columns', 200)

In [2]:
df = pd.read_csv("community_health.csv")
print(df.head())
print(df.tail())

      county_name                         health_topic  \
0          Albany  Family Planning/Natality Indicators   
1        Allegany  Family Planning/Natality Indicators   
2           Bronx  Family Planning/Natality Indicators   
3          Broome  Family Planning/Natality Indicators   
4  Capital Region  Family Planning/Natality Indicators   

                                           indicator  event_count  \
0  Percentage of births within 24 months of previ...       2043.0   
1  Percentage of births within 24 months of previ...        399.0   
2  Percentage of births within 24 months of previ...      10032.0   
3  Percentage of births within 24 months of previ...       1654.0   
4  Percentage of births within 24 months of previ...       6273.0   

   avg_num_of_denominator unit_measurement  precent_or_rate  low_limit_95p_CI  \
0                  3144.0       Percentage             21.7               NaN   
1                   487.0       Percentage             27.3               

In [3]:
df.health_topic.unique()

array(['Family Planning/Natality Indicators', 'Cancer Indicators',
       'Oral Health Indicators', 'Maternal and Infant Health Indicators',
       'Injury Indicators',
       'Socio-Economic Status and General Health Indicators',
       'Cardiovascular Disease Indicators',
       'Child and Adolescent Health Indicators',
       'Obesity and Related Indicators', 'Cirrhosis/Diabetes Indicators',
       'HIV/AIDS and Other Sexually Transmitted Infection Indicators',
       'Respiratory Disease Indicators',
       'Tobacco, Alcohol and Other Substance Abuse Indicators',
       'Communicable Disease Indicators',
       'Occupational Health Indicators'], dtype=object)

In [4]:
df.loc[:, 'lat_long'] = df.lat_long.apply(lambda x: x.strip('()').split(', ') if not pd.isnull(x) else [np.nan, np.nan])
df[['lat', 'lng']] = pd.DataFrame(df.lat_long.values.tolist())
df.drop('lat_long', axis=1, inplace=True)

print(df.columns)
df.head()

Index(['county_name', 'health_topic', 'indicator', 'event_count',
       'avg_num_of_denominator', 'unit_measurement', 'precent_or_rate',
       'low_limit_95p_CI', 'upp_limit_95p_CI', 'data_comment', 'year',
       'data_source', 'lat', 'lng'],
      dtype='object')


Unnamed: 0,county_name,health_topic,indicator,event_count,avg_num_of_denominator,unit_measurement,precent_or_rate,low_limit_95p_CI,upp_limit_95p_CI,data_comment,year,data_source,lat,lng
0,Albany,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,2043.0,3144.0,Percentage,21.7,,,,2012-2014,"2012-2014 Vital Statistics Data as of May, 2016",42.678066,-73.814233
1,Allegany,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,399.0,487.0,Percentage,27.3,,,,2012-2014,"2012-2014 Vital Statistics Data as of May, 2016",42.226801,-78.020567
2,Bronx,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,10032.0,21249.0,Percentage,15.7,,,,2012-2014,"2012-2014 Vital Statistics Data as of May, 2016",40.85589,-73.868294
3,Broome,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,1654.0,2083.0,Percentage,26.5,,,,2012-2014,"2012-2014 Vital Statistics Data as of May, 2016",42.122015,-75.933191
4,Capital Region,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,6273.0,9774.0,Percentage,21.4,,,,2012-2014,"2012-2014 Vital Statistics Data as of May, 2016",,


In [5]:
# drop data suppressed rows. Unreliable and not that many occurrences
df = df[df['data_comment'] != 'Data Suppressed']

In [6]:
# drop income and population data (already in other dataset)
df = df[df['unit_measurement'].isin(['Percentage', 'Rate'])]

In [7]:
# drop unnecessary cols

df.drop(['low_limit_95p_CI', 'upp_limit_95p_CI', 'data_comment', 'data_source', 'unit_measurement'], axis=1, inplace=True)


In [8]:
df.head()


Unnamed: 0,county_name,health_topic,indicator,event_count,avg_num_of_denominator,precent_or_rate,year,lat,lng
0,Albany,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,2043.0,3144.0,21.7,2012-2014,42.678066,-73.814233
1,Allegany,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,399.0,487.0,27.3,2012-2014,42.226801,-78.020567
2,Bronx,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,10032.0,21249.0,15.7,2012-2014,40.85589,-73.868294
3,Broome,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,1654.0,2083.0,26.5,2012-2014,42.122015,-75.933191
4,Capital Region,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,6273.0,9774.0,21.4,2012-2014,,


In [9]:
df.year.unique()
df.shape[0]

23791

In [10]:
df.county_name.unique()

array(['Albany', 'Allegany', 'Bronx', 'Broome', 'Capital Region',
       'Cattaraugus', 'Cayuga', 'Central NY', 'Chautauqua', 'Chemung',
       'Chenango', 'Clinton', 'Columbia', 'Cortland', 'Delaware',
       'Dutchess', 'Erie', 'Essex', 'Finger Lakes', 'Franklin', 'Fulton',
       'Genesee', 'North Country', 'Greene', 'Hamilton', 'Herkimer',
       'Jefferson', 'Kings', 'Lewis', 'Livingston', 'Long Island',
       'Madison', 'Mid-Hudson', 'Mohawk Valley', 'Monroe', 'Montgomery',
       'Nassau', 'New York', 'New York City', 'New York State', 'Niagara',
       'Oneida', 'Onondaga', 'Ontario', 'Orange', 'Orleans', 'Oswego',
       'Otsego', 'Putnam', 'Queens', 'Rensselaer', 'Richmond', 'Rockland',
       'Saratoga', 'Schenectady', 'Schoharie', 'Schuyler', 'Seneca',
       'Southern Tier', 'St. Lawrence', 'Steuben', 'Suffolk', 'Sullivan',
       'Tioga', 'Tompkins', 'Tug Hill Seaway', 'Ulster', 'Warren',
       'Washington', 'Wayne', 'Westchester', 'Western NY', 'Wyoming',
       'Yates

In [11]:
hamilton_df = df[df['county_name'].isin(['Essex/Hamilton', 'Hamilton/Essex'])]
hamilton_df.loc[:, 'county_name'] = 'Hamilton'
essex_df = hamilton_df.copy()
essex_df.loc[:, 'county_name'] = 'Essex'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [12]:
df = df[~df['county_name'].isin(['Essex/Hamilton', 'Hamilton/Essex'])].append(hamilton_df).append(essex_df)

In [13]:
df.shape[0]

23811

In [14]:
df = df[~df['county_name'].isin(['New York State (excluding NYC)', 'Central NY', 'Western NY'])]

In [15]:
df.shape[0]

23162

In [16]:
df.head()

Unnamed: 0,county_name,health_topic,indicator,event_count,avg_num_of_denominator,precent_or_rate,year,lat,lng
0,Albany,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,2043.0,3144.0,21.7,2012-2014,42.678066,-73.814233
1,Allegany,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,399.0,487.0,27.3,2012-2014,42.226801,-78.020567
2,Bronx,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,10032.0,21249.0,15.7,2012-2014,40.85589,-73.868294
3,Broome,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,1654.0,2083.0,26.5,2012-2014,42.122015,-75.933191
4,Capital Region,Family Planning/Natality Indicators,Percentage of births within 24 months of previ...,6273.0,9774.0,21.4,2012-2014,,


In [17]:
# keep most up-to-date indicators

df.sort_values('year', ascending=True, inplace=True)
df.drop_duplicates(['county_name', 'health_topic', 'indicator'], keep='last', inplace=True)

In [18]:
df.county_name.unique()

array(['Chemung', 'Ontario', 'Orange', 'Orleans', 'Oswego', 'Otsego',
       'Putnam', 'Queens', 'Rensselaer', 'Richmond', 'Rockland',
       'Saratoga', 'Schenectady', 'Schoharie', 'Schuyler', 'Seneca',
       'St. Lawrence', 'Steuben', 'Suffolk', 'Sullivan', 'Tioga',
       'Tompkins', 'Ulster', 'Warren', 'Washington', 'Wayne',
       'Westchester', 'Wyoming', 'Onondaga', 'Oneida', 'Niagara',
       'New York State', 'Bronx', 'Broome', 'Cattaraugus', 'Cayuga',
       'Chautauqua', 'Chenango', 'Clinton', 'Columbia', 'Cortland',
       'Delaware', 'Dutchess', 'Erie', 'Yates', 'Essex', 'Fulton',
       'Genesee', 'Greene', 'Hamilton', 'Herkimer', 'Jefferson', 'Kings',
       'Livingston', 'Madison', 'Monroe', 'Montgomery', 'Nassau',
       'New York', 'Franklin', 'Allegany', 'Lewis', 'Albany',
       'Capital Region', 'Finger Lakes', 'Southern Tier',
       'Tug Hill Seaway', 'North Country', 'New York City',
       'Mohawk Valley', 'Mid-Hudson', 'Long Island'], dtype=object)

In [19]:
df.head()


Unnamed: 0,county_name,health_topic,indicator,event_count,avg_num_of_denominator,precent_or_rate,year,lat,lng
8717,Chemung,"Tobacco, Alcohol and Other Substance Abuse Ind...",Age-adjusted percentage of adults living in ho...,,,71.0,2008-2009,42.116644,-76.812331
16777,Ontario,Obesity and Related Indicators,Age-adjusted percentage of adults eating 5 or ...,,,32.9,2008-2009,42.894571,-77.252045
16778,Orange,Obesity and Related Indicators,Age-adjusted percentage of adults eating 5 or ...,,,30.2,2008-2009,41.422459,-74.241929
16779,Orleans,Obesity and Related Indicators,Age-adjusted percentage of adults eating 5 or ...,,,24.9,2008-2009,43.248394,-78.218438
16780,Oswego,Obesity and Related Indicators,Age-adjusted percentage of adults eating 5 or ...,,,27.1,2008-2009,43.39123,-76.31133


In [22]:
df.shape[0]
df.drop('year', axis=1, inplace=True)

In [23]:
df.to_csv('community_health_cleaned.csv', index=False)