In [294]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [295]:
# CSV for reported Lyme Disease cases reported to CDC CDC.gov
# https://www.cdc.gov/lyme/stats/survfaq.html
df = pd.read_csv('/Users/tcbon/Downloads/LD-Case-Counts-by-County-00-17.csv', encoding='latin-1')

In [296]:
df.head()

Unnamed: 0,Ctyname,Stname,STCODE,CTYCODE,Cases2000,Cases2001,Cases2002,Cases2003,Cases2004,Cases2005,...,Cases2008,Cases2009,Cases2010,Cases2011,Cases2012,Cases2013,Cases2014,Cases2015,Cases2016,Cases2017
0,Autauga County,Alabama,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
1,Baldwin County,Alabama,1,3,1,0,1,0,0,0,...,0,1,0,1,1,0,3,1,2,2
2,Barbour County,Alabama,1,5,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,Bibb County,Alabama,1,7,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,Blount County,Alabama,1,9,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [297]:
# Removing 2000- 2009 columns
df = df.drop(columns=['Cases2000','Cases2001','Cases2002','Cases2003',
                 'Cases2004','Cases2005','Cases2006','Cases2007',
                 'Cases2008','Cases2009'])

In [298]:
df.head()

Unnamed: 0,Ctyname,Stname,STCODE,CTYCODE,Cases2010,Cases2011,Cases2012,Cases2013,Cases2014,Cases2015,Cases2016,Cases2017
0,Autauga County,Alabama,1,1,0,0,0,0,0,0,2,0
1,Baldwin County,Alabama,1,3,0,1,1,0,3,1,2,2
2,Barbour County,Alabama,1,5,0,1,0,0,0,0,0,0
3,Bibb County,Alabama,1,7,0,0,0,0,1,0,0,0
4,Blount County,Alabama,1,9,0,1,0,0,0,0,0,0


In [299]:
# CSV for county population data from 2010 - 2017 (census.gov)
# https://www.census.gov/data/datasets/2017/demo/popest/counties-total.html
# Dictionary -> https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2017/co-est2017-alldata.pdf
pop_df = pd.read_csv('/Users/tcbon/Downloads/co-est2017-alldata.csv', encoding='latin-1')

In [300]:
pop_df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2015,RDOMESTICMIG2016,RDOMESTICMIG2017,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017
0,40,3,6,1,0,Alabama,Alabama,4779736,4780135,4785579,...,-0.317205,-0.404473,0.788882,0.450741,0.939393,1.364296,0.694271,0.678575,0.558931,1.708218
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54750,...,-1.950739,4.831269,1.047102,5.911832,-6.102101,-4.050282,2.099325,-1.65904,5.103709,1.317904
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183110,...,17.047872,20.493601,22.383175,16.28594,17.196786,22.615285,20.380904,17.903749,21.317244,23.163873
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27332,...,-16.222436,-18.755525,-19.042395,0.256021,-6.822433,-8.01892,-5.549762,-16.411069,-18.947692,-19.15994
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22872,...,0.931388,-1.416117,-0.882983,-5.04198,-4.096646,-5.890038,1.24345,1.818424,-0.531044,0.0


In [302]:
# Merging dataframes 
all_years_with_pop = df.merge(pop_df, left_on=['Stname','Ctyname'],right_on=['STNAME','CTYNAME'])

In [303]:
#  Creating new columns for yearly incidence rate(number of cases/total population)
for i in range(8):
    all_years_with_pop['201{}_incid_rate'.format(i)] = (all_years_with_pop['Cases201{}'.format(i)] / all_years_with_pop['POPESTIMATE201{}'.format(i)])

In [304]:
all_years_with_pop = all_years_with_pop.loc[:, ['STCODE', 'CTYCODE','STNAME', 'CTYNAME', '2010_incid_rate', 
                        '2011_incid_rate', '2012_incid_rate', '2013_incid_rate',
                        '2014_incid_rate', '2015_incid_rate','2016_incid_rate', 
                        '2017_incid_rate']]

In [305]:
# all_years_with_pop.CTYNAME = all_years_with_pop.CTYNAME.str.strip(' County')

In [306]:
all_years_with_pop

Unnamed: 0,STCODE,CTYCODE,STNAME,CTYNAME,2010_incid_rate,2011_incid_rate,2012_incid_rate,2013_incid_rate,2014_incid_rate,2015_incid_rate,2016_incid_rate,2017_incid_rate
0,1,1,Alabama,Autauga County,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000,3.618076e-05,0.000000e+00
1,1,3,Alabama,Baldwin County,0.000000,0.000005,5.261829e-06,0.000000,1.507053e-05,0.000005,9.638136e-06,9.406099e-06
2,1,5,Alabama,Barbour County,0.000000,0.000037,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00
3,1,7,Alabama,Bibb County,0.000000,0.000000,0.000000e+00,0.000000,4.437935e-05,0.000000,0.000000e+00,0.000000e+00
4,1,9,Alabama,Blount County,0.000000,0.000017,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00
5,1,11,Alabama,Bullock County,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,9.700262e-05
6,1,13,Alabama,Butler County,0.000000,0.000048,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00
7,1,15,Alabama,Calhoun County,0.000000,0.000017,8.531040e-06,0.000026,1.725015e-05,0.000000,8.697165e-06,0.000000e+00
8,1,17,Alabama,Chambers County,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000029,0.000000e+00,0.000000e+00
9,1,19,Alabama,Cherokee County,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00


In [307]:
# # CSV for urban vs rural by county
# # Dictionary _> https://www.census.gov/programs-surveys/geography/technical-documentation/records-layout/2010-urban-lists-record-layout.html
# urban_df = pd.read_excel('/Users/tcbon/Downloads/PctUrbanRural_County.xls')

In [308]:
# urban_df.head()

In [142]:
# important factors
# mild winter, high acorn prodcution yerar before which leads to high mice population which are tick hosts

Preparing data for merging

In [310]:
# Rotating year columns to become rows
all_years_with_pop = pd.melt(all_years_with_pop,id_vars=['STNAME','CTYNAME'],value_vars=['2010_incid_rate','2011_incid_rate', '2012_incid_rate', 
                                                    '2013_incid_rate','2014_incid_rate', '2015_incid_rate', 
                                                    '2016_incid_rate','2017_incid_rate'],var_name='year',value_name='incidence_rate')



In [311]:
years= [x for x in range(8)]
for year in years:
    all_years_with_pop.year[all_years_with_pop.year == '201{}_incid_rate'.format(year)] = 2010 + year

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [315]:
all_years_with_pop.head()

Unnamed: 0,STNAME,CTYNAME,year,incidence_rate
0,Alabama,Autauga County,2010,0.0
1,Alabama,Baldwin County,2010,0.0
2,Alabama,Barbour County,2010,0.0
3,Alabama,Bibb County,2010,0.0
4,Alabama,Blount County,2010,0.0


In [313]:
with open('target_variable.pickle','wb') as f:
    pickle.dump(all_years_with_pop,f)