## chronic diseases indicators (CDI) Analysis

Analysis on lifestyle factors on health in Nashville/Tennessee

#### Import library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Formatting 

In [2]:
%matplotlib inline

In [3]:
#import csv file: U.S._Chronic_Disease_Indicators__CDI_.master
us_cdi_df = pd.read_csv('../data/U.S._Chronic_Disease_Indicators__CDI_.master.csv', low_memory=False)

In [4]:
#look at head
us_cdi_df.head(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2010,2014,WA,Washington,Death Certificate,Cancer,"Cancer of the colon and rectum (colorectal), m...",,"per 100,000",Average Annual Age-adjusted Rate,...,53,CAN,CAN7_2,AVGANNAGEADJRATE,GENDER,GENM,,,,
1,2009,2013,OR,Oregon,Statewide central cancer registries,Cancer,"Invasive cancer (all sites combined), incidence",,"per 100,000",Average Annual Age-adjusted Rate,...,41,CAN,CAN4_1,AVGANNAGEADJRATE,RACE,WHT,,,,
2,2009,2013,FL,Florida,Death Certificate,Cancer,"Cancer of the oral cavity and pharynx, mortality",,"per 100,000",Average Annual Crude Rate,...,12,CAN,CAN10_2,AVGANNCRDRATE,RACE,AIAO,,,,
3,2010,2014,MI,Michigan,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence",,,Average Annual Number,...,26,CAN,CAN8_1,AVGANNNMBR,RACE,WHT,,,,
4,2010,2014,KY,Kentucky,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence",,"per 100,000",Average Annual Crude Rate,...,21,CAN,CAN8_1,AVGANNCRDRATE,RACE,AIAO,,,,


In [5]:
#look at tail
us_cdi_df.tail(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
814932,2009,2013,NM,New Mexico,Death Certificate,Cancer,"Invasive cancer (all sites combined), mortality",,,Average Annual Number,...,35,CAN,CAN4_2,AVGANNNMBR,OVERALL,OVR,,,,
814933,2009,2013,AL,Alabama,Statewide central cancer registries,Cancer,"Invasive cancer of the oral cavity or pharynx,...",,"per 100,000",Average Annual Age-adjusted Rate,...,1,CAN,CAN10_1,AVGANNAGEADJRATE,RACE,WHT,,,,
814934,2010,2014,OH,Ohio,Death Certificate,Cancer,"Cancer of the lung and bronchus, mortality",,"per 100,000",Average Annual Age-adjusted Rate,...,39,CAN,CAN8_2,AVGANNAGEADJRATE,OVERALL,OVR,,,,
814935,2009,2013,SC,South Carolina,Death Certificate,Cancer,"Invasive cancer (all sites combined), mortality",,"per 100,000",Average Annual Age-adjusted Rate,...,45,CAN,CAN4_2,AVGANNAGEADJRATE,RACE,HIS,,,,
814936,2009,2013,DC,District of Columbia,Death Certificate,Cancer,"Cancer of the oral cavity and pharynx, mortality",,,Average Annual Number,...,11,CAN,CAN10_2,AVGANNNMBR,RACE,AIAO,,,,


In [6]:
# examine shape of us cdi df
#814937 rows, 34 columns
us_cdi_df.shape

(814937, 34)

In [7]:
#examine info of us cdi df
us_cdi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814937 entries, 0 to 814936
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   YearStart                  814937 non-null  int64  
 1   YearEnd                    814937 non-null  int64  
 2   LocationAbbr               814937 non-null  object 
 3   LocationDesc               814937 non-null  object 
 4   DataSource                 814937 non-null  object 
 5   Topic                      814937 non-null  object 
 6   Question                   814937 non-null  object 
 7   Response                   0 non-null       float64
 8   DataValueUnit              728402 non-null  object 
 9   DataValueType              814937 non-null  object 
 10  DataValue                  555277 non-null  object 
 11  DataValueAlt               554139 non-null  object 
 12  DataValueFootnoteSymbol    270821 non-null  object 
 13  DatavalueFootnote          27

In [8]:
# drop unecessary columns in us cdi df 
us_cdi_df = us_cdi_df.drop(columns = ['LocationAbbr','Response', 'DataValueFootnoteSymbol','DatavalueFootnote','LowConfidenceLimit','HighConfidenceLimit','StratificationCategory2','Stratification2','StratificationCategory3','Stratification3','ResponseID','LocationID','TopicID','QuestionID','DataValueTypeID','StratificationCategoryID1','StratificationID1','StratificationCategoryID2','StratificationCategoryID3','StratificationID2','StratificationID3'])                                                                       

In [9]:
#look at column names now
us_cdi_df.columns

Index(['YearStart', 'YearEnd', 'LocationDesc', 'DataSource', 'Topic',
       'Question', 'DataValueUnit', 'DataValueType', 'DataValue',
       'DataValueAlt', 'StratificationCategory1', 'Stratification1',
       'GeoLocation'],
      dtype='object')

In [10]:
# change YearEnd to datetimeindex format
us_cdi_df['YearEnd'] = pd.DatetimeIndex(us_cdi_df['YearEnd'])

In [11]:
# want to subset to have topic of Nutrition, Physical Activity, and Weight Status on topic column

#need to subset multiple columns based on mulitple conditions

# us_cdi_df to only have DataSource col 'BRFSS' and DataValueType be 'Crude Prevalence'
## example of exact match subset dogs["breed"] == "Labrador"

# brfss and cp subset
brfss = us_cdi_df['DataSource'] == 'BRFSS'
cp    = us_cdi_df['DataValueType']== 'Crude Prevalence'
us_cdi_brfss_cp = us_cdi_df[brfss & cp]

# DataSource needs to be 'BRFSS', Topic be 'Nutrition, Physical Activity, and Weight Status', DataValueType be 'Crude Prevalence'
# brfss and cp and npw subset

npw = us_cdi_df['Topic']== 'Nutrition, Physical Activity, and Weight Status'
us_cdi_brfss_npw_cp = us_cdi_df[brfss & cp & npw]


In [12]:
us_cdi_brfss_cp.head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation
813,2013,1970-01-01 00:00:00.000002013,Mississippi,BRFSS,Arthritis,Fair or poor health among adults aged >= 18 ye...,%,Crude Prevalence,46.9,46.9,Gender,Male,"(32.745510099000455, -89.53803082499968)"
1419,2015,1970-01-01 00:00:00.000002015,Arizona,BRFSS,Arthritis,Arthritis among adults aged >= 18 years,%,Crude Prevalence,23.6,23.6,Overall,Overall,"(34.865970280000454, -111.76381127699972)"
3469,2014,1970-01-01 00:00:00.000002014,Arkansas,BRFSS,Alcohol,Heavy drinking among adults aged >= 18 years,%,Crude Prevalence,6.0,6.0,Gender,Male,"(34.74865012400045, -92.27449074299966)"
4077,2018,1970-01-01 00:00:00.000002018,Virginia,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,%,Crude Prevalence,36.6,36.6,Gender,Male,"(37.54268067400045, -78.45789046299967)"
4148,2018,1970-01-01 00:00:00.000002018,Idaho,BRFSS,Asthma,Asthma prevalence among women aged 18-44 years,%,Crude Prevalence,12.4,12.4,Race/Ethnicity,"White, non-Hispanic","(43.682630005000476, -114.3637300419997)"


In [13]:
us_cdi_brfss_npw_cp.head()

Unnamed: 0,YearStart,YearEnd,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation
29332,2018,1970-01-01 00:00:00.000002018,North Dakota,BRFSS,"Nutrition, Physical Activity, and Weight Status",No leisure-time physical activity among adults...,%,Crude Prevalence,22.3,22.3,Overall,Overall,"(47.47531977900047, -100.11842104899966)"
32562,2018,1970-01-01 00:00:00.000002018,Utah,BRFSS,"Nutrition, Physical Activity, and Weight Status",Obesity among adults aged >= 18 years,%,Crude Prevalence,30.0,30.0,Race/Ethnicity,Hispanic,"(39.360700171000474, -111.58713063499971)"
32687,2018,1970-01-01 00:00:00.000002018,Oklahoma,BRFSS,"Nutrition, Physical Activity, and Weight Status",Overweight or obesity among adults aged >= 18 ...,%,Crude Prevalence,68.9,68.9,Race/Ethnicity,"White, non-Hispanic","(35.47203135600046, -97.52107021399968)"
34092,2018,1970-01-01 00:00:00.000002018,Wisconsin,BRFSS,"Nutrition, Physical Activity, and Weight Status",Obesity among adults aged >= 18 years,%,Crude Prevalence,40.3,40.3,Race/Ethnicity,Hispanic,"(44.39319117400049, -89.81637074199966)"
34098,2018,1970-01-01 00:00:00.000002018,Montana,BRFSS,"Nutrition, Physical Activity, and Weight Status",Healthy weight among adults aged >= 18 years,%,Crude Prevalence,32.9,32.9,Race/Ethnicity,Hispanic,"(47.06652897200047, -109.42442064499971)"


In [14]:
# three conditions for questions subset see notes of capstone proposal

#Row contains string ‘consumption’ mean the person fruit and vegetable consumption frequency
## example of isin subset is_black_or_brown = dogs["color"].isin(["Black", "Brown"]) #this isin code has an error
## df[df['A'].str.contains("hello")]
nutrition_intake_consump = us_cdi_df[us_cdi_df['Question'].str.contains("consumption")]
nutrition_intake_consump.head()
#us_cdi_brfss_npw_cp_consump = us_cdi_df[brfss & cp & npw & nutrition_intake_consump]

#There are so many filters within the data set of us cdc dataframe and i did a general analysis with excel. See notes in capstone

Unnamed: 0,YearStart,YearEnd,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation
34319,2017,1970-01-01 00:00:00.000002017,New Hampshire,YRBSS,"Nutrition, Physical Activity, and Weight Status",Soda consumption among high school students,%,Crude Prevalence,11.7,11.7,Race/Ethnicity,"Asian, non-Hispanic","(43.65595011300047, -71.50036091999965)"
34331,2017,1970-01-01 00:00:00.000002017,Montana,YRBSS,"Nutrition, Physical Activity, and Weight Status",Soda consumption among high school students,%,Crude Prevalence,10.9,10.9,Gender,Female,"(47.06652897200047, -109.42442064499971)"
34372,2017,1970-01-01 00:00:00.000002017,California,YRBSS,"Nutrition, Physical Activity, and Weight Status",Soda consumption among high school students,%,Crude Prevalence,13.2,13.2,Race/Ethnicity,"Black, non-Hispanic","(37.63864012300047, -120.99999953799971)"
35530,2016,1970-01-01 00:00:00.000002016,Georgia,AEDS,Alcohol,Per capita alcohol consumption among persons a...,gallons,Per capita alcohol consumption,1.9,1.9,Overall,Overall,"(32.83968109300048, -83.62758034599966)"
35557,2016,1970-01-01 00:00:00.000002016,Delaware,AEDS,Alcohol,Per capita alcohol consumption among persons a...,gallons,Per capita alcohol consumption,3.7,3.7,Overall,Overall,"(39.008830667000495, -75.57774116799965)"


In [15]:
#import for only obesity dataset to explore relationship between obesity and nutrition intake and excercise
# inner joins obesity data,nutrition and exercise after apply filters formula

#import csv file: Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System
us_obesity_df = pd.read_csv('../data/Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System.csv')

In [16]:
# examine us_obesity_df head

us_obesity_df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2012,2012,WY,Wyoming,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(43.235541343, -108.109830353)",OWS,OWS1,Q037,VALUE,56,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA
1,2012,2012,DC,District of Columbia,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(38.890371385, -77.031961127)",OWS,OWS1,Q036,VALUE,11,Education,Less than high school,EDU,EDUHS
2,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.840571122, -86.631860762)",OWS,OWS1,Q036,VALUE,1,Age (years),25 - 34,AGEYR,AGEYR2534
3,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Gender,Female,GEN,FEMALE
4,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Age (years),18 - 24,AGEYR,AGEYR1824


In [17]:
#dataframe info

us_obesity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63028 entries, 0 to 63027
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   YearStart                   63028 non-null  int64  
 1   YearEnd                     63028 non-null  int64  
 2   LocationAbbr                63028 non-null  object 
 3   LocationDesc                63028 non-null  object 
 4   Datasource                  63028 non-null  object 
 5   Class                       63028 non-null  object 
 6   Topic                       63028 non-null  object 
 7   Question                    63028 non-null  object 
 8   Data_Value_Unit             0 non-null      float64
 9   Data_Value_Type             63028 non-null  object 
 10  Data_Value                  56796 non-null  float64
 11  Data_Value_Alt              56796 non-null  float64
 12  Data_Value_Footnote_Symbol  6232 non-null   object 
 13  Data_Value_Footnote         623

In [18]:
# subset obeisty values
#use Question contains to subset  
## df[df['A'].str.contains("hello")]

#explore if there are ways to transpose rows to columns grouping by topics and sub topics and show their values
#trying df.pivot(column='var',values = 'val') that spread rows into columns

#by_topic = us_obesity_df.pivot(columns='Topic', values='Data_Value')
#it does not work as there are no matching values for each column

In [19]:
#try groupby

by_topic = us_obesity_df.groupby(by='Topic')

In [20]:
by_topic.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2012,2012,WY,Wyoming,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(43.235541343, -108.109830353)",OWS,OWS1,Q037,VALUE,56,Race/Ethnicity,American Indian/Alaska Native,RACE,RACENAA
1,2012,2012,DC,District of Columbia,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(38.890371385, -77.031961127)",OWS,OWS1,Q036,VALUE,11,Education,Less than high school,EDU,EDUHS
2,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.840571122, -86.631860762)",OWS,OWS1,Q036,VALUE,1,Age (years),25 - 34,AGEYR,AGEYR2534
3,2013,2013,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Gender,Female,GEN,FEMALE
4,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Age (years),18 - 24,AGEYR,AGEYR1824
5,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Age (years),25 - 34,AGEYR,AGEYR2534
6,2011,2011,US,National,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,,PA,PA1,Q047,VALUE,59,Age (years),55 - 64,AGEYR,AGEYR5564
7,2015,2015,RI,Rhode Island,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(41.708280193, -71.522470314)",OWS,OWS1,Q037,VALUE,44,Race/Ethnicity,Hispanic,RACE,RACEHIS
8,2012,2012,WY,Wyoming,Behavioral Risk Factor Surveillance System,Physical Activity,Physical Activity - Behavior,Percent of adults who engage in no leisure-tim...,,Value,...,"(43.235541343, -108.109830353)",PA,PA1,Q047,VALUE,56,Income,"Less than $15,000",INC,INCLESS15
10,2011,2011,WA,Washington,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(47.522278629, -120.47001079)",OWS,OWS1,Q037,VALUE,53,Gender,Male,GEN,MALE


In [21]:
#figure out at this point it is easier to do EDA with excel

#### I want to create map showing Nashville location with obese and no physical activity
##### reference: https://medium.com/analytics-vidhya/finding-nearest-pair-of-latitude-and-longitude-match-using-python-ce50d62af546


In [22]:
#import dataset with no physical activity nashville #(member)
#file name: 500_Cities__No_leisure-time_physical_activity_among_adults_aged___18_years
no_activity = pd.read_csv('../data/500_Cities__No_leisure-time_physical_activity_among_adults_aged___18_years_tn_cities.csv')
no_activity.head()

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,High_Confidence_Limit,Data_Value_Footnote_Symbol,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text
0,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093002600,No leisure-time physical activity among adults...,%,...,37.3,,,2451,"(35.9541120412, -83.9603611106)",UNHBEH,LPA,4740000,47093000000.0,Physical Inactivity
1,2017,TN,Tennessee,Clarksville,Census Tract,BRFSS,Unhealthy Behaviors,4715160-47125100300,No leisure-time physical activity among adults...,%,...,37.1,,,5221,"(36.4955960329, -87.3598454821)",UNHBEH,LPA,4715160,47125100000.0,Physical Inactivity
2,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,4714000-47065010432,No leisure-time physical activity among adults...,%,...,25.3,,,6317,"(35.1279296495, -85.2082336438)",UNHBEH,LPA,4714000,47065010000.0,Physical Inactivity
3,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,4714000-47065000800,No leisure-time physical activity among adults...,%,...,29.5,,,1348,"(35.0684524247, -85.3130794178)",UNHBEH,LPA,4714000,47065000000.0,Physical Inactivity
4,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093004000,No leisure-time physical activity among adults...,%,...,32.7,,,4313,"(36.0123147591, -83.959901727)",UNHBEH,LPA,4740000,47093000000.0,Physical Inactivity


In [23]:
#import obese nashville dataset #hotel
#file name: 500_Cities__Obesity_among_adults_aged___18_years 
obese = pd.read_csv('../data/500_Cities__Obesity_among_adults_aged___18_years_tn_cities.csv')
obese.head()

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,High_Confidence_Limit,Data_Value_Footnote_Symbol,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text
0,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,4714000-47065010433,Obesity among adults aged >=18 Years,%,...,32.2,,,5095,"(35.1233714276, -85.2493676609)",UNHBEH,OBESITY,4714000,47065010000.0,Obesity
1,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093004100,Obesity among adults aged >=18 Years,%,...,32.8,,,4160,"(36.0199847745, -83.940415456)",UNHBEH,OBESITY,4740000,47093000000.0,Obesity
2,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,4714000-47065010901,Obesity among adults aged >=18 Years,%,...,28.5,,,1457,"(35.0553305719, -85.3330624598)",UNHBEH,OBESITY,4714000,47065010000.0,Obesity
3,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093002900,Obesity among adults aged >=18 Years,%,...,42.0,,,3765,"(36.003541894, -83.9398730976)",UNHBEH,OBESITY,4740000,47093000000.0,Obesity
4,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093003902,Obesity among adults aged >=18 Years,%,...,35.5,,,3019,"(35.9985898898, -83.9728153766)",UNHBEH,OBESITY,4740000,47093000000.0,Obesity


In [24]:
# split geolocation column into lat and lon
split_data_obese = obese.GeoLocation.str.strip(')').str.strip('(')#.str.split(', ') #become a series of list, each list of your series,extract
# how many nan and I have dropped 2 nan values
split_data_obese = split_data_obese.dropna()
#obese['lat'] = split_data_obese.apply(lambda x: x[0])
#obese['lon'] = split_data_obese.apply(lambda x: x[1])

split_data_obese = pd.DataFrame(split_data_obese, columns = ['GeoLocation'])
#split_data_obese.head()

split_data_obese.head()


Unnamed: 0,GeoLocation
0,"35.1233714276, -85.2493676609"
1,"36.0199847745, -83.940415456"
2,"35.0553305719, -85.3330624598"
3,"36.003541894, -83.9398730976"
4,"35.9985898898, -83.9728153766"


In [25]:
#reference: https://chrisalbon.com/python/data_wrangling/pandas_split_lat_and_long_into_variables/
#Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a variable,

for row in split_data_obese['GeoLocation']:
    # try to,
    #try:
        #Split the row by comma and append
        #everything before the comma to lat
        lat.append(row.split(',')[0])
        #Split the row by comma and append
        #Everything after the comma to lon
        lon.append(row.split(',')[1])

    


In [26]:
# Create two new columns from lat and lon
split_data_obese['lat'] = lat
split_data_obese['lon'] = lon

In [27]:
split_data_obese

Unnamed: 0,GeoLocation,lat,lon
0,"35.1233714276, -85.2493676609",35.1233714276,-85.2493676609
1,"36.0199847745, -83.940415456",36.0199847745,-83.940415456
2,"35.0553305719, -85.3330624598",35.0553305719,-85.3330624598
3,"36.003541894, -83.9398730976",36.003541894,-83.9398730976
4,"35.9985898898, -83.9728153766",35.9985898898,-83.9728153766
...,...,...,...
544,"36.1582500552, -86.8715276437",36.1582500552,-86.8715276437
545,"35.1004579845, -89.9399787328",35.1004579845,-89.9399787328
546,"35.1653247299, -90.01466327",35.1653247299,-90.01466327
547,"36.0712760339, -86.9382627391",36.0712760339,-86.9382627391


In [28]:
#drop column GeoLocation on split_data_obese so i can do a outer join with obese dataset

split_data_obese = split_data_obese.drop(columns = 'GeoLocation')

In [29]:
#join the obese lat lon to obese dataset 
obese_latlon = obese.join(split_data_obese, how='outer') 

In [30]:
obese_latlon.head(5)

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text,lat,lon
0,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,4714000-47065010433,Obesity among adults aged >=18 Years,%,...,,5095,"(35.1233714276, -85.2493676609)",UNHBEH,OBESITY,4714000,47065010000.0,Obesity,35.1233714276,-85.2493676609
1,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093004100,Obesity among adults aged >=18 Years,%,...,,4160,"(36.0199847745, -83.940415456)",UNHBEH,OBESITY,4740000,47093000000.0,Obesity,36.0199847745,-83.940415456
2,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,4714000-47065010901,Obesity among adults aged >=18 Years,%,...,,1457,"(35.0553305719, -85.3330624598)",UNHBEH,OBESITY,4714000,47065010000.0,Obesity,35.0553305719,-85.3330624598
3,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093002900,Obesity among adults aged >=18 Years,%,...,,3765,"(36.003541894, -83.9398730976)",UNHBEH,OBESITY,4740000,47093000000.0,Obesity,36.003541894,-83.9398730976
4,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093003902,Obesity among adults aged >=18 Years,%,...,,3019,"(35.9985898898, -83.9728153766)",UNHBEH,OBESITY,4740000,47093000000.0,Obesity,35.9985898898,-83.9728153766


In [31]:

# split geolocation column into lat and lon
split_data_no_activity = no_activity.GeoLocation.str.strip(')').str.strip('(')#.str.split(', ') #become a series of list, each list of your series,extract
# how many nan and I have dropped 2 nan values
split_data_no_activity = split_data_no_activity.dropna()
#obese['lat'] = split_data_obese.apply(lambda x: x[0])
#obese['lon'] = split_data_obese.apply(lambda x: x[1])

split_data_no_activity = pd.DataFrame(split_data_no_activity, columns = ['GeoLocation'])
#split_data_obese.head()

split_data_no_activity.head()


Unnamed: 0,GeoLocation
0,"35.9541120412, -83.9603611106"
1,"36.4955960329, -87.3598454821"
2,"35.1279296495, -85.2082336438"
3,"35.0684524247, -85.3130794178"
4,"36.0123147591, -83.959901727"


In [32]:
#Create two lists for the loop results to be placed

lat1 = []
lon1 = []

# For each row in a variable,

for row in split_data_no_activity['GeoLocation']:
    # try to,
    #try:
        #Split the row by comma and append
        #everything before the comma to lat
        lat1.append(row.split(',')[0])
        #Split the row by comma and append
        #Everything after the comma to lon
        lon1.append(row.split(',')[1])

    

In [33]:
# Create two new columns from lat and lon
split_data_no_activity['lat'] = lat1
split_data_no_activity['lon'] = lon1

In [34]:
split_data_no_activity

Unnamed: 0,GeoLocation,lat,lon
0,"35.9541120412, -83.9603611106",35.9541120412,-83.9603611106
1,"36.4955960329, -87.3598454821",36.4955960329,-87.3598454821
2,"35.1279296495, -85.2082336438",35.1279296495,-85.2082336438
3,"35.0684524247, -85.3130794178",35.0684524247,-85.3130794178
4,"36.0123147591, -83.959901727",36.0123147591,-83.959901727
...,...,...,...
544,"36.0810876055, -86.6188609306",36.0810876055,-86.6188609306
545,"36.148106453, -86.5856887267",36.148106453,-86.5856887267
546,"36.151096554, -86.7597078621",36.151096554,-86.7597078621
547,"36.2249944111, -86.7399718241",36.2249944111,-86.7399718241


In [35]:
#drop column GeoLocation on split_data_no_activity so i can do a outer join with no activity dataset

split_data_no_activity = split_data_no_activity.drop(columns = 'GeoLocation')

In [36]:
#join the no activity lat lon to no activity dataset
no_activity_latlon = no_activity.join(split_data_no_activity, how='outer') 

In [37]:
no_activity_latlon

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,UniqueID,Measure,Data_Value_Unit,...,Data_Value_Footnote,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text,lat,lon
0,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093002600,No leisure-time physical activity among adults...,%,...,,2451,"(35.9541120412, -83.9603611106)",UNHBEH,LPA,4740000,4.709300e+10,Physical Inactivity,35.9541120412,-83.9603611106
1,2017,TN,Tennessee,Clarksville,Census Tract,BRFSS,Unhealthy Behaviors,4715160-47125100300,No leisure-time physical activity among adults...,%,...,,5221,"(36.4955960329, -87.3598454821)",UNHBEH,LPA,4715160,4.712510e+10,Physical Inactivity,36.4955960329,-87.3598454821
2,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,4714000-47065010432,No leisure-time physical activity among adults...,%,...,,6317,"(35.1279296495, -85.2082336438)",UNHBEH,LPA,4714000,4.706501e+10,Physical Inactivity,35.1279296495,-85.2082336438
3,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,4714000-47065000800,No leisure-time physical activity among adults...,%,...,,1348,"(35.0684524247, -85.3130794178)",UNHBEH,LPA,4714000,4.706500e+10,Physical Inactivity,35.0684524247,-85.3130794178
4,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,4740000-47093004000,No leisure-time physical activity among adults...,%,...,,4313,"(36.0123147591, -83.959901727)",UNHBEH,LPA,4740000,4.709300e+10,Physical Inactivity,36.0123147591,-83.959901727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,4752006-47037015618,No leisure-time physical activity among adults...,%,...,,5712,"(36.0810876055, -86.6188609306)",UNHBEH,LPA,4752006,4.703702e+10,Physical Inactivity,36.0810876055,-86.6188609306
545,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,4752006-47037015610,No leisure-time physical activity among adults...,%,...,,7820,"(36.148106453, -86.5856887267)",UNHBEH,LPA,4752006,4.703702e+10,Physical Inactivity,36.148106453,-86.5856887267
546,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,4752006-47037014800,No leisure-time physical activity among adults...,%,...,,3127,"(36.151096554, -86.7597078621)",UNHBEH,LPA,4752006,4.703701e+10,Physical Inactivity,36.151096554,-86.7597078621
547,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,4752006-47037011002,No leisure-time physical activity among adults...,%,...,,2643,"(36.2249944111, -86.7399718241)",UNHBEH,LPA,4752006,4.703701e+10,Physical Inactivity,36.2249944111,-86.7399718241


In [38]:
obese_latlon.info() 
print('\n XXXXXXXXXXXXXXXXXXXXXXX\n')
no_activity_latlon.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 549 entries, 0 to 548
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Year                        549 non-null    int64  
 1   StateAbbr                   549 non-null    object 
 2   StateDesc                   549 non-null    object 
 3   CityName                    549 non-null    object 
 4   GeographicLevel             549 non-null    object 
 5   DataSource                  549 non-null    object 
 6   Category                    549 non-null    object 
 7   UniqueID                    549 non-null    object 
 8   Measure                     549 non-null    object 
 9   Data_Value_Unit             549 non-null    object 
 10  DataValueTypeID             549 non-null    object 
 11  Data_Value_Type             549 non-null    object 
 12  Data_Value                  523 non-null    float64
 13  Low_Confidence_Limit        523 non

In [39]:
# convert just columns "lat" and "lon" to float64
obese_latlon[["lat", "lon"]] = obese_latlon[["lat", "lon"]].apply(pd.to_numeric)

In [40]:
obese_latlon.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 549 entries, 0 to 548
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Year                        549 non-null    int64  
 1   StateAbbr                   549 non-null    object 
 2   StateDesc                   549 non-null    object 
 3   CityName                    549 non-null    object 
 4   GeographicLevel             549 non-null    object 
 5   DataSource                  549 non-null    object 
 6   Category                    549 non-null    object 
 7   UniqueID                    549 non-null    object 
 8   Measure                     549 non-null    object 
 9   Data_Value_Unit             549 non-null    object 
 10  DataValueTypeID             549 non-null    object 
 11  Data_Value_Type             549 non-null    object 
 12  Data_Value                  523 non-null    float64
 13  Low_Confidence_Limit        523 non

In [41]:
# convert just columns "lat" and "lon" to float64
no_activity_latlon[["lat", "lon"]] = obese_latlon[["lat", "lon"]].apply(pd.to_numeric)

In [42]:
no_activity_latlon.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 549 entries, 0 to 548
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Year                        549 non-null    int64  
 1   StateAbbr                   549 non-null    object 
 2   StateDesc                   549 non-null    object 
 3   CityName                    549 non-null    object 
 4   GeographicLevel             549 non-null    object 
 5   DataSource                  549 non-null    object 
 6   Category                    549 non-null    object 
 7   UniqueID                    549 non-null    object 
 8   Measure                     549 non-null    object 
 9   Data_Value_Unit             549 non-null    object 
 10  DataValueTypeID             549 non-null    object 
 11  Data_Value_Type             549 non-null    object 
 12  Data_Value                  523 non-null    float64
 13  Low_Confidence_Limit        523 non

In [43]:
from math import radians, cos, sin, asin, sqrt
def dist(lat1, long1, lat2, long2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lat1, long1, lat2, long2 = map(radians, [lat1, long1, lat2, long2])
    # haversine formula 
    dlon = long2 - long1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km

In [44]:
#obese_latlon.columns

In [45]:
#print(obese_latlon.head())

In [46]:
#no_activity_latlon.head()

In [47]:
# There are two duplicate Uniqiue ID columns in both dataset
#before apply distance of obese (hotel) using UniqueID column or joining the data drop UniqueID column from no activity (member) so pandas is not confused on which UniqueID column to join
no_activity_latlon = no_activity_latlon.drop(columns = ['UniqueID'])

In [48]:
def find_nearest(lat, long):
    distances = obese_latlon.apply(
        lambda row: dist(lat, long, row['lat'], row['lon']), 
        axis=1)
    return obese_latlon.loc[distances.idxmin(), 'UniqueID']

In [49]:
#append obese nearest uniqiue id (ref: hotel name) to no activity dataset new column as oUniqueID
no_activity_latlon['UniqueID'] = no_activity_latlon.apply(
    lambda row: find_nearest(row['lat'], row['lon']), 
    axis=1)
# To check the data frame if it has a new column of obese [oUniqueID] (for each and no activity(member in ref)'s location in the list)
no_activity_latlon.head()

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,Measure,Data_Value_Unit,DataValueTypeID,...,PopulationCount,GeoLocation,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text,lat,lon,UniqueID
0,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,2451,"(35.9541120412, -83.9603611106)",UNHBEH,LPA,4740000,47093000000.0,Physical Inactivity,35.123371,-85.249368,4714000-47065010433
1,2017,TN,Tennessee,Clarksville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,5221,"(36.4955960329, -87.3598454821)",UNHBEH,LPA,4715160,47125100000.0,Physical Inactivity,36.019985,-83.940415,4740000-47093004100
2,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,6317,"(35.1279296495, -85.2082336438)",UNHBEH,LPA,4714000,47065010000.0,Physical Inactivity,35.055331,-85.333062,4714000-47065010901
3,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,1348,"(35.0684524247, -85.3130794178)",UNHBEH,LPA,4714000,47065000000.0,Physical Inactivity,36.003542,-83.939873,4740000-47093002900
4,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,4313,"(36.0123147591, -83.959901727)",UNHBEH,LPA,4740000,47093000000.0,Physical Inactivity,35.99859,-83.972815,4740000-47093003902


#### Finding the “distance” between no activity (ref: member) location to the closest obese location (ref: hotel)

In [50]:
no_activity_latlon = pd.merge(no_activity_latlon,obese_latlon[['UniqueID','lat','lon']],on='UniqueID', how='left')
no_activity_latlon.tail(5)

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,Measure,Data_Value_Unit,DataValueTypeID,...,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text,lat_x,lon_x,UniqueID,lat_y,lon_y
556,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4752006,47037020000.0,Physical Inactivity,36.15825,-86.871528,4752006-47037013202,36.15825,-86.871528
557,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4752006,47037020000.0,Physical Inactivity,35.100458,-89.939979,4748000-47157008000,35.100458,-89.939979
558,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4752006,47037010000.0,Physical Inactivity,35.165325,-90.014663,4748000-47157011200,35.165325,-90.014663
559,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4752006,47037010000.0,Physical Inactivity,36.071276,-86.938263,4752006-47037018410,36.071276,-86.938263
560,2017,TN,Tennessee,Memphis,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4748000,47157020000.0,Physical Inactivity,35.904043,-86.47891,4751560-47149040302,35.904043,-86.47891


In [51]:
# Rename the new columns as both the columns has same name, and python gets confused 
no_activity_latlon=no_activity_latlon.rename(columns = {'lat_x':'n_lat','lon_x':'n_lon','lat_y':'o_lat','lon_y':'o_lon'})
no_activity_latlon 

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,Measure,Data_Value_Unit,DataValueTypeID,...,CategoryID,MeasureId,CityFIPS,TractFIPS,Short_Question_Text,n_lat,n_lon,UniqueID,o_lat,o_lon
0,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4740000,4.709300e+10,Physical Inactivity,35.123371,-85.249368,4714000-47065010433,35.123371,-85.249368
1,2017,TN,Tennessee,Clarksville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4715160,4.712510e+10,Physical Inactivity,36.019985,-83.940415,4740000-47093004100,36.019985,-83.940415
2,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4714000,4.706501e+10,Physical Inactivity,35.055331,-85.333062,4714000-47065010901,35.055331,-85.333062
3,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4714000,4.706500e+10,Physical Inactivity,36.003542,-83.939873,4740000-47093002900,36.003542,-83.939873
4,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4740000,4.709300e+10,Physical Inactivity,35.998590,-83.972815,4740000-47093003902,35.998590,-83.972815
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4752006,4.703702e+10,Physical Inactivity,36.158250,-86.871528,4752006-47037013202,36.158250,-86.871528
557,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4752006,4.703702e+10,Physical Inactivity,35.100458,-89.939979,4748000-47157008000,35.100458,-89.939979
558,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4752006,4.703701e+10,Physical Inactivity,35.165325,-90.014663,4748000-47157011200,35.165325,-90.014663
559,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,UNHBEH,LPA,4752006,4.703701e+10,Physical Inactivity,36.071276,-86.938263,4752006-47037018410,36.071276,-86.938263


In [52]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return km
# Creating a new column to generate the output by passing lat long information to Haversine Equation
no_activity_latlon['distance'] = [haversine(no_activity_latlon.n_lon[i],no_activity_latlon.n_lat[i],no_activity_latlon.o_lon[i],no_activity_latlon.o_lat[i]) for i in range(len(no_activity_latlon))]
no_activity_latlon['distance'] = no_activity_latlon['distance'].round(decimals=3)
# Printing the data table 
no_activity_latlon

#The Distance which we got in the last column is the Distance in “Kilo Meters” between Hotel and the member Locations

Unnamed: 0,Year,StateAbbr,StateDesc,CityName,GeographicLevel,DataSource,Category,Measure,Data_Value_Unit,DataValueTypeID,...,MeasureId,CityFIPS,TractFIPS,Short_Question_Text,n_lat,n_lon,UniqueID,o_lat,o_lon,distance
0,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,LPA,4740000,4.709300e+10,Physical Inactivity,35.123371,-85.249368,4714000-47065010433,35.123371,-85.249368,0.0
1,2017,TN,Tennessee,Clarksville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,LPA,4715160,4.712510e+10,Physical Inactivity,36.019985,-83.940415,4740000-47093004100,36.019985,-83.940415,0.0
2,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,LPA,4714000,4.706501e+10,Physical Inactivity,35.055331,-85.333062,4714000-47065010901,35.055331,-85.333062,0.0
3,2017,TN,Tennessee,Chattanooga,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,LPA,4714000,4.706500e+10,Physical Inactivity,36.003542,-83.939873,4740000-47093002900,36.003542,-83.939873,0.0
4,2017,TN,Tennessee,Knoxville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,LPA,4740000,4.709300e+10,Physical Inactivity,35.998590,-83.972815,4740000-47093003902,35.998590,-83.972815,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,LPA,4752006,4.703702e+10,Physical Inactivity,36.158250,-86.871528,4752006-47037013202,36.158250,-86.871528,0.0
557,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,LPA,4752006,4.703702e+10,Physical Inactivity,35.100458,-89.939979,4748000-47157008000,35.100458,-89.939979,0.0
558,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,LPA,4752006,4.703701e+10,Physical Inactivity,35.165325,-90.014663,4748000-47157011200,35.165325,-90.014663,0.0
559,2017,TN,Tennessee,Nashville,Census Tract,BRFSS,Unhealthy Behaviors,No leisure-time physical activity among adults...,%,CrdPrv,...,LPA,4752006,4.703701e+10,Physical Inactivity,36.071276,-86.938263,4752006-47037018410,36.071276,-86.938263,0.0


In [53]:
no_activity_latlon.to_csv('no_activity_latlon.csv') 

In [54]:
no_activity_latlon['distance'].describe()

count    561.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: distance, dtype: float64