# Where are the best locations for opening a new independent yarn shop?

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [2]:
# Load the shops table; each row carries a shop plus the county fields it was joined to
shops_csv_path = '../data/df_shops_by_county.csv'
shops_df = pd.read_csv(shops_csv_path)
shops_df.head(n=2)


Unnamed: 0,address,city,id,latitude,location,longitude,name,pos_online,ravelry_retailer,shop_email,zip,country,state,geometry,index_right,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME
0,817B Regal Drive,Huntsville,9966,34.7091,"817B Regal Drive, Huntsville, Alabama",-86.5875,Fiber Art Work,True,True,fiberartwork@gmail.com,35801,United States,Alabama,POINT (-86.58750000000001 34.7091),2123,1,89,161570,1089,Madison
1,105 D Church Street,Madison,12262,34.6946,"105 D Church Street, Madison, Alabama",-86.7487,Hook A Frog Fiber & Fun,True,False,hookafrog@gmail.com,35758,United States,Alabama,POINT (-86.7487 34.6946),2123,1,89,161570,1089,Madison


In [3]:
# Keep only the state/county identifier columns; copy so later edits do not touch shops_df
county_id_cols = ['state', 'STATEFP', 'COUNTYFP', 'COUNTYNS', 'GEOID', 'NAME']
shop_county_count_df = shops_df.loc[:, county_id_cols].copy()
shop_county_count_df.head(n=2)

Unnamed: 0,state,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME
0,Alabama,1,89,161570,1089,Madison
1,Alabama,1,89,161570,1089,Madison


In [4]:
# Count rows per county GEOID. Each row of shop_county_count_df came from one
# row of shops_df, so every column ends up holding the per-county row count.
rows_by_geoid = shop_county_count_df.groupby(by='GEOID')
shop_count = rows_by_geoid.count()
shop_count

Unnamed: 0_level_0,state,STATEFP,COUNTYFP,COUNTYNS,NAME
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1003,1,1,1,1,1
1015,1,1,1,1,1
1055,1,1,1,1,1
1073,2,2,2,2,2
1079,1,1,1,1,1
...,...,...,...,...,...
56023,1,1,1,1,1
56025,2,2,2,2,2
56029,2,2,2,2,2
56033,1,1,1,1,1


In [5]:
# Preview the distinct county rows; the result is only displayed, not assigned
shop_county_count_df.drop_duplicates()

Unnamed: 0,state,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME
0,Alabama,1,89,161570,1089,Madison
4,Alabama,1,133,161592,1133,Winston
5,Alabama,1,79,161565,1079,Lawrence
6,Alabama,1,73,161562,1073,Jefferson
8,Alabama,1,97,161575,1097,Mobile
...,...,...,...,...,...,...
2088,Wyoming,56,39,1605083,56039,Teton
2089,Wyoming,56,13,1605072,56013,Fremont
2090,Wyoming,56,7,1605069,56007,Carbon
2091,Wyoming,56,33,1605081,56033,Sheridan


In [6]:
# The groupby counted every column; only one count column is needed, so keep NAME.
# Use the `columns=` keyword: passing the axis positionally (`drop(labels, 1)`)
# is deprecated in pandas 1.x and is rejected by pandas 2.0+.
shop_count = shop_count.drop(columns=['state', 'STATEFP', 'COUNTYFP', 'COUNTYNS'])
shop_count

Unnamed: 0_level_0,NAME
GEOID,Unnamed: 1_level_1
1003,1
1015,1
1055,1
1073,2
1079,1
...,...
56023,1
56025,2
56029,2
56033,1


In [7]:
# Move GEOID out of the index and back into an ordinary column so it can be used as a merge key
shop_count = shop_count.reset_index(drop=False)

In [8]:
# NAME now holds the per-GEOID row count from the groupby, so label it accordingly
shop_count = shop_count.rename({'NAME': 'count'}, axis='columns')
shop_count

Unnamed: 0,GEOID,count
0,1003,1
1,1015,1
2,1055,1
3,1073,2
4,1079,1
...,...,...
932,56023,1
933,56025,2
934,56029,2
935,56033,1


In [9]:
# Spot-check the count for a single county (GEOID 1089)
is_geoid_1089 = shop_count['GEOID'].eq(1089)
shop_count[is_geoid_1089]

Unnamed: 0,GEOID,count
5,1089,4


In [10]:
# Attach the per-county shop count to the county identifier rows, then collapse
# the repeated rows down to one row per distinct county record.
#
# This cell rebinds its own input, so drop any `count` column left over from a
# previous run before merging. Otherwise a re-run would produce
# `count_x`/`count_y`, and the later cells that read `['count']` would fail.
# `validate='many_to_one'` checks that shop_count has a single row per GEOID
# (it came from groupby('GEOID')), so the merge cannot multiply rows.
shop_county_count_df = (
    shop_county_count_df
    .drop(columns='count', errors='ignore')
    .merge(shop_count, on='GEOID', validate='many_to_one')
    .drop_duplicates()
)
shop_county_count_df

Unnamed: 0,state,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,count
0,Alabama,1,89,161570,1089,Madison,4
4,Alabama,1,133,161592,1133,Winston,1
5,Alabama,1,79,161565,1079,Lawrence,1
6,Alabama,1,73,161562,1073,Jefferson,2
8,Alabama,1,97,161575,1097,Mobile,1
...,...,...,...,...,...,...,...
2088,Wyoming,56,39,1605083,56039,Teton,1
2089,Wyoming,56,13,1605072,56013,Fremont,1
2090,Wyoming,56,7,1605069,56007,Carbon,1
2091,Wyoming,56,33,1605081,56033,Sheridan,1


In [11]:
# Allow up to 60 columns in DataFrame previews (the population file is very wide)
pd.set_option('display.max_columns', 60)

# Load the county population estimates file
population_df = pd.read_csv(
    '../data/co-est2019-alldata.csv',
    engine='python',
    encoding="ISO-8859-1",
)
population_df.head(n=2)


Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,NPOPCHG_2010,NPOPCHG_2011,NPOPCHG_2012,NPOPCHG_2013,NPOPCHG_2014,NPOPCHG_2015,NPOPCHG_2016,NPOPCHG_2017,NPOPCHG_2018,NPOPCHG_2019,BIRTHS2010,...,RNATURALINC2017,RNATURALINC2018,RNATURALINC2019,RINTERNATIONALMIG2011,RINTERNATIONALMIG2012,RINTERNATIONALMIG2013,RINTERNATIONALMIG2014,RINTERNATIONALMIG2015,RINTERNATIONALMIG2016,RINTERNATIONALMIG2017,RINTERNATIONALMIG2018,RINTERNATIONALMIG2019,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RDOMESTICMIG2016,RDOMESTICMIG2017,RDOMESTICMIG2018,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,4799069,4815588,4830081,4841799,4852347,4863525,4874486,4887681,4903185,5312,13632,16519,14493,11718,10548,11178,10961,13195,15504,14226,...,1.184225,0.943643,0.70147,0.973446,1.210028,1.046273,0.761796,0.9449,1.189188,0.618401,0.692264,0.566242,-0.395013,-0.023714,0.476276,-0.198307,-0.318543,-0.444016,0.471965,1.081522,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773,55227,54954,54727,54893,54864,55243,55390,55533,55869,176,454,-273,-227,166,-29,379,147,143,336,150,...,1.862012,2.037449,1.490099,0.072727,-0.254127,0.218816,0.127714,0.236887,-0.054492,-0.216933,-0.126214,-0.287248,5.945455,-5.971992,-4.121042,1.842729,-1.949762,4.831664,1.06659,0.667129,4.84731,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062


In [12]:
# Keep the geography codes and names plus the 2019 population estimate only
population_cols = ['SUMLEV', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'STNAME',
                   'CTYNAME', 'POPESTIMATE2019']
population_df = population_df.loc[:, population_cols].copy()
population_df.head(n=2)

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,POPESTIMATE2019
0,40,3,6,1,0,Alabama,Alabama,4903185
1,50,3,6,1,1,Alabama,Autauga County,55869


In [13]:
# Inspect the rows whose COUNTY code is 0
has_zero_county_code = population_df['COUNTY'].eq(0)
population_df[has_zero_county_code]

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,POPESTIMATE2019
0,40,3,6,1,0,Alabama,Alabama,4903185
68,40,4,9,2,0,Alaska,Alaska,731545
98,40,4,8,4,0,Arizona,Arizona,7278717
114,40,3,7,5,0,Arkansas,Arkansas,3017804
190,40,4,9,6,0,California,California,39512223
249,40,4,8,8,0,Colorado,Colorado,5758736
314,40,1,1,9,0,Connecticut,Connecticut,3565287
323,40,3,5,10,0,Delaware,Delaware,973764
327,40,3,5,11,0,District of Columbia,District of Columbia,705749
329,40,3,5,12,0,Florida,Florida,21477737


In [14]:
# Remove the state-level population rows (COUNTY == 0) so that only
# county-level rows remain.

# index labels of the state-level rows
state_pop = population_df.index[population_df['COUNTY'].eq(0)]
pop_county_df = population_df.drop(index=state_pop)
pop_county_df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,POPESTIMATE2019
1,50,3,6,1,1,Alabama,Autauga County,55869
2,50,3,6,1,3,Alabama,Baldwin County,223234
3,50,3,6,1,5,Alabama,Barbour County,24686
4,50,3,6,1,7,Alabama,Bibb County,22394
5,50,3,6,1,9,Alabama,Blount County,57826


In [15]:
# Check row count, dtypes and non-null counts after removing the state-level rows
pop_county_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 1 to 3192
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   SUMLEV           3142 non-null   int64 
 1   REGION           3142 non-null   int64 
 2   DIVISION         3142 non-null   int64 
 3   STATE            3142 non-null   int64 
 4   COUNTY           3142 non-null   int64 
 5   STNAME           3142 non-null   object
 6   CTYNAME          3142 non-null   object
 7   POPESTIMATE2019  3142 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 220.9+ KB


In [16]:
# Spot-check the county rows of a single state
is_alabama = pop_county_df['STNAME'].eq('Alabama')
pop_county_df[is_alabama]

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,POPESTIMATE2019
1,50,3,6,1,1,Alabama,Autauga County,55869
2,50,3,6,1,3,Alabama,Baldwin County,223234
3,50,3,6,1,5,Alabama,Barbour County,24686
4,50,3,6,1,7,Alabama,Bibb County,22394
5,50,3,6,1,9,Alabama,Blount County,57826
...,...,...,...,...,...,...,...,...
63,50,3,6,1,125,Alabama,Tuscaloosa County,209355
64,50,3,6,1,127,Alabama,Walker County,63521
65,50,3,6,1,129,Alabama,Washington County,16326
66,50,3,6,1,131,Alabama,Wilcox County,10373


In [17]:
# Left-join the shop counts onto the county population table so every county
# keeps its row; counties with no matching shop record get NaN in the shop columns.
population_keys = ['STATE', 'COUNTY', 'STNAME']
shop_keys = ['STATEFP', 'COUNTYFP', 'state']
popcounty_merge_df = pop_county_df.merge(
    shop_county_count_df,
    how='left',
    left_on=population_keys,
    right_on=shop_keys,
)
popcounty_merge_df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,POPESTIMATE2019,state,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,count
0,50,3,6,1,1,Alabama,Autauga County,55869,,,,,,,
1,50,3,6,1,3,Alabama,Baldwin County,223234,Alabama,1.0,3.0,161527.0,1003.0,Baldwin,1.0
2,50,3,6,1,5,Alabama,Barbour County,24686,,,,,,,
3,50,3,6,1,7,Alabama,Bibb County,22394,,,,,,,
4,50,3,6,1,9,Alabama,Blount County,57826,,,,,,,


In [18]:
# Check the non-null counts to see how many counties picked up shop data from the left join
popcounty_merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3142 entries, 0 to 3141
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   SUMLEV           3142 non-null   int64  
 1   REGION           3142 non-null   int64  
 2   DIVISION         3142 non-null   int64  
 3   STATE            3142 non-null   int64  
 4   COUNTY           3142 non-null   int64  
 5   STNAME           3142 non-null   object 
 6   CTYNAME          3142 non-null   object 
 7   POPESTIMATE2019  3142 non-null   int64  
 8   state            935 non-null    object 
 9   STATEFP          935 non-null    float64
 10  COUNTYFP         935 non-null    float64
 11  COUNTYNS         935 non-null    float64
 12  GEOID            935 non-null    float64
 13  NAME             935 non-null    object 
 14  count            935 non-null    float64
dtypes: float64(5), int64(6), object(4)
memory usage: 392.8+ KB


In [19]:
# Residents per existing shop; NaN wherever the county has no shop count
county_population = popcounty_merge_df['POPESTIMATE2019']
popcounty_merge_df['shop_pop'] = county_population.div(popcounty_merge_df['count'])

In [20]:
# determine median, mean of population per current shop
# (describe() skips NaN, so only counties that have a shop count contribute)
popcounty_merge_df['shop_pop'].describe()


count    9.350000e+02
mean     1.141100e+05
std      1.688900e+05
min      5.915000e+02
25%      2.519150e+04
50%      5.436600e+04
75%      1.344078e+05
max      2.253858e+06
Name: shop_pop, dtype: float64

shop population
- count    935
- mean     114110
- std      168890
- min      591
- 25%      25191
- 50%      54366
- 75%      134407
- max      2,253,858

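The median (about 54,366 people per shop) seems a more reasonable measure than the mean of the population likely to support a shop, because the mean is pulled upward by a few very large counties. It is definitely not the whole picture, but for my analysis this is a good place to start.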

In [21]:
# Counties without a shop count have a null shop_pop; use the county's full population there
popcounty_merge_df['shop_pop'] = popcounty_merge_df['shop_pop'].fillna(
    value=popcounty_merge_df['POPESTIMATE2019']
)

In [28]:
# Temporary inspection cell: delete after adding calculated columns.
# NOTE(review): the saved output shows avail_pop and shop_potential, which are
# only assigned in cells further down, so this cell was last executed out of
# order; on a fresh top-to-bottom run those columns will not appear here yet.
popcounty_merge_df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,POPESTIMATE2019,state,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,count,shop_pop,avail_pop,shop_potential
0,50,3,6,1,1,Alabama,Autauga County,55869,,,,,,,,55869.0,55869.0,1.027646
1,50,3,6,1,3,Alabama,Baldwin County,223234,Alabama,1.0,3.0,161527.0,1003.0,Baldwin,1.0,223234.0,168868.0,3.106133
2,50,3,6,1,5,Alabama,Barbour County,24686,,,,,,,,24686.0,24686.0,0.454071
3,50,3,6,1,7,Alabama,Bibb County,22394,,,,,,,,22394.0,22394.0,0.411912
4,50,3,6,1,9,Alabama,Blount County,57826,,,,,,,,57826.0,57826.0,1.063643


In [23]:
# Population left over after allotting 54366 residents (the 50% value reported
# by describe() on shop_pop) to each existing shop; NaN where count is null
population_claimed = popcounty_merge_df['count'].mul(54366)
popcounty_merge_df['avail_pop'] = popcounty_merge_df['POPESTIMATE2019'].sub(population_claimed)

In [25]:
# Where avail_pop is null (no shop count for the county), the whole population is available
popcounty_merge_df['avail_pop'] = popcounty_merge_df['avail_pop'].fillna(
    value=popcounty_merge_df['POPESTIMATE2019']
)

In [27]:
# How many additional shops the available population could support, at the
# median requirement of 54366 residents per shop
popcounty_merge_df['shop_potential'] = popcounty_merge_df['avail_pop'].div(54366)