## Start date : July 10th, 2023 

### Data Exploration of 2018, 2019, 2020, 2022 Business Data

Author : Kenthia

Goals:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
pd.set_option('display.max_columns', None)
from shapely.geometry import Point
import fiona
import math
import plotly.express as px
from thefuzz import fuzz
from thefuzz import process

# 2018 DATA

In [None]:
# Load the 2018 grocery business records, discard the leftover CSV index
# column, and remove rows whose STATE is one of the listed territory codes.

territories = ['PR', 'FM', 'MP', 'GU', 'VI', 'MH']
stores_2018 = pd.read_csv(
    '/srv/data/my_shared_data_folder/rafi/2018_Business_Academic_QCQ_grocery.csv'
).drop(columns=['Unnamed: 0'])
stores_2018 = stores_2018.drop(
    index=stores_2018.loc[stores_2018['STATE'].isin(territories)].index
)
stores_2018.head()

In [None]:
# Cleaned brand lists: string matches of supermarkets/companies that are
# owned by each parent corporation.
# NOTE: Stores were validated through manual checks, so expect a margin of error.
# NOTE: It was unrealistic to inspect every generated store by hand, so some
#       entries may not actually belong to the parent corporation.
# NOTE: 'STOP & SHOP SUPERMARKET' was missing from the string match and was
#       added manually.

# Kroger (final)
kroger_clean = [
    "BAKER'S",
    "BAKERS",
    "CITY MARKET",
    "DILLONS",
    "FOOD 4 LESS",
    "FOODS CO",
    "FRED MEYER",
    "FRY'S FOOD",
    "GERBES SUPER MARKET",
    "GERBES SUPER MARKETS",
    "GERBES SUPERMARKET",
    "HARRIS TEETER",
    "JAYC",
    "KING SOOPERS",
    "KROGER",
    "MARIANO'S",
    "METRO MARKET",
    "PAY LESS SUPER MARKETS",
    "PICK'N SAVE",
    "QFC",
    "RALPHS",
    "RULER FOODS",
    "SMITH'S",
]

# Ahold Delhaize (final)
# NOTE(review): 'GIANT EAGLE' looks like a separately owned chain — confirm it
# belongs under this parent before relying on the Ahold Delhaize counts.
ahold_clean = [
    "FOOD LION",
    "GIANT",
    "GIANT EAGLE",
    "GIANT FOOD",
    "HANNAFORD",
    "STOP & SHOP",
    "BFRESH MARKET",
    "EASTSIDE MARKETPLACE",
    "STOP & SHOP SUPERMARKET",
]

# Walmart (final)
walmart_clean = [
    "WALMART GROCERY PICKUP",
    "WALMART GROCERY PKUP-DELIVERY",
    "SAM'S CLUB DELI",
]

# Costco (final)
costco_clean = ["COSTCO DELI"]

# Albertsons (final)
albertsons_clean = [
    "ACME MARKETS",
    "ALBERTSONS",
    "ALBERTSONS MARKET",
    "AMIGOS UNITED",
    "ANDRONICO'S COMMUNITY MARKETS",
    "CARRS/SAFEWAY",
    "CARRS SAFEWAY",
    "HAGGEN",
    "HAGGEN FOOD & PHARMACY",
    "HAGGEN FOOD",
    "JEWEL-OSCO",
    "KINGS FOOD MARKETS",
    "KINGS FOOD MARKET",
    "MARKET STREET",
    "PAK 'N SAVE",
    "PAVILIONS",
    "RANDALLS",
    "SAFEWAY",
    "STAR MARKET",
    "TOM THUMB",
    "UNITED SUPERMARKETS",
    "VONS",
    "SUPER SAVER",
    "CARRS QUALITY CTR PALMER SHPG",
    "SAAR'S SUPER SAVER FOODS",
]

In [None]:
# Defined here so the 'PARENT NAME' column exists before the store counts
# are computed on grocery_naics below.


def parent_name(row):
    '''Return the parent corporation label for one store row.

    Reads row['COMPANY'] and looks for an exact match in the cleaned brand
    lists. The lists are consulted in a fixed order and the first one that
    contains the company decides the label; a company found in none of them
    is labelled 'OTHER'.
    '''
    brand = row['COMPANY']

    # (label, brand list) pairs, in the order they are checked.
    parents = (
        ('WALMART', walmart_clean),
        ('COSTCO', costco_clean),
        ('AHOLD DELHAIZE', ahold_clean),
        ('KROGER', kroger_clean),
        ('ALBERTSONS', albertsons_clean),
    )
    for label, brands in parents:
        if brand in brands:
            return label
    return 'OTHER'

# Row-wise application: one label per store record.
stores_2018['PARENT NAME'] = stores_2018.apply(parent_name, axis=1)

In [None]:
# Keep rows whose primary NAICS code, read as text, begins with "44511"
# (SUPERMARKETS/OTHER GROCERY (EXC CONVENIENCE) STRS).

grocery_naics_2018 = stores_2018.loc[
    stores_2018['PRIMARY NAICS CODE'].astype(str).str.startswith('44511')
]
grocery_naics_2018

# There are 75227 supermarkets/related companies in the entire US in 2018

In [None]:
# Distinct primary NAICS codes present among the 2018 grocery rows.
grocery_type = (
    grocery_naics_2018["PRIMARY NAICS CODE"]
    .unique()
    .tolist()
)
grocery_type

In [None]:
# Stores per parent corporation for 2018. The count is of non-null CITY
# values, so a row with a missing CITY is not counted.
store_count_2018 = (
    grocery_naics_2018
    .groupby('PARENT NAME')['CITY']
    .count()
    .reset_index(name='STORE COUNT')
)
store_count_2018

# 2019 DATA

In [None]:
# Load the 2019 grocery business records, discard the leftover CSV index
# column, and remove rows whose STATE is one of the listed territory codes.

territories = ['PR', 'FM', 'MP', 'GU', 'VI', 'MH']
stores_2019 = pd.read_csv(
    '/srv/data/my_shared_data_folder/rafi/2019_Business_Academic_QCQ_grocery.csv'
).drop(columns=['Unnamed: 0'])
stores_2019 = stores_2019.drop(
    index=stores_2019.loc[stores_2019['STATE'].isin(territories)].index
)
stores_2019.head()

In [None]:
# Defined here so the 'PARENT NAME' column exists before the store counts
# are computed on grocery_naics below.


def parent_name(row):
    '''Return the parent corporation label for one store row.

    Reads row['COMPANY'] and looks for an exact match in the cleaned brand
    lists. The lists are consulted in a fixed order and the first one that
    contains the company decides the label; a company found in none of them
    is labelled 'OTHER'.
    '''
    brand = row['COMPANY']

    # (label, brand list) pairs, in the order they are checked.
    parents = (
        ('WALMART', walmart_clean),
        ('COSTCO', costco_clean),
        ('AHOLD DELHAIZE', ahold_clean),
        ('KROGER', kroger_clean),
        ('ALBERTSONS', albertsons_clean),
    )
    for label, brands in parents:
        if brand in brands:
            return label
    return 'OTHER'

# Row-wise application: one label per store record.
stores_2019['PARENT NAME'] = stores_2019.apply(parent_name, axis=1)

In [None]:
# Keep rows whose primary NAICS code, read as text, begins with "44511"
# (SUPERMARKETS/OTHER GROCERY (EXC CONVENIENCE) STRS).

grocery_naics_2019 = stores_2019.loc[
    stores_2019['PRIMARY NAICS CODE'].astype(str).str.startswith('44511')
]
grocery_naics_2019

# There are 81480 supermarkets/related companies in the entire US in 2019

In [None]:
# Distinct primary NAICS codes present among the 2019 grocery rows.
grocery_type = (
    grocery_naics_2019["PRIMARY NAICS CODE"]
    .unique()
    .tolist()
)
grocery_type

In [None]:
# Stores per parent corporation for 2019. The count is of non-null CITY
# values, so a row with a missing CITY is not counted.
store_count_2019 = (
    grocery_naics_2019
    .groupby('PARENT NAME')['CITY']
    .count()
    .reset_index(name='STORE COUNT')
)
store_count_2019

# 2020 DATA

In [None]:
# Load the 2020 grocery business records, discard the leftover CSV index
# column, and remove rows whose STATE is one of the listed territory codes.

territories = ['PR', 'FM', 'MP', 'GU', 'VI', 'MH']
stores_2020 = pd.read_csv(
    '/srv/data/my_shared_data_folder/rafi/2020_Business_Academic_QCQ_grocery.csv'
).drop(columns=['Unnamed: 0'])
stores_2020 = stores_2020.drop(
    index=stores_2020.loc[stores_2020['STATE'].isin(territories)].index
)
stores_2020.head()

In [None]:
# Defined here so the 'PARENT NAME' column exists before the store counts
# are computed on grocery_naics below.


def parent_name(row):
    '''Return the parent corporation label for one store row.

    Reads row['COMPANY'] and looks for an exact match in the cleaned brand
    lists. The lists are consulted in a fixed order and the first one that
    contains the company decides the label; a company found in none of them
    is labelled 'OTHER'.
    '''
    brand = row['COMPANY']

    # (label, brand list) pairs, in the order they are checked.
    parents = (
        ('WALMART', walmart_clean),
        ('COSTCO', costco_clean),
        ('AHOLD DELHAIZE', ahold_clean),
        ('KROGER', kroger_clean),
        ('ALBERTSONS', albertsons_clean),
    )
    for label, brands in parents:
        if brand in brands:
            return label
    return 'OTHER'

# Row-wise application: one label per store record.
stores_2020['PARENT NAME'] = stores_2020.apply(parent_name, axis=1)

In [None]:
# Keep rows whose primary NAICS code, read as text, begins with "44511"
# (SUPERMARKETS/OTHER GROCERY (EXC CONVENIENCE) STRS).

grocery_naics_2020 = stores_2020.loc[
    stores_2020['PRIMARY NAICS CODE'].astype(str).str.startswith('44511')
]
grocery_naics_2020

# There are 76579 supermarkets/related companies in the entire US in 2020

In [None]:
# Distinct primary NAICS codes present among the 2020 grocery rows.
grocery_type = (
    grocery_naics_2020["PRIMARY NAICS CODE"]
    .unique()
    .tolist()
)
grocery_type

In [None]:
# Stores per parent corporation for 2020. The count is of non-null CITY
# values, so a row with a missing CITY is not counted.
store_count_2020 = (
    grocery_naics_2020
    .groupby('PARENT NAME')['CITY']
    .count()
    .reset_index(name='STORE COUNT')
)
store_count_2020

# 2022 DATA

In [None]:
# Load the 2022 grocery business records, discard the leftover CSV index
# column, and remove rows whose STATE is one of the listed territory codes.

territories = ['PR', 'FM', 'MP', 'GU', 'VI', 'MH']
stores_2022 = pd.read_csv(
    '/srv/data/my_shared_data_folder/rafi/2022_Business_Academic_QCQ_grocery.csv'
).drop(columns=['Unnamed: 0'])
stores_2022 = stores_2022.drop(
    index=stores_2022.loc[stores_2022['STATE'].isin(territories)].index
)
stores_2022.head()

In [None]:
# Defined here so the 'PARENT NAME' column exists before the store counts
# are computed on grocery_naics below.


def parent_name(row):
    '''Return the parent corporation label for one store row.

    Reads row['COMPANY'] and looks for an exact match in the cleaned brand
    lists. The lists are consulted in a fixed order and the first one that
    contains the company decides the label; a company found in none of them
    is labelled 'OTHER'.
    '''
    brand = row['COMPANY']

    # (label, brand list) pairs, in the order they are checked.
    parents = (
        ('WALMART', walmart_clean),
        ('COSTCO', costco_clean),
        ('AHOLD DELHAIZE', ahold_clean),
        ('KROGER', kroger_clean),
        ('ALBERTSONS', albertsons_clean),
    )
    for label, brands in parents:
        if brand in brands:
            return label
    return 'OTHER'

# Row-wise application: one label per store record.
stores_2022['PARENT NAME'] = stores_2022.apply(parent_name, axis=1)

In [None]:
# Keep rows whose primary NAICS code, read as text, begins with "44511"
# (SUPERMARKETS/OTHER GROCERY (EXC CONVENIENCE) STRS).

grocery_naics_2022 = stores_2022.loc[
    stores_2022['PRIMARY NAICS CODE'].astype(str).str.startswith('44511')
]
grocery_naics_2022

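# TODO: record the 2022 count from the output above; this note was carried over unchanged from the 2020 section (76579 supermarkets/related companies in the entire US in 2020)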

In [None]:
# Distinct primary NAICS codes present among the 2022 grocery rows.
grocery_type = (
    grocery_naics_2022["PRIMARY NAICS CODE"]
    .unique()
    .tolist()
)
grocery_type

In [None]:
# Stores per parent corporation for 2022. The count is of non-null CITY
# values, so a row with a missing CITY is not counted.
store_count_2022 = (
    grocery_naics_2022
    .groupby('PARENT NAME')['CITY']
    .count()
    .reset_index(name='STORE COUNT')
)
store_count_2022

## LET'S EXPLORE SOME VISUALIZATIONS 

In [None]:
# Choropleth of Albertsons-owned stores per state (2022).
#
# The previous version of this cell did not parse (unclosed `[` in the
# `locations=` argument) and passed a DataFrame slice where px.choropleth
# expects a column of location codes. A choropleth needs one value per
# location, so the stores are first aggregated to a per-state count.
# NOTE(review): intent inferred from the title 'Albertsons in the US' —
# confirm that a per-state Albertsons store count is the desired map.
# Assumes STATE holds two-letter USPS codes, as required by
# locationmode='USA-states' — TODO confirm.
albertsons_by_state = (
    stores_2022.loc[stores_2022['PARENT NAME'] == 'ALBERTSONS']
    .groupby('STATE')
    .size()
    .reset_index(name='STORE COUNT')
)

fig = px.choropleth(
    albertsons_by_state,
    locations='STATE',
    locationmode='USA-states',
    color='STORE COUNT',
    scope='usa',
    color_continuous_scale="Viridis",
    title='Albertsons in the US',
)

fig.show()