In [1]:
# Source file: https://www.kaggle.com/starbucks/store-locations

In [9]:
# Import dependencies:

import pandas as pd
from sqlalchemy import create_engine

In [10]:
# Read in csv:
raw_df = pd.read_csv('Resources/starbucks.csv')

# Take a look:
raw_df.head()

Unnamed: 0,Brand,Store Number,Store Name,Ownership Type,Street Address,City,State/Province,Country,Postcode,Phone Number,Timezone,Longitude,Latitude
0,Starbucks,47370-257954,"Meritxell, 96",Licensed,"Av. Meritxell, 96",Andorra la Vella,7,AD,AD500,376818720.0,GMT+1:00 Europe/Andorra,1.53,42.51
1,Starbucks,22331-212325,Ajman Drive Thru,Licensed,"1 Street 69, Al Jarf",Ajman,AJ,AE,,,GMT+04:00 Asia/Dubai,55.47,25.42
2,Starbucks,47089-256771,Dana Mall,Licensed,Sheikh Khalifa Bin Zayed St.,Ajman,AJ,AE,,,GMT+04:00 Asia/Dubai,55.47,25.39
3,Starbucks,22126-218024,Twofour 54,Licensed,Al Salam Street,Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.38,24.48
4,Starbucks,17127-178586,Al Ain Tower,Licensed,"Khaldiya Area, Abu Dhabi Island",Abu Dhabi,AZ,AE,,,GMT+04:00 Asia/Dubai,54.54,24.51


In [11]:
raw_df.dtypes


Brand              object
Store Number       object
Store Name         object
Ownership Type     object
Street Address     object
City               object
State/Province     object
Country            object
Postcode           object
Phone Number       object
Timezone           object
Longitude         float64
Latitude          float64
dtype: object

In [12]:
# Strip out countries other than US:

us_df = raw_df.loc[raw_df['Country'] == 'US']
us_df.head()

Unnamed: 0,Brand,Store Number,Store Name,Ownership Type,Street Address,City,State/Province,Country,Postcode,Phone Number,Timezone,Longitude,Latitude
11964,Starbucks,3513-125945,Safeway-Anchorage #1809,Licensed,5600 Debarr Rd Ste 9,Anchorage,AK,US,995042300,907-339-0900,GMT-09:00 America/Anchorage,-149.78,61.21
11965,Starbucks,74352-84449,Safeway-Anchorage #2628,Licensed,1725 Abbott Rd,Anchorage,AK,US,995073444,907-339-2800,GMT-09:00 America/Anchorage,-149.84,61.14
11966,Starbucks,12449-152385,Safeway - Anchorage #1813,Licensed,1501 Huffman Rd,Anchorage,AK,US,995153596,907-339-1300,GMT-09:00 America/Anchorage,-149.85,61.11
11967,Starbucks,24936-233524,100th & C St - Anchorage,Company Owned,"320 W. 100th Ave, 100, Southgate Shopping Ctr ...",Anchorage,AK,US,99515,(907) 227-9631,GMT-09:00 America/Anchorage,-149.89,61.13
11968,Starbucks,8973-85630,Old Seward & Diamond,Company Owned,1005 E Dimond Blvd,Anchorage,AK,US,995152050,907-344-4160,GMT-09:00 America/Anchorage,-149.86,61.14


In [13]:
# Create new dataframe with selected columns:
new_us_df = us_df[['Store Number', 'Street Address', 'City', 'State/Province', 'Country', 'Postcode', 'Longitude', 'Latitude']].copy()
new_us_df.head()


Unnamed: 0,Store Number,Street Address,City,State/Province,Country,Postcode,Longitude,Latitude
11964,3513-125945,5600 Debarr Rd Ste 9,Anchorage,AK,US,995042300,-149.78,61.21
11965,74352-84449,1725 Abbott Rd,Anchorage,AK,US,995073444,-149.84,61.14
11966,12449-152385,1501 Huffman Rd,Anchorage,AK,US,995153596,-149.85,61.11
11967,24936-233524,"320 W. 100th Ave, 100, Southgate Shopping Ctr ...",Anchorage,AK,US,99515,-149.89,61.13
11968,8973-85630,1005 E Dimond Blvd,Anchorage,AK,US,995152050,-149.86,61.14


In [14]:
# Rename columns to make them SQL friendly:

new_us_df = new_us_df.rename(columns = {'Store Number': 'store_number',
                                        'Street Address': 'address',
                                        'City': 'city',
                                        'State/Province': 'state',
                                        'Country': 'country',
                                        'Postcode': 'zipcode',
                                        'Longitude': 'longitude',
                                        'Latitude': 'latitude'
                                       })

new_us_df.head()

Unnamed: 0,store_number,address,city,state,country,zipcode,longitude,latitude
11964,3513-125945,5600 Debarr Rd Ste 9,Anchorage,AK,US,995042300,-149.78,61.21
11965,74352-84449,1725 Abbott Rd,Anchorage,AK,US,995073444,-149.84,61.14
11966,12449-152385,1501 Huffman Rd,Anchorage,AK,US,995153596,-149.85,61.11
11967,24936-233524,"320 W. 100th Ave, 100, Southgate Shopping Ctr ...",Anchorage,AK,US,99515,-149.89,61.13
11968,8973-85630,1005 E Dimond Blvd,Anchorage,AK,US,995152050,-149.86,61.14


In [15]:
# Dropping NA values for zip wasn't needed, since there were 13608 rows before and after.

#len(new_us_df)
#13608
#new_us_df['zipcode'] = new_us_df['zipcode'].dropna()
#len(new_us_df)
#13608

In [16]:
# Some zipcodes have the extra 4 digits on the right side, for 9 total.
# Remove four extra digits from string – Convert Zip+4 to Zip code:
# (source = https://stackoverflow.com/questions/44776115/remove-four-last-digits-from-string-convert-zip4-to-zip-code/44776170)
#  Ex.: df['collection_status'] = df['zip_code'].str[:5]

new_us_df['zipcode'] = new_us_df['zipcode'].str[:5]
new_us_df


Unnamed: 0,store_number,address,city,state,country,zipcode,longitude,latitude
11964,3513-125945,5600 Debarr Rd Ste 9,Anchorage,AK,US,99504,-149.78,61.21
11965,74352-84449,1725 Abbott Rd,Anchorage,AK,US,99507,-149.84,61.14
11966,12449-152385,1501 Huffman Rd,Anchorage,AK,US,99515,-149.85,61.11
11967,24936-233524,"320 W. 100th Ave, 100, Southgate Shopping Ctr ...",Anchorage,AK,US,99515,-149.89,61.13
11968,8973-85630,1005 E Dimond Blvd,Anchorage,AK,US,99515,-149.86,61.14
...,...,...,...,...,...,...,...,...
25567,74385-87621,554 N 3rd St,Laramie,WY,US,82072,-105.59,41.32
25568,73320-24375,3112 E. Grand,Laramie,WY,US,82070,-105.56,41.31
25569,22425-219024,3021 Grand Ave,Laramie,WY,US,82070,-105.56,41.31
25570,10849-103163,118 Westland Way,Rock Springs,WY,US,82901,-109.25,41.58


In [17]:
# Some zipcodes were only 4 digits, because the leading 0 got dropped somewhere.
# Show the zip for a store that didn't have a leading zero:

new_us_df.loc[new_us_df['store_number'] == '22967-176905']

Unnamed: 0,store_number,address,city,state,country,zipcode,longitude,latitude
16055,22967-176905,200 Bloomfield Ave,West Hartford,CT,US,6117,-72.72,41.79


In [18]:
# Add a leading zero to zip codes that were missing it and were only 4 digits:
# (source: https://www.datasciencemadesimple.com/add-leading-preceding-zeros-python/)

new_us_df['zipcode']=new_us_df['zipcode'].apply(lambda x: '{0:0>5}'.format(x))

In [19]:
# Show that a zipcode which previously had 4 digits now has 5, with a leading 0:

new_us_df.loc[new_us_df['store_number'] == '22967-176905']

Unnamed: 0,store_number,address,city,state,country,zipcode,longitude,latitude
16055,22967-176905,200 Bloomfield Ave,West Hartford,CT,US,6117,-72.72,41.79


In [20]:
new_us_df

Unnamed: 0,store_number,address,city,state,country,zipcode,longitude,latitude
11964,3513-125945,5600 Debarr Rd Ste 9,Anchorage,AK,US,99504,-149.78,61.21
11965,74352-84449,1725 Abbott Rd,Anchorage,AK,US,99507,-149.84,61.14
11966,12449-152385,1501 Huffman Rd,Anchorage,AK,US,99515,-149.85,61.11
11967,24936-233524,"320 W. 100th Ave, 100, Southgate Shopping Ctr ...",Anchorage,AK,US,99515,-149.89,61.13
11968,8973-85630,1005 E Dimond Blvd,Anchorage,AK,US,99515,-149.86,61.14
...,...,...,...,...,...,...,...,...
25567,74385-87621,554 N 3rd St,Laramie,WY,US,82072,-105.59,41.32
25568,73320-24375,3112 E. Grand,Laramie,WY,US,82070,-105.56,41.31
25569,22425-219024,3021 Grand Ave,Laramie,WY,US,82070,-105.56,41.31
25570,10849-103163,118 Westland Way,Rock Springs,WY,US,82901,-109.25,41.58


In [22]:
new_us_df.dtypes

store_number     object
address          object
city             object
state            object
country          object
zipcode          object
longitude       float64
latitude        float64
dtype: object

In [23]:
# Connect to local database:

rds_connection_string = "postgres:postgres@localhost:5432/starbucks_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [24]:
# Check for tables:
engine.table_names()

['income_data', 'starbucks_data']

In [26]:
# Use pandas to load csv converted DataFrame into database
# (using "locations" as the name of the table for the Starbucks locations):

new_us_df.to_sql(name='starbucks_data', con=engine, if_exists='append', index=False)

In [27]:
# Confirm data has been added by querying the starbucks_data table:

pd.read_sql_query('select * from starbucks_data', con=engine).head()

Unnamed: 0,store_number,address,city,state,country,zipcode,longitude,latitude
0,3513-125945,5600 Debarr Rd Ste 9,Anchorage,AK,US,99504,-149.78,61.21
1,74352-84449,1725 Abbott Rd,Anchorage,AK,US,99507,-149.84,61.14
2,12449-152385,1501 Huffman Rd,Anchorage,AK,US,99515,-149.85,61.11
3,24936-233524,"320 W. 100th Ave, 100, Southgate Shopping Ctr ...",Anchorage,AK,US,99515,-149.89,61.13
4,8973-85630,1005 E Dimond Blvd,Anchorage,AK,US,99515,-149.86,61.14
