# Extract

In [24]:
# Use operating system library to get paths
import os
path_current = os.getcwd()
path_parent = os.path.dirname(path_current)
path_data = os.path.join(path_parent, 'data')
path_est = os.path.join(path_data, 'establishments.xlsx')
path_neigh = os.path.join(path_data, 'neighborhoods.csv')
path_output = os.path.join(path_data, 'complete.csv')

In [25]:
import pandas as pd
coffee = pd.read_excel(path_est, sheet_name='coffee', dtype={'zipcode':str})
bars = pd.read_excel(path_est, sheet_name='bars', dtype={'zipcode':str})
neighborhoods = pd.read_csv(path_neigh, dtype={'zipcode':str})

# Explore

In [None]:
# Options that allow you to see about df
coffee.info()
coffee.dtypes
coffee.columns
len(coffee)
coffee.describe()
coffee.head(5)
coffee.tail(5)

# Transform

In [26]:
# Clean column names
for df in [coffee, bars]:
    # Convert column names to ASCII characters
    df.columns = [x.encode("ascii", "ignore").decode() for x in df.columns]
    # Rename columns
    df = df.rename(columns={'zip code':'zipcode'})

In [27]:
# Change datatype (str, int32, int64, float32, object, categorical)
for df in [coffee, bars]:
    df = df.astype({'zipcode': str})

In [28]:
# Concatenate all rows
df = pd.concat([bars, coffee], axis=0, ignore_index=True)

In [29]:
# Set values using .loc
condition_bar = df['name'].isin(bars['name'])
condition_coffee = df['name'].isin(coffee['name'])
df.loc[condition_bar, 'serves_alcohol'] = True
df.loc[condition_coffee, 'serves_coffee'] = True

In [30]:
# Clean values
df['zipcode'] = df['zipcode'].str[:5]

In [31]:
# Merge different datasets
df = df.merge(neighborhoods, how='left', on='zipcode')

In [32]:
# Handle duplicates
df = df.drop_duplicates(subset='name', ignore_index=True)

In [33]:
df

Unnamed: 0,name,dob_str,dob_mmddyy,dob_excel,address,city,zipcode,serves_alcohol,serves_coffee,neighborhood
0,Under the Volcano,"September 18, 1989",1989-09-18,32769,2349 Bissonet St,"Houston, TX",77005,True,,West U
1,93 'Til,"December 21, 2020",2020-12-21,44186,1601 W Main St,"Houston, TX",77006,True,,Montrose
2,Volcano Room,"May 5, 2016",2016-05-05,42495,4650 E NASA Pkwy,"Seabrook, TX",77586,True,,Clear Lake
3,Double Trouble,"December 1, 2010",2010-12-01,40513,3622 Main St,"Houston, TX",77002,True,True,Midtown
4,Ca Phe Phin,"September 17, 2010",2010-09-17,40438,1354 E NASA Pkwy,"Houston, TX",77058,,True,Clear Lake
5,Campesino Coffee,"November 18, 2016",2016-11-18,42692,2602 Waugh Dr,"Houston, TX",77006,,True,Montrose
6,Siphon Coffee,"May 23, 2014",2014-05-23,41782,701 W Alabama St,"Houston, TX",77006,,True,Montrose


# Load

In [34]:
# CSV
df.to_csv(path_output, index=False)

In [None]:
# Excel
with pd.ExcelWriter('Results.xlsx') as writer:  
    resultA.to_excel(writer, sheet_name="resultA")
    resultB.to_excel(writer, sheet_name="resultB")