# Building an ETL Pipeline
---

### Step 0: Install the required python packages

In [1]:
#pip install --upgrade sodapy

In [2]:
#pip install --upgrade db-dtypes

In [3]:
#pip install --upgrade pyarrow

In [4]:
#pip install --upgrade google-cloud-bigquery

#### Now, on the top of your Notebook select "Kernel" -> "Restart and Clear Output"
Then, continue from the next cell

### Step 1: Setup your NYC Open Data variables (ACTION REQUIRED HERE)

In [5]:
# import libraries
import pandas as pd
import numpy as np
from sodapy import Socrata
from google.cloud import bigquery
from google.oauth2 import service_account

In [6]:
# setup the host name for the API endpoint (the https:// part will be added automatically)
# only need to change this if you are not using NYC Open Data
data_url = 'data.cityofnewyork.us'

In [7]:
# setup the data set at the API endpoint (311 data in this case)
# For example: https://data.cityofnewyork.us/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9.json
# would give us 'erm2-nwe9'

# https://data.cityofnewyork.us/resource/qcdj-rwhu.json
sidewalk_dataset = 'qcdj-rwhu'

# https://data.cityofnewyork.us/resource/w7w3-xahh.json
businesses_dataset = 'w7w3-xahh'

In [8]:
# Setup your App Token, which you created in Week 6
# You can find your app token by logging into: https://data.cityofnewyork.us/profile/edit/developer_settings
#app_token = 'your app token here'
app_token = ""

In [9]:
# run this cell to setup your Socrata client that connects python to NYC Open Data

# create the client that points to the API endpoint
nyc_open_data_client = Socrata(data_url, app_token, timeout = 200)
print(f"nyc open data client name is: {nyc_open_data_client}")
print(f"nyc open data client data type is: {type(nyc_open_data_client)}")

nyc open data client name is: <sodapy.socrata.Socrata object at 0x7fac892f4430>
nyc open data client data type is: <class 'sodapy.socrata.Socrata'>


### Step 2: Setup your Google BigQuery variables (ACTION REQUIRED HERE)

If you did not create a key path in class on 3/30/22 (which created a json file on your computer), you must create one to continue:
1. Open BigQuery
2. On the top-left, click on the Navigation Menu
3. In the Navigation Menu, go to "IAM & Admin" -> "Sercive Accounts"
4. On the top of the page, click on "Create Service Account"
5. Account name: cis9440-spring2022
6. Click create and continue
7. Set Role to Owner
8. Click Continue
9. Click Done
10. In the new row for your Service Account, click on the 3 dots in the "Action" column. Select "Manage Keys"
11. Click "Add Key", then "Create New Key". Select the "JSON" radio button and click "Create"
12. In the next cell, set key_path to the exact file path of your new JSON file. For example, it will look like r'C:\Users\Downloads\cis9440-324315-70048a5e1138.json'

In [10]:
# CHANGE THIS TO YOUR FILE PATH
key_path = r'/Users/macbookpro/Desktop/cis9440-361100-642072a7b126.json'

In [11]:
# run this cell without changing anything to setup your credentials
credentials = service_account.Credentials.from_service_account_file(key_path,
                                                                    scopes=["https://www.googleapis.com/auth/cloud-platform"])
bigquery_client = bigquery.Client(credentials = credentials,
                                 project = credentials.project_id)

print(f"bigquery client name is: {bigquery_client}")
print(f"bigquery client data type is: {type(bigquery_client)}")

bigquery client name is: <google.cloud.bigquery.client.Client object at 0x7fac892f42b0>
bigquery client data type is: <class 'google.cloud.bigquery.client.Client'>


Now, you need to create your dataset id:
1. Go to bigquery
2. Inside the "Explorer" window, click on the 3 dots to the right of your cis9440 project called "View Actions"
3. Select "Create dataset"
4. Leave the Project ID as it is, name your Dataset ID etl_dataset
5. Expand your cis9440 project with the triangle on its left-hand side so you can see your new etl_dataset dataset
6. On the right of your etl_dataset, click the 3 dots for "View Actions" -> "Open"
7. You should now see the "Dataset info". Copy the entire "Dataset ID" and paste it in the variable below

In [45]:
#dataset_id = 'your dataset id here'   # PASTE THIS DATASET ID FROM ABOVE STEPS
dataset_id = 'cis9440-361100.sidewalk_cafe_ETL'

dataset_id = dataset_id.replace(':', '.')
print(f"your dataset_id is: {dataset_id}")

your dataset_id is: cis9440-361100.sidewalk_cafe_ETL


### Step 3: Extract data

1. connect to NYC Open Data with API Key
2. pull specific dataset as a pandas dataframe
3. Look at shape of extracted data

#### sodapy client.get parameters
1. select
2. where
3. order
4. limit
5. group

In [13]:
# Get the total number of records in our the entire data set
for ds in [sidewalk_dataset, businesses_dataset]:
    total_record_count = nyc_open_data_client.get(ds, select = "COUNT(*)")
    print(f"total records in {ds}: {total_record_count[0]['COUNT']}")

total records in qcdj-rwhu: 1116
total records in w7w3-xahh: 277185


In [14]:
# Now, loop through target data set to pull all rows in chunks (we cannot pull all rows at once)
# AGAIN, UPDATE WHERE FILTER INSIDE BELOW FUNCTION

def extract_socrata_data(data_set,
                         chunk_size = 10000,
                         where = None):
    
    # measure time this function takes
    import time
    start_time = time.time()
    
    # get total number or records
    if where == None:
        total_records = int(nyc_open_data_client.get(data_set,
                                                     select= "COUNT(*)")[0]["COUNT"])
    else:
        total_records = int(nyc_open_data_client.get(data_set,
                                                     where = where,
                                                     select= "COUNT(*)")[0]["COUNT"])
    
    # start at 0, empty list for results
    start = 0                   
    results = []                

    while True:

        if where == None:
            # fetch the set of records starting at 'start'
            results.extend(nyc_open_data_client.get(data_set,
                                                    offset = start,
                                                    limit = chunk_size))
            
        elif where != None:
            results.extend(nyc_open_data_client.get(data_set,
                                                    where = where,
                                                    offset = start,
                                                    limit = chunk_size))
        # update the starting record number
        start = start + chunk_size

        # if we have fetched all of the records (we have reached total_records), exit loop
        if (start > total_records):
            break

    # convert the list into a pandas data frame
    data = pd.DataFrame.from_records(results)

    end_time = time.time()
    print(f"function took {round(end_time - start_time, 1)} seconds")

    print(f"the shape of your dataframe is: {data.shape}")
    return data

In [15]:
sidewalk_df = extract_socrata_data(chunk_size = 10000,
                         data_set = sidewalk_dataset,
                         where = None)

function took 0.6 seconds
the shape of your dataframe is: (1116, 47)


In [16]:
businesses_df = extract_socrata_data(chunk_size = 10000,
                         data_set = businesses_dataset,
                         where = None)

function took 28.9 seconds
the shape of your dataframe is: (277185, 32)


In [17]:
businesses_df.columns

Index(['license_nbr', 'license_type', 'lic_expir_dd', 'license_status',
       'license_creation_date', 'industry', 'business_name', 'address_city',
       'address_state', 'address_zip', 'business_name_2', 'address_building',
       'address_street_name', 'contact_phone', 'address_borough', 'detail',
       'community_board', 'council_district', 'bin', 'bbl', 'nta',
       'census_tract', 'longitude', 'latitude', 'location',
       ':@computed_region_efsh_h5xi', ':@computed_region_f5dn_yrer',
       ':@computed_region_yeji_bk3q', ':@computed_region_92fq_4b7q',
       ':@computed_region_sbqj_enih', 'detail_2', 'address_street_name_2'],
      dtype='object')

In [18]:
sidewalk_df.columns

Index(['license_nbr', 'lic_status', 'business_name', 'business_name2',
       'building', 'street', 'city', 'state', 'zip', 'swc_type', 'swc_sq_ft',
       'swc_tables', 'swc_chairs', 'dohmh', 'latitude', 'longitude',
       'community_district', 'city_council_district', 'cd_url', 'app_id',
       'app_swc_type', 'app_sq_ft', 'app_tables', 'app_chairs', 'app_status',
       'app_status_date', 'expiration_date', 'app_too_date', 'submit_date',
       'intake', 'intake_dd', 'dpqa', 'send_package_dd', 'cp', 'cp_dd', 'cb',
       'cb_dd', 'hearing', 'hearing_dd', 'cc', 'cc_dd', 'moo', 'issuance',
       'hearing_public', 'hearing_public_dd', 'moo_dd', 'issuance_dd'],
      dtype='object')

In [19]:
data = sidewalk_df.merge(businesses_df,
                         how = 'inner',
                         left_on = "license_nbr",
                         right_on = "license_nbr",
                         suffixes=('', '_y'))

### Step 4: Data Profiling

1. Distinct values per column
2. Null values per column
3. Summary statistics per numeric column

In [20]:
# what are the columns in our dataframe?
data.columns

Index(['license_nbr', 'lic_status', 'business_name', 'business_name2',
       'building', 'street', 'city', 'state', 'zip', 'swc_type', 'swc_sq_ft',
       'swc_tables', 'swc_chairs', 'dohmh', 'latitude', 'longitude',
       'community_district', 'city_council_district', 'cd_url', 'app_id',
       'app_swc_type', 'app_sq_ft', 'app_tables', 'app_chairs', 'app_status',
       'app_status_date', 'expiration_date', 'app_too_date', 'submit_date',
       'intake', 'intake_dd', 'dpqa', 'send_package_dd', 'cp', 'cp_dd', 'cb',
       'cb_dd', 'hearing', 'hearing_dd', 'cc', 'cc_dd', 'moo', 'issuance',
       'hearing_public', 'hearing_public_dd', 'moo_dd', 'issuance_dd',
       'license_type', 'lic_expir_dd', 'license_status',
       'license_creation_date', 'industry', 'business_name_y', 'address_city',
       'address_state', 'address_zip', 'business_name_2', 'address_building',
       'address_street_name', 'contact_phone', 'address_borough', 'detail',
       'community_board', 'council_distr

In [21]:
# subset for only needed columns
data = data[['business_name',
             'business_name2',
             'latitude',
             'longitude',
             'street',
             'zip',
             'address_borough',
             'app_status',
             'swc_type',
             'swc_sq_ft',
             'swc_tables',
             'swc_chairs']]

In [22]:
# create and run a function to ceate data profiling dataframe

def create_data_profiling_df(data):
    
    # create an empty dataframe to gather information about each column
    data_profiling_df = pd.DataFrame(columns = ["column_name",
                                                "column_type",
                                                "unique_values",
                                                "duplicate_values",
                                                "null_values",
                                                "non_null_values"])

    # loop through each column to add rows to the data_profiling_df dataframe
    for column in data.columns:

        info_dict = {}

        try:
            info_dict["column_name"] = column
            info_dict["column_type"] = data[column].dtypes
            info_dict["unique_values"] = len(data[column].unique())
            info_dict["duplicate_values"] = data[column].count() - len(data[column].dropna().unique())
            info_dict["null_values"] = data[column].isna().sum()
            info_dict["non_null_values"] = data[column].count()

        except:
            print(f"unable to read column: {column}, you may want to drop this column")

        data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)

    data_profiling_df.sort_values(by = ['unique_values', "non_null_values"],
                                  ascending = [False, False],
                                  inplace=True)
    
    return data_profiling_df

In [23]:
# view your data profiling dataframe
data_profiling_df = create_data_profiling_df(data)
data_profiling_df

  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)
  data_profiling_df = data_profiling_df.append(info_dict, ignore_index=True)


Unnamed: 0,column_name,column_type,unique_values,duplicate_values,null_values,non_null_values
0,business_name,object,1075,82,0,1157
2,latitude,object,1067,90,0,1157
3,longitude,object,1067,90,0,1157
1,business_name2,object,720,74,364,793
9,swc_sq_ft,object,462,695,0,1157
4,street,object,288,869,0,1157
5,zip,object,86,1071,0,1157
11,swc_chairs,object,73,1084,0,1157
10,swc_tables,object,42,1115,0,1157
6,address_borough,object,4,1153,0,1157


### Step 5: Data Cleansing

1. drop unneeded columns
2. drop duplicate rows
3. check for outliers

In [24]:
# Run this to look at a list of your columns
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1157 entries, 0 to 1156
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_name    1157 non-null   object
 1   business_name2   793 non-null    object
 2   latitude         1157 non-null   object
 3   longitude        1157 non-null   object
 4   street           1157 non-null   object
 5   zip              1157 non-null   object
 6   address_borough  1157 non-null   object
 7   app_status       1157 non-null   object
 8   swc_type         1157 non-null   object
 9   swc_sq_ft        1157 non-null   object
 10  swc_tables       1157 non-null   object
 11  swc_chairs       1157 non-null   object
dtypes: object(12)
memory usage: 117.5+ KB


In [25]:
# address null values
data['business_name2'].fillna(data['business_name'], inplace=True)

In [26]:
# find number of duplicate rows
print(f"number of duplicate rows: {len(data[data.duplicated()])}")

number of duplicate rows: 67


In [27]:
# drop duplicate rows based on entire row
data = data.drop_duplicates(keep = 'first')

# Or, based on a subset of rows, uncomment below and adjust accordingly
## data = data.drop_duplicates(subset = ["subset column"], keep = 'first')
## data = data.drop_duplicates(subset = ["subset column 1", "subset column 2"], keep = 'first')

print(f"number of rows after duplicates dropped: {len(data)}")

number of rows after duplicates dropped: 1090


In [28]:
# update columns types
for column in ['latitude',
              'longitude',
              'zip',
              'swc_sq_ft',
              'swc_tables',
              'swc_chairs']:
    try:
        data[column] = data[column].astype(int)
        
    except:
        data[column] = data[column].astype(float)
        
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1090 entries, 0 to 1156
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   business_name    1090 non-null   object 
 1   business_name2   1090 non-null   object 
 2   latitude         1090 non-null   float64
 3   longitude        1090 non-null   float64
 4   street           1090 non-null   object 
 5   zip              1090 non-null   int64  
 6   address_borough  1090 non-null   object 
 7   app_status       1090 non-null   object 
 8   swc_type         1090 non-null   object 
 9   swc_sq_ft        1090 non-null   int64  
 10  swc_tables       1090 non-null   int64  
 11  swc_chairs       1090 non-null   int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 110.7+ KB


### Step 4: Create Location Dimension

In [29]:
# first, copy the entire table
location_dim = data.copy()

In [30]:
location_dim.columns

Index(['business_name', 'business_name2', 'latitude', 'longitude', 'street',
       'zip', 'address_borough', 'app_status', 'swc_type', 'swc_sq_ft',
       'swc_tables', 'swc_chairs'],
      dtype='object')

In [31]:
# second, subset for only the wanted columns in the dimension
location_dim = location_dim[['latitude', 'longitude', 'street',
       'zip', 'address_borough']]

In [32]:
# third, drop duplicate rows in dimension
unique_row = ['latitude', 'longitude']
location_dim = location_dim.drop_duplicates(subset = unique_row, keep = 'first')
location_dim = location_dim.reset_index(drop = True)
location_dim

Unnamed: 0,latitude,longitude,street,zip,address_borough
0,40.725336,-74.003132,SPRING ST,10012,Manhattan
1,40.690944,-73.996030,HENRY ST,11201,Brooklyn
2,40.765636,-73.918461,30TH AVE,11103,Queens
3,40.743358,-73.996435,7TH AVE,10011,Manhattan
4,40.675769,-73.980687,5TH AVE,11215,Brooklyn
...,...,...,...,...,...
1062,40.802894,-73.964133,AMSTERDAM AVE,10025,Manhattan
1063,40.851081,-73.939921,W 181ST ST,10033,Manhattan
1064,40.682199,-73.995803,COURT ST,11231,Brooklyn
1065,40.745294,-73.978217,3RD AVE,10016,Manhattan


In [33]:
# fourth, add a surrogate key
location_dim.insert(0, 'location_id', range(1, 1 + len(location_dim)))
location_dim

Unnamed: 0,location_id,latitude,longitude,street,zip,address_borough
0,1,40.725336,-74.003132,SPRING ST,10012,Manhattan
1,2,40.690944,-73.996030,HENRY ST,11201,Brooklyn
2,3,40.765636,-73.918461,30TH AVE,11103,Queens
3,4,40.743358,-73.996435,7TH AVE,10011,Manhattan
4,5,40.675769,-73.980687,5TH AVE,11215,Brooklyn
...,...,...,...,...,...,...
1062,1063,40.802894,-73.964133,AMSTERDAM AVE,10025,Manhattan
1063,1064,40.851081,-73.939921,W 181ST ST,10033,Manhattan
1064,1065,40.682199,-73.995803,COURT ST,11231,Brooklyn
1065,1066,40.745294,-73.978217,3RD AVE,10016,Manhattan


In [34]:
# fifth, add the surrogate key to the data table
data = data.merge(location_dim,
                  left_on = unique_row,
                  right_on = unique_row,
                  how = 'left')

data.head(100)

Unnamed: 0,business_name,business_name2,latitude,longitude,street_x,zip_x,address_borough_x,app_status,swc_type,swc_sq_ft,swc_tables,swc_chairs,location_id,street_y,zip_y,address_borough_y
0,NEW INDIAN FOODS LLC,THE BOMBAY BREAD BAR,40.725336,-74.003132,SPRING ST,10012,Manhattan,Pending Review,Unenclosed,175,11,22,1,SPRING ST,10012,Manhattan
1,"AMY SCHERBER, INC.",AMY'S BREAD,40.690944,-73.996030,HENRY ST,11201,Brooklyn,Application Review Completed,Unenclosed,208,10,20,2,HENRY ST,11201,Brooklyn
2,VEMC GROUP CORP.,SHADY LADY,40.765636,-73.918461,30TH AVE,11103,Queens,Application Review Completed,Unenclosed,334,14,36,3,30TH AVE,11103,Queens
3,DLK Restaurants LLC,The Copper Still,40.743358,-73.996435,7TH AVE,10011,Manhattan,Application Review Completed,Unenclosed,275,10,30,4,7TH AVE,10011,Manhattan
4,MEZCALS OF 5TH AVE. REST CORP.,MESCAL,40.675769,-73.980687,5TH AVE,11215,Brooklyn,Application Review Completed,Unenclosed,318,13,26,5,5TH AVE,11215,Brooklyn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,343 BROOME STREET RESTAURANT INC,RANDOLPH BEER,40.719525,-73.994823,BROOME ST,10013,Manhattan,Application Review Completed,Unenclosed,132,6,18,96,BROOME ST,10013,Manhattan
96,"COBRA CATERERS, INC.",HUDSON HOUND,40.736142,-74.006059,HUDSON ST,10014,Manhattan,Application Review Completed,Unenclosed,185,7,25,97,HUDSON ST,10014,Manhattan
97,LUKE'S LOUNGE INC,LUKE'S LOUNGE INC,40.849230,-73.854970,HONE AVE,10461,Bronx,Application Review Completed,Unenclosed,83,2,8,98,HONE AVE,10461,Bronx
98,"CARABEAN, LLC",GALWAY HOOKER BAR,40.734564,-74.002125,7TH AVE S,10014,Manhattan,Application Review Completed,Enclosed,268,8,18,99,7TH AVE S,10014,Manhattan


### Step 5: Create Business Dimension

In [35]:
# first, copy the entire table
business_dim = data.copy()

In [36]:
business_dim.columns

Index(['business_name', 'business_name2', 'latitude', 'longitude', 'street_x',
       'zip_x', 'address_borough_x', 'app_status', 'swc_type', 'swc_sq_ft',
       'swc_tables', 'swc_chairs', 'location_id', 'street_y', 'zip_y',
       'address_borough_y'],
      dtype='object')

In [37]:
# second, subset for only the wanted columns in the dimension
business_dim = business_dim[['business_name', 'business_name2', 'swc_type', 'app_status']]

In [38]:
# third, drop duplicate rows in dimension
unique_row = ['business_name', 'business_name2']
business_dim = business_dim.drop_duplicates(subset = unique_row, keep = 'first')
business_dim = business_dim.reset_index(drop = True)
business_dim

Unnamed: 0,business_name,business_name2,swc_type,app_status
0,NEW INDIAN FOODS LLC,THE BOMBAY BREAD BAR,Unenclosed,Pending Review
1,"AMY SCHERBER, INC.",AMY'S BREAD,Unenclosed,Application Review Completed
2,VEMC GROUP CORP.,SHADY LADY,Unenclosed,Application Review Completed
3,DLK Restaurants LLC,The Copper Still,Unenclosed,Application Review Completed
4,MEZCALS OF 5TH AVE. REST CORP.,MESCAL,Unenclosed,Application Review Completed
...,...,...,...,...
1079,THE ELEMENT RESTAURANT GROUP INC,181 CABRINI,Unenclosed,Application Review Completed
1080,TAP NYC LLC,TAP NYC LLC,Unenclosed,Application Review Completed
1081,345 COURT ST. CORP.,MARCO POLO RISTORANTE,Unenclosed,Application Review Completed
1082,MUNSTER CAFE LLC,THE FLYING COCK,Unenclosed,Application Review Completed


In [39]:
# fourth, add a surrogate key
business_dim.insert(0, 'business_id', range(1000, 1000 + len(business_dim)))
business_dim

Unnamed: 0,business_id,business_name,business_name2,swc_type,app_status
0,1000,NEW INDIAN FOODS LLC,THE BOMBAY BREAD BAR,Unenclosed,Pending Review
1,1001,"AMY SCHERBER, INC.",AMY'S BREAD,Unenclosed,Application Review Completed
2,1002,VEMC GROUP CORP.,SHADY LADY,Unenclosed,Application Review Completed
3,1003,DLK Restaurants LLC,The Copper Still,Unenclosed,Application Review Completed
4,1004,MEZCALS OF 5TH AVE. REST CORP.,MESCAL,Unenclosed,Application Review Completed
...,...,...,...,...,...
1079,2079,THE ELEMENT RESTAURANT GROUP INC,181 CABRINI,Unenclosed,Application Review Completed
1080,2080,TAP NYC LLC,TAP NYC LLC,Unenclosed,Application Review Completed
1081,2081,345 COURT ST. CORP.,MARCO POLO RISTORANTE,Unenclosed,Application Review Completed
1082,2082,MUNSTER CAFE LLC,THE FLYING COCK,Unenclosed,Application Review Completed


In [40]:
# fifth, add the surrogate key to the data table
data = data.merge(business_dim,
                  left_on = unique_row,
                  right_on = unique_row,
                  how = 'left')

data.head(100)

Unnamed: 0,business_name,business_name2,latitude,longitude,street_x,zip_x,address_borough_x,app_status_x,swc_type_x,swc_sq_ft,swc_tables,swc_chairs,location_id,street_y,zip_y,address_borough_y,business_id,swc_type_y,app_status_y
0,NEW INDIAN FOODS LLC,THE BOMBAY BREAD BAR,40.725336,-74.003132,SPRING ST,10012,Manhattan,Pending Review,Unenclosed,175,11,22,1,SPRING ST,10012,Manhattan,1000,Unenclosed,Pending Review
1,"AMY SCHERBER, INC.",AMY'S BREAD,40.690944,-73.996030,HENRY ST,11201,Brooklyn,Application Review Completed,Unenclosed,208,10,20,2,HENRY ST,11201,Brooklyn,1001,Unenclosed,Application Review Completed
2,VEMC GROUP CORP.,SHADY LADY,40.765636,-73.918461,30TH AVE,11103,Queens,Application Review Completed,Unenclosed,334,14,36,3,30TH AVE,11103,Queens,1002,Unenclosed,Application Review Completed
3,DLK Restaurants LLC,The Copper Still,40.743358,-73.996435,7TH AVE,10011,Manhattan,Application Review Completed,Unenclosed,275,10,30,4,7TH AVE,10011,Manhattan,1003,Unenclosed,Application Review Completed
4,MEZCALS OF 5TH AVE. REST CORP.,MESCAL,40.675769,-73.980687,5TH AVE,11215,Brooklyn,Application Review Completed,Unenclosed,318,13,26,5,5TH AVE,11215,Brooklyn,1004,Unenclosed,Application Review Completed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,343 BROOME STREET RESTAURANT INC,RANDOLPH BEER,40.719525,-73.994823,BROOME ST,10013,Manhattan,Application Review Completed,Unenclosed,132,6,18,96,BROOME ST,10013,Manhattan,1095,Unenclosed,Application Review Completed
96,"COBRA CATERERS, INC.",HUDSON HOUND,40.736142,-74.006059,HUDSON ST,10014,Manhattan,Application Review Completed,Unenclosed,185,7,25,97,HUDSON ST,10014,Manhattan,1096,Unenclosed,Application Review Completed
97,LUKE'S LOUNGE INC,LUKE'S LOUNGE INC,40.849230,-73.854970,HONE AVE,10461,Bronx,Application Review Completed,Unenclosed,83,2,8,98,HONE AVE,10461,Bronx,1097,Unenclosed,Application Review Completed
98,"CARABEAN, LLC",GALWAY HOOKER BAR,40.734564,-74.002125,7TH AVE S,10014,Manhattan,Application Review Completed,Enclosed,268,8,18,99,7TH AVE S,10014,Manhattan,1098,Enclosed,Application Review Completed


### Step 6: Creating Fact table

In [41]:
data.columns

Index(['business_name', 'business_name2', 'latitude', 'longitude', 'street_x',
       'zip_x', 'address_borough_x', 'app_status_x', 'swc_type_x', 'swc_sq_ft',
       'swc_tables', 'swc_chairs', 'location_id', 'street_y', 'zip_y',
       'address_borough_y', 'business_id', 'swc_type_y', 'app_status_y'],
      dtype='object')

In [42]:
# take a subset of fact_table for only the needed columns: which are keys and measures
fact_table = data[['business_id',
              'location_id',
              'swc_sq_ft',
               'swc_tables',
               'swc_chairs']]

fact_table

Unnamed: 0,business_id,location_id,swc_sq_ft,swc_tables,swc_chairs
0,1000,1,175,11,22
1,1001,2,208,10,20
2,1002,3,334,14,36
3,1003,4,275,10,30
4,1004,5,318,13,26
...,...,...,...,...,...
1085,2079,1064,324,11,22
1086,2080,900,106,6,12
1087,2081,1065,252,15,30
1088,2082,1066,130,5,10


### Step 6: Deliver Facts and Dimensions to Data Warehouse (BigQuery)

In [43]:
# create a function to load dataframes to BigQuery

def load_table_to_bigquery(df,
                          table_name,
                          dataset_id):

    dataset_id = dataset_id #change 301800 to match your project id

    dataset_ref = bigquery_client.dataset(dataset_id)
    job_config = bigquery.LoadJobConfig()
    job_config.autodetect = True
    job_config.write_disposition = "WRITE_TRUNCATE"

    upload_table_name = f"{dataset_id}.{table_name}"
    
    load_job = bigquery_client.load_table_from_dataframe(df,
                                                upload_table_name,
                                                job_config = job_config)
        
    print(f"""completed loading {table_name} --
         {load_job}""")

In [46]:
load_table_to_bigquery(df = location_dim,
                       table_name = "location_dim",
                       dataset_id = dataset_id)

completed loading location_dim --
         LoadJob<project=cis9440-361100, location=US, id=c49249d7-d719-4db7-8d07-9207c19fe3e5>


In [47]:
load_table_to_bigquery(df = business_dim,
                       table_name = "business_dim",
                       dataset_id = dataset_id)

completed loading business_dim --
         LoadJob<project=cis9440-361100, location=US, id=0fa02955-7ec7-41e2-8491-ff2714c26aa0>


In [48]:
load_table_to_bigquery(df = fact_table,
                       table_name = "sidewalk_fact",
                       dataset_id = dataset_id)

completed loading sidewalk_fact --
         LoadJob<project=cis9440-361100, location=US, id=c9c83a2f-e7b9-43b2-a03f-6f13f25d88d6>
