# ETL Pipeline (first steps)
---

### Step 0: Install the required python packages

In [None]:
pip install --upgrade sodapy

#### Now, at the top of your notebook, select "Kernel" -> "Restart and Clear Output"
Then, continue from the next cell

### Step 1: Setup your NYC Open Data variables (ACTION REQUIRED HERE)

In [None]:
# import libraries (standard library first, then third-party)
import os
import time

import numpy as np
import pandas as pd
from sodapy import Socrata

In [None]:
# Host name of the Socrata API endpoint, written without the "https://" prefix
# (the original note says the prefix is added automatically).
# Leave as-is unless you are working with a portal other than NYC Open Data.
data_url = "data.cityofnewyork.us"

In [None]:
# Identifier of the data set to pull from the API endpoint.
# It is the last part of the resource URL, without the ".json" suffix, e.g.
#   https://data.cityofnewyork.us/resource/8wbx-tsch.json  ->  '8wbx-tsch'
# NOTE(review): this comment previously described the data set as 311 data, but
# the profiling cells use columns such as site_name and acres -- confirm which
# data set 'buk3-3qpr' actually is.
data_set = "buk3-3qpr"

In [None]:
# Setup your App Token, which you created in Week 6
# You can find your app token by logging into: https://data.cityofnewyork.us/profile/edit/developer_settings
#
# Preferred: store the token in the NYC_OPEN_DATA_APP_TOKEN environment variable
# so the secret is never saved inside (or shared along with) this notebook.
# If that variable is not set, the placeholder below is used instead, so you can
# still paste your token in its place -- just do not commit/share it afterwards.
app_token = os.environ.get('NYC_OPEN_DATA_APP_TOKEN', 'YOUR APP TOKEN HERE')

In [None]:
# run this cell to create the Socrata client that connects python to NYC Open Data

# the client is built from the host name and app token defined in the cells above,
# with a timeout of 200 passed to sodapy
nyc_open_data_client = Socrata(
    domain = data_url,
    app_token = app_token,
    timeout = 200,
)

# show the client object and its type as a quick sanity check
print(f"nyc open data client name is: {nyc_open_data_client}")
print(f"nyc open data client data type is: {type(nyc_open_data_client)}")

### Step 3: Extract data

1. connect to NYC Open Data with your app token
2. pull specific dataset as a pandas dataframe
3. Look at shape of extracted data

#### sodapy client.get parameters
1. select
2. where
3. order
4. limit
5. group

In [None]:
# Ask the API for the total number of records in the entire data set
# (the raw response from client.get is stored and printed as-is)
total_record_count = nyc_open_data_client.get(
    data_set,
    select = "COUNT(*)",
)
print(f"total records in {data_set}: {total_record_count}")

In [None]:
# Now, loop through target data set to pull all rows in chunks (we cannot pull all rows at once)
# NOTE: no `where` filter is applied inside this function; add one to the
# client.get(...) call below if you only want a subset of the rows

def extract_socrata_data(total_records: int,
                         chunk_size: int,
                         data_set = data_set):
    """Pull a Socrata data set page by page and return it as a pandas DataFrame.

    Requests are issued through the notebook-level ``nyc_open_data_client``.
    The elapsed time and the shape of the resulting frame are printed.

    Parameters
    ----------
    total_records : int
        Number of records to page through (e.g. the COUNT(*) of the data set).
        One page is requested at each offset 0, chunk_size, 2 * chunk_size, ...
        that is below this value; if it is 0 or negative no request is made.
    chunk_size : int
        Maximum number of records requested per call. Must be positive.
        Every page (including the last) is requested with this full limit.
    data_set : str
        Socrata data set identifier. Defaults to the value the notebook-level
        ``data_set`` variable had when this cell was run.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the API.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive (the offset would never advance).
    """
    # a non-positive chunk size would otherwise page forever
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    # measure time this function takes
    start_time = time.perf_counter()

    results = []                # accumulated records from every page

    # range() stops as soon as the offset reaches total_records, so no extra
    # (empty) request is made when total_records is a multiple of chunk_size
    for start in range(0, total_records, chunk_size):

        # fetch the set of records starting at 'start'; an explicit order is
        # required for stable paging, otherwise pages may overlap or skip rows
        results.extend(nyc_open_data_client.get(data_set,
                                                order = ":id",
                                                offset = start,
                                                limit = chunk_size))

    # convert the list into a pandas data frame
    data = pd.DataFrame.from_records(results)

    elapsed = time.perf_counter() - start_time
    print(f"function took {round(elapsed, 1)} seconds")

    print(f"the shape of your dataframe is: {data.shape}")
    return data

# pull the data set in pages of 2,000 rows
# (total_records is hard-coded; it should match the COUNT(*) printed earlier)
data = extract_socrata_data(total_records = 6308,
                            chunk_size = 2000,
                            data_set = data_set)

### Step 4: Data Profiling

1. Distinct values per column
2. Null values per column
3. Summary statistics per numeric column

In [None]:
# what are the columns in our dataframe?
# (`data` is the DataFrame produced by the extract step above; ending the cell
# with the bare expression displays its column index)
data.columns

In [None]:
# show the 20 largest sites by acreage
# values pulled from the API arrive as strings, so "acres" is converted to a
# number first; otherwise the sort would be alphabetical ("99" before "1000").
# Values that cannot be parsed become NaN and are sorted to the end.
(
    data[["site_name", "site_location", "category", "acres"]]
    .assign(acres = lambda frame: pd.to_numeric(frame["acres"], errors = "coerce"))
    .sort_values(by = "acres", ascending = False)
    .head(20)
)