# ETL Pipeline (first steps)
---

### Step 0: Install the required python packages

In [1]:
pip install --upgrade sodapy

Note: you may need to restart the kernel to use updated packages.


#### Now, on the top of your Notebook select "Kernel" -> "Restart and Clear Output"
Then, continue from the next cell

### Step 1: Setup your NYC Open Data variables (ACTION REQUIRED HERE)

In [2]:
# import libraries
import pandas as pd
import numpy as np
from sodapy import Socrata

In [3]:
# setup the host name for the API endpoint (the https:// part will be added automatically)
# only need to change this if you are not using NYC Open Data
data_url = 'data.cityofnewyork.us'

In [4]:
# setup the data set at the API endpoint (311 data in this case)
# https://data.cityofnewyork.us/resource/8wbx-tsch.json
# would give us '8wbx-tsch'
data_set = 'buk3-3qpr'

In [5]:
# Setup your App Token, which you created in Week 6
# You can find your app token by logging into: https://data.cityofnewyork.us/profile/edit/developer_settings

#app_token = 'YOUR APP TOKEN HERE'
app_token = ''

In [6]:
# run this cell to setup your Socrata client that connects python to NYC Open Data

# create the client that points to the API endpoint
nyc_open_data_client = Socrata(data_url, app_token, timeout = 200)
print(f"nyc open data client name is: {nyc_open_data_client}")
print(f"nyc open data client data type is: {type(nyc_open_data_client)}")

nyc open data client name is: <sodapy.socrata.Socrata object at 0x7fa1bbf36910>
nyc open data client data type is: <class 'sodapy.socrata.Socrata'>


### Step 3: Extract data

1. connect to NYC Open Data with API Key
2. pull specific dataset as a pandas dataframe
3. Look at shape of extracted data

#### sodapy client.get parameters
1. select
2. where
3. order
4. limit
5. group

In [7]:
# Get the total number of records in our the entire data set
total_record_count = nyc_open_data_client.get(data_set, select = "COUNT(*)")
print(f"total records in {data_set}: {total_record_count}")

total records in buk3-3qpr: [{'COUNT': '6310'}]


In [8]:
# Now, loop through target data set to pull all rows in chunks (we cannot pull all rows at once)
# AGAIN, UPDATE WHERE FILTER INSIDE BELOW FUNCTION

def extract_socrata_data(total_records: int,
                         chunk_size: int,
                         data_set = data_set):
    
    # measure time this function takes
    import time
    start_time = time.time()

    start = 0                   # start at 0
    results = []                # empty list for results
    record_count = total_records

    while True:

        # fetch the set of records starting at 'start'
        results.extend(nyc_open_data_client.get(data_set,
                                                offset = start,
                                                limit = chunk_size))

        # update the starting record number
        start = start + chunk_size

        # if we have fetched all of the records (we have reached record_count), exit loop
        if (start > record_count):
            break

    # convert the list into a pandas data frame
    data = pd.DataFrame.from_records(results)

    end_time = time.time()
    print(f"function took {round(end_time - start_time, 1)} seconds")

    print(f"the shape of your dataframe is: {data.shape}")
    return data

data = extract_socrata_data(total_records = 6308,
                            chunk_size = 2000,
                            data_set = data_set)

function took 8.5 seconds
the shape of your dataframe is: (6310, 24)


### Step 4: Data Profiling

1. Distinct values per column
2. Null values per column
3. Summary statistics per numeric column

In [9]:
# what are the columns in our dataframe?
data.columns

Index(['propnum', 'prop_id', 'boro', 'ampsdistrict', 'prop_name', 'site_name',
       'prop_location', 'site_location', 'acres', 'category', 'sub_category',
       'rated', 'council_district', 'zipcode', 'communityboard',
       'jurisdiction', 'nysassembly', 'nyssenate', 'uscongress', 'precinct',
       'comfortstation', 'multipolygon', 'sourcefc', 'reason_not_rated'],
      dtype='object')

In [10]:
data[["site_name", "site_location", "category", "acres"]].sort_values(by = "acres",
                                                                     ascending = False).head(20)

Unnamed: 0,site_name,site_location,category,acres
4973,"Pelham-Bay Circles, Drives & Medians",(e) Eastchester Bay To (nw) Pelham Bridge,Large Park,97.5739975
6165,NE Forest,East 233 St. To Major Deegan Expwy,Large Park,97.09400177
4690,Northwest Latourette,Rockland Ave & Forest Hill Rd - North Of Buck'...,Large Park,96.90499878
4668,Central LaTourette,Rockland Ave & Forest Hill Rd - North Of Buck'...,Large Park,96.90499878
4667,Southeast LaTourette,Rockland Ave & Forest Hill Rd - North Of Buck'...,Large Park,96.90499878
1846,Forest Park Golf Course,Forest Park Dr n/o Oak Ridge,Other,96.19999695
4651,High Rock Park,"Richmond Pkwy., Manor Rd., Summit Ave., Rockla...",Large Park,94.09999847
6175,Mosholu Golf Course,Jerome Ave To Mosholu Prkwy,Large Park,93.55999756
4743,Silver Lake Golf Course,"Victory Blvd, Clove Rd, Forest Ave",Other,93.29399872
4755,Freshkills Park,"Victory Blvd,Signs Rd,Travis Ave,Arthur Kill Rd",Large Park,920.42602539
