# Data Source Systems
Pull data from several kinds of source systems and ingest it with Python code.

Parquet files
CSV files
APIs
RDBMS databases
HTML

In [24]:
# import modules
import json
import logging
import sqlite3
from unicodedata import normalize

import certifi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import urllib3
from urllib3 import request

#### Parquet & CSV files Data Sources

In [7]:
# curl -O https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet
# curl -O https://data.cityofnewyork.us/resource/h9gi-nx95.csv?$limit=500

#### Sourcing Parquet data

In [39]:
# Read data from the Parquet file. We use pandas read_parquet method for ease and speed.
def parquet_data_source(parquet_file_name):
    """Load a Parquet file into a DataFrame.

    This is a best-effort reader: any failure is logged and an empty
    DataFrame is returned instead of raising.

    Args:
        parquet_file_name: Path or URL handed straight to ``pd.read_parquet``.

    Returns:
        The file contents as a ``pandas.DataFrame``, or an empty DataFrame
        when the read fails for any reason.
    """
    try:
        return pd.read_parquet(parquet_file_name)
    except Exception as exc:
        # Keep the empty-frame fallback, but leave a trace so that a failed
        # read can be told apart from a genuinely empty dataset.
        logging.getLogger(__name__).warning(
            "Could not read Parquet source %r: %s", parquet_file_name, exc
        )
        return pd.DataFrame()

### Sourcing CSV data

In [42]:
# Read data from the CSV file. We use pandas read_csv method for ease and speed.
def csv_data_source(csv_file_name):
    """Load a CSV file into a DataFrame.

    This is a best-effort reader: any failure is logged and an empty
    DataFrame is returned instead of raising.

    Args:
        csv_file_name: Path or URL handed straight to ``pd.read_csv``.

    Returns:
        The parsed rows as a ``pandas.DataFrame``, or an empty DataFrame
        when the read fails for any reason.
    """
    try:
        return pd.read_csv(csv_file_name)
    except Exception as exc:
        # Keep the empty-frame fallback, but leave a trace so that a failed
        # read can be told apart from a genuinely empty dataset.
        logging.getLogger(__name__).warning(
            "Could not read CSV source %r: %s", csv_file_name, exc
        )
        return pd.DataFrame()

### Sourcing Data from RDBMS tables

In [28]:
# # Read sqlite query results into a pandas DataFrame
# with sqlite3.connect("movies.sqlite") as conn:
#     df = pd.read_sql("select * from movies", conn)
# df.head()

### Sourcing data from APIs
Please make sure the `certifi` and `urllib3` libraries are installed (for example, `pip install certifi urllib3`).

In [45]:
# Fetch JSON from an API endpoint and flatten it into a DataFrame

def api_data_source(api_endpoint):
    """Fetch JSON from an HTTP(S) endpoint and flatten it into a DataFrame.

    This is a best-effort reader: a non-200 response or any error is logged
    and an empty DataFrame is returned instead of raising.

    Args:
        api_endpoint: URL to send the GET request to.

    Returns:
        ``pd.json_normalize`` of the decoded JSON body when the response
        status is 200; otherwise an empty ``pandas.DataFrame``.
    """
    log = logging.getLogger(__name__)
    try:
        # Pool manager that verifies TLS certificates against certifi's CA bundle.
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

        # One request serves both the status check and the payload. The
        # endpoint comes from the parameter, not from a global name.
        response = http.request('GET', api_endpoint)
        if response.status != 200:
            log.warning(
                "API source %r answered with HTTP status %s",
                api_endpoint,
                response.status,
            )
            return pd.DataFrame()

        data = json.loads(response.data.decode('utf-8'))

        # Normalize the (possibly nested) JSON into a flat table.
        return pd.json_normalize(data)
    except Exception as exc:
        log.warning("Could not read API source %r: %s", api_endpoint, exc)
        return pd.DataFrame()

### Sourcing data from Webpages
Please visit the url https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)

In [44]:
# get data from url

def web_data_source(web_url, matching_keyword):
    """Read the first HTML table on a page whose text matches a keyword.

    This is a best-effort reader: any failure is logged and an empty
    DataFrame is returned instead of raising.

    Args:
        web_url: Page location handed straight to ``pd.read_html``.
        matching_keyword: Passed as ``match`` to ``pd.read_html`` to select
            the tables of interest.

    Returns:
        The first matching table as a ``pandas.DataFrame``, or an empty
        DataFrame when nothing could be read. The result is always a single
        DataFrame, never a list.
    """
    try:
        tables = pd.read_html(web_url, match=matching_keyword)
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Could not read HTML source %r (match=%r): %s",
            web_url,
            matching_keyword,
            exc,
        )
        return pd.DataFrame()

    # read_html yields a list of tables; hand back the first one so success
    # and failure paths return the same type.
    return tables[0] if tables else pd.DataFrame()

### Data Extraction From All Sources

In [46]:
def data_extraction():
    """Run every source extractor against its sample input.

    The extractors are called in the order Parquet, CSV, API, web page.

    Returns:
        A 4-tuple ``(parquet, csv, api, html)`` holding whatever each
        extractor returned.
    """
    parquet_result = parquet_data_source('yellow_tripdata_2022-01.parquet')
    csv_result = csv_data_source('h9gi-nx95.csv')
    api_result = api_data_source(
        'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500'
    )
    html_result = web_data_source(
        'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)',
        'by country',
    )
    return parquet_result, csv_result, api_result, html_result

In [47]:
# Run all four extractors; as the last expression of the cell, the returned
# tuple is displayed as the cell output.
data_extraction()

200


(         VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
 0               1  2022-01-01 00:35:40   2022-01-01 00:53:29              2.0   
 1               1  2022-01-01 00:33:43   2022-01-01 00:42:07              1.0   
 2               2  2022-01-01 00:53:21   2022-01-01 01:02:19              1.0   
 3               2  2022-01-01 00:25:21   2022-01-01 00:35:23              1.0   
 4               2  2022-01-01 00:36:48   2022-01-01 01:14:20              1.0   
 ...           ...                  ...                   ...              ...   
 2463926         2  2022-01-31 23:36:53   2022-01-31 23:42:51              NaN   
 2463927         2  2022-01-31 23:44:22   2022-01-31 23:55:01              NaN   
 2463928         2  2022-01-31 23:39:00   2022-01-31 23:50:00              NaN   
 2463929         2  2022-01-31 23:36:42   2022-01-31 23:48:45              NaN   
 2463930         2  2022-01-31 23:46:00   2022-02-01 00:13:00              NaN   
 
          trip