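# Extract stage of an ETL (Extract-Transform-Load) pipeline
This notebook fetches random user profiles from the randomuser.me API and saves them as a JSON batch file.

[Project source on GitHub](https://github.com/stanislavlia/datascience_club_projects/tree/main/project1_etl_pipeline)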

In [38]:
# Make sure the `venv` module is installed. `apt-get` with `-y` answers the
# confirmation prompt automatically, so the cell never waits for input.
!apt-get install -y python3-venv

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3-venv is already the newest version (3.10.6-1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [39]:
# Create a virtual environment named `etl_venv` (relative path, i.e. under the
# current working directory).
!python3 -m venv etl_venv

In [40]:
# NOTE(review): a `!` command runs in its own temporary subshell, so this
# activation is not expected to carry over to later cells. The `pip show`
# output further down reports /usr/local/lib/python3.10/dist-packages rather
# than etl_venv. Confirm whether the venv is meant to be used at all.
!source etl_venv/bin/activate

In [41]:
# List the packages visible to the `pip` executable found on PATH.
!pip list

Package                            Version
---------------------------------- --------------------
absl-py                            1.4.0
accelerate                         0.34.2
aiohappyeyeballs                   2.4.3
aiohttp                            3.10.10
aiosignal                          1.3.1
alabaster                          0.7.16
albucore                           0.0.16
albumentations                     1.4.15
altair                             4.2.2
annotated-types                    0.7.0
anyio                              3.7.1
argon2-cffi                        23.1.0
argon2-cffi-bindings               21.2.0
array_record                       0.5.1
arviz                              0.19.0
astropy                            6.1.4
astropy-iers-data                  0.2024.10.21.0.33.21
astunparse                         1.6.3
async-timeout                      4.0.3
atpublic                           4.1.0
attrs                              24.2.0
audioread      

In [42]:
# Install the third-party dependencies with the %pip magic so they are
# installed into the environment of the running kernel (a plain `!pip` may
# resolve to a different interpreter).
# requests and tqdm are pinned to the versions reported by `pip show` in this
# notebook. TODO: pin click as well once its version is confirmed.
%pip install -q --upgrade pip
%pip install -q requests==2.32.3 tqdm==4.66.5 click



In [43]:
# Print pip's metadata report for each dependency used below; the echo lines
# emit a visual separator between the individual reports.
!pip show requests
!echo "===================="
!pip show tqdm
!echo "===================="
!pip show click

Name: requests
Version: 2.32.3
Summary: Python HTTP for Humans.
Home-page: https://requests.readthedocs.io
Author: Kenneth Reitz
Author-email: me@kennethreitz.org
License: Apache-2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: certifi, charset-normalizer, idna, urllib3
Required-by: bigframes, CacheControl, community, diffusers, earthengine-api, fastai, folium, gcsfs, gdown, geocoder, google-api-core, google-cloud-bigquery, google-cloud-storage, google-colab, huggingface-hub, kaggle, kagglehub, moviepy, music21, pandas-datareader, panel, pooch, pymystem3, requests-oauthlib, spacy, Sphinx, tensorflow, tensorflow-datasets, transformers, tweepy, wandb, weasel, yfinance
Name: tqdm
Version: 4.66.5
Summary: Fast, Extensible Progress Meter
Home-page: https://tqdm.github.io
Author: 
Author-email: 
License: MPL-2.0 AND MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: bigquery-magics, cmdstanpy, dopamine_rl, gdown, google-generativeai, huggingface-hub,

In [44]:
import requests
from pprint import pprint
import json
from tqdm import tqdm
import time
import click

In [45]:
# Endpoint that load_batch_data requests once per user to obtain a random profile.
RANDOMUSER_API_URL = "https://randomuser.me/api/"

In [46]:
def parse_json(user_json: dict) -> dict:
    """Flatten the first entry of ``user_json["results"]`` into a one-level dict.

    Args:
        user_json: Decoded API payload. Only ``results[0]`` is read.

    Returns:
        A flat dict with identity, location, login, contact, date and picture
        fields. Values looked up with ``.get`` (email, gender, login fields,
        phone, cell, dates, age, photo link) are ``None`` when absent.

    Raises:
        KeyError: If a mandatory key (``results``, ``id``, ``name``,
            ``location`` and its sub-keys, ``login``, ``dob``, ``registered``,
            ``picture``) is missing.
        IndexError: If ``results`` is empty.
    """
    record = user_json["results"][0]

    # Identifier is rendered as "<name> <value>"; the value is stringified as-is.
    identifier = record["id"]
    user_id = identifier["name"] + " " + str(identifier["value"])

    # Name
    full_name = record["name"]
    given = full_name["first"]
    family = full_name["last"]

    # Location (mandatory keys, looked up in the same order as before)
    place = record["location"]
    city = place["city"]
    country = place["country"]
    coords = place["coordinates"]
    latitude = coords["latitude"]
    longitude = coords["longitude"]
    postcode = place["postcode"]
    state = place["state"]
    street = place["street"]
    street_info = f"{street['name']}, {street['number']}"

    # Optional top-level fields
    email = record.get("email")
    gender = record.get("gender")

    # Login credentials
    credentials = record["login"]
    uuid = credentials.get("uuid")
    username = credentials.get("username")
    password = credentials.get("password")

    # Contact numbers
    phone = record.get("phone")
    cell = record.get("cell")

    # Birth and registration dates
    birth = record["dob"]
    born_on = birth.get("date")
    age = birth.get("age")
    registered_on = record["registered"].get("date")

    # Large profile picture URL
    photo = record["picture"].get("large")

    flat_user = {
        "id": user_id,
        "firstname": given,
        "lastname": family,
        "location_city": city,
        "location_country": country,
        "location_state": state,
        "location_latitude": latitude,
        "location_longitude": longitude,
        "location_postcode": postcode,
        "location_street_info": street_info,
        "email": email,
        "gender": gender,
        "login_uuid": uuid,
        "login_username": username,
        "login_password": password,
        "phone": phone,
        "cell": cell,
        "date_of_birth": born_on,
        "age": age,
        "date_of_registration": registered_on,
        "photo_link": photo,
    }
    return flat_user

In [47]:
def fetch_user_from_api(url : str, timeout: float = 10.0) -> dict:
    """Request one user from the API and return it flattened by ``parse_json``.

    Args:
        url: Endpoint to send the GET request to.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The flat user dict produced by ``parse_json``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        requests.exceptions.HTTPError: If the response status is 4xx/5xx.
        KeyError, IndexError: If the payload lacks the fields ``parse_json`` needs.
    """
    response = requests.get(url=url, timeout=timeout)
    # Fail early with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    user_json = response.json()
    return parse_json(user_json)

In [48]:
def load_batch_data(result_path : str, n_users : int):
    """Fetch ``n_users`` users one request at a time and save them as JSON.

    Args:
        result_path: Path of the JSON file to write (overwritten if it exists).
        n_users: Number of users to request from ``RANDOMUSER_API_URL``.

    The output file holds ``{"n_users": <requested count>, "users": [...]}``.
    Any exception raised while fetching a user propagates, and nothing is
    written in that case.
    """
    print(f"Collecting data from {RANDOMUSER_API_URL}; n_users = {n_users}")

    users = [
        fetch_user_from_api(url=RANDOMUSER_API_URL)
        for _ in tqdm(range(n_users), desc="Fetching users from API...")
    ]

    batch_data = {"n_users" : n_users,
                  "users" : users}

    print("Saving users to file ", result_path)
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform default, which may
    # be unable to encode them.
    with open(result_path, "w", encoding="utf-8") as file:
        json.dump(batch_data, file, indent=2, ensure_ascii=False)

    print("JOB DONE")

In [49]:
# Both options are required: without them click would pass None, which fails
# later inside range()/open() with an unhelpful traceback.
@click.command()
@click.option('--result_path', type=str, required=True, help='Path to save loaded batch of users')
@click.option('--n_users', type=click.IntRange(min=0), required=True, help='How many users to fetch from API')
def load_batch_cli(result_path: str, n_users: int):
    """Fetch N_USERS users from the API and save them to RESULT_PATH as JSON."""
    load_batch_data(result_path=result_path,
                    n_users=n_users)

In [None]:
# load_batch_cli()

In [59]:
# Notebook entry point: runs the extraction with fixed arguments instead of
# values parsed from the command line.
if __name__ == "__main__":
    import sys
    import click

    # Keep only the first element of sys.argv.
    # NOTE(review): presumably this hides the Jupyter kernel's own arguments
    # from click. Confirm it is still needed, since the option values are
    # passed explicitly to ctx.invoke below.
    sys.argv = sys.argv[:1]

    # Output file and number of users for this run.
    path = 'output.json'
    num = 100

    # Invoke the command through a click Context, supplying the option values
    # as keyword arguments.
    with click.Context(load_batch_cli) as ctx:
        ctx.invoke(load_batch_cli, result_path=path, n_users=num)

Collecting data from https://randomuser.me/api/; n_users = 100


Fetching users from API...: 100%|██████████| 100/100 [00:13<00:00,  7.32it/s]

Saving users to file  output.json
JOB DONE





In [None]:
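# How to use the script version of this notebook (extract.py).
# NOTE: this assumes extract.py calls load_batch_cli() as its entry point, so
# that the options below are parsed from the command line. Confirm against the
# repository.
# python3 extract.py --result_path batch100users.json --n_users 100
# python3 extract.py --result_path batch15users.json --n_users 15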