# Transform of ETL (Extract-Transform-Load) pipeline
[Link to GitHub](https://github.com/stanislavlia/datascience_club_projects/blob/main/project1_etl_pipeline/transform.py)

In [380]:
# Import Pandas and json modules
import pandas as pd
import json

In [381]:
# Define the file path for the batch of users (extracted in a previous phase) in JSON format
# Note: Update the path as needed to point to the correct location of your batch file
BATCH_PATH = "/content/drive/MyDrive/42_DS_Club/01_ETL_pipeline/batch1000users.json"

In [382]:
# Open the JSON file at the specified BATCH_PATH in read mode
with open(BATCH_PATH, "r") as file:
    # Load the JSON content from the file into 'batch_json' dictionary
    batch_json = json.load(file)

In [383]:
from random import randint

# Select a random user from the 'users' list in 'batch_json'
batch_json['users'][randint(0, len(batch_json['users']) - 1)]

{'id': 'CPF 908.501.136-90',
 'firstname': 'Raquélen',
 'lastname': 'Ramos',
 'location_city': 'Piracicaba',
 'location_country': 'Brazil',
 'location_state': 'Rio Grande do Sul',
 'location_latitude': '-77.6499',
 'location_longitude': '135.5003',
 'location_postcode': 31767,
 'location_street_info': 'Rua Belo Horizonte , 3478',
 'email': 'raquelen.ramos@example.com',
 'gender': 'female',
 'login_uuid': 'cc2d52ac-3152-4110-bb78-748ab8e5cdf6',
 'login_username': 'orangeladybug336',
 'login_password': 'jungle',
 'phone': '(59) 3789-4605',
 'cell': '(47) 6803-5595',
 'date_of_birth': '1975-10-29T08:28:55.575Z',
 'age': 49,
 'date_of_registration': '2015-12-10T23:10:48.694Z',
 'photo_link': 'https://randomuser.me/api/portraits/women/71.jpg',
 'extract_time': '2024-11-05 10:07:35.349456'}

**[Pandas](https://pandas.pydata.org/)**

Pandas is a powerful, open-source data analysis and manipulation library for Python. It’s widely used in data science, finance, statistics, and any field where data analysis is essential. With Pandas, you can effortlessly work with large datasets, transforming raw data into clean, organized formats.

The primary data structure in Padas is **[DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)**. It is two-dimensional, size-mutable, potentially heterogeneous tabular data.

For an easy introduction to Pandas, it’s **highly** recommended to complete a **[Pandas course on Kaggle](https://www.kaggle.com/learn/pandas)**

In [384]:
# Create a DataFrame from the 'users' data in the dictionary 'batch_json'
df = pd.DataFrame(batch_json["users"])

In [385]:
# Display the first 3 rows of the DataFrame
df.head(3)

Unnamed: 0,id,firstname,lastname,location_city,location_country,location_state,location_latitude,location_longitude,location_postcode,location_street_info,...,login_uuid,login_username,login_password,phone,cell,date_of_birth,age,date_of_registration,photo_link,extract_time
0,,رضا,سلطانی نژاد,بندرعباس,Iran,تهران,14.712,-164.155,38135,"میدان امام خمینی, 1028",...,6581a435-c5ea-4ad7-b590-97ed42e2bd88,organicbear655,falcon1,021-81785565,0966-022-6250,1970-03-07T17:28:08.557Z,54,2019-12-22T21:24:10.015Z,https://randomuser.me/api/portraits/men/83.jpg,2024-11-05 10:06:55.380951
1,CPF 233.903.426-12,Itatiara,Cardoso,Bauru,Brazil,Rio Grande do Norte,-78.3875,155.2252,78883,"Rua Santa Maria , 5839",...,72e59a1b-cee3-44e0-9750-9319ce387aa4,angrymeercat308,feeling,(92) 5466-2079,(10) 6903-5933,1949-08-09T14:36:44.999Z,75,2014-05-05T22:55:47.073Z,https://randomuser.me/api/portraits/women/73.jpg,2024-11-05 10:06:55.552323
2,INSEE 2870263766424 52,Rose,Garcia,Amiens,France,Dordogne,-19.7834,52.149,48134,"Rue de la Barre, 7479",...,e51eba03-6141-40c3-88d3-4eb3fb393acc,brownfish926,slipper,04-15-26-78-47,06-66-43-30-81,1987-03-07T12:50:30.019Z,37,2006-06-19T05:56:30.214Z,https://randomuser.me/api/portraits/women/92.jpg,2024-11-05 10:06:55.676033


In [386]:
# Display a random sample of 5 rows from the DataFrame
df.sample(5)

Unnamed: 0,id,firstname,lastname,location_city,location_country,location_state,location_latitude,location_longitude,location_postcode,location_street_info,...,login_uuid,login_username,login_password,phone,cell,date_of_birth,age,date_of_registration,photo_link,extract_time
330,NSS 00 83 18 3477 4,Miguel,Garica,Estación de Apulco,Mexico,Queretaro,22.803,-74.6694,34942,"Periférico Norte Tijerina, 3515",...,b5f5c4a4-9caf-4e19-a068-268f43c7b5cb,happylion221,987654,(621) 847 3871,(677) 841 2317,1967-09-01T13:06:54.858Z,57,2013-07-21T19:19:01.266Z,https://randomuser.me/api/portraits/men/97.jpg,2024-11-05 10:07:59.580586
140,NSS 76 02 92 7574 6,Benito,Guajardo,San José Temascatio,Mexico,Tamaulipas,23.5202,-91.874,32688,"Andador Sur Urbina, 9353",...,262e1bd4-a476-4fcf-83c8-e2179b05907f,silverlion926,buzz,(667) 035 0490,(664) 910 6794,1951-10-24T16:34:03.649Z,73,2004-08-02T09:28:59.445Z,https://randomuser.me/api/portraits/men/49.jpg,2024-11-05 10:07:25.084774
615,PPS 5809199T,Seamus,Wilson,Blessington,Ireland,Fingal,-0.5453,-50.9165,13896,"Strand Road, 3993",...,dfe6c140-03f6-4cc0-b0c9-be965e5d9708,silverlion730,717171,031-838-6405,081-468-8654,1996-11-03T11:19:56.917Z,28,2013-10-15T18:08:01.853Z,https://randomuser.me/api/portraits/men/38.jpg,2024-11-05 10:08:51.972933
412,SIN 331579607,Jade,Young,Fountainbleu,Canada,Newfoundland and Labrador,-68.0259,86.385,G5I 3Q5,"Arctic Way, 8524",...,8f21d256-06ea-4f6b-8829-1619c51884e0,orangebutterfly484,break,I05 O38-0812,L75 G92-8745,1950-04-15T14:05:40.538Z,74,2013-12-21T05:25:10.669Z,https://randomuser.me/api/portraits/women/75.jpg,2024-11-05 10:08:16.005951
370,BSN 78468499,Dinja,Van der Gugten,Bunne,Netherlands,Flevoland,38.0887,147.7505,2215 DE,"Blotekamperweg, 2215",...,3c603222-24b9-4f74-a828-2e02fb85802a,orangedog908,bull,(0944) 408686,(06) 47176735,1947-09-10T00:31:59.993Z,77,2005-02-22T13:49:39.347Z,https://randomuser.me/api/portraits/women/93.jpg,2024-11-05 10:08:08.198711


In [387]:
# Display the last 3 rows of the DataFrame
df.tail(3)

Unnamed: 0,id,firstname,lastname,location_city,location_country,location_state,location_latitude,location_longitude,location_postcode,location_street_info,...,login_uuid,login_username,login_password,phone,cell,date_of_birth,age,date_of_registration,photo_link,extract_time
997,TFN 088005704,Sally,Hamilton,Shepparton,Australia,Tasmania,-87.4114,3.2026,4664,"W 6th St, 7010",...,d539bdbb-448a-48d8-be28-f39429ac7e54,orangewolf304,bottom,05-6748-3231,0489-110-653,1978-06-14T18:03:56.811Z,46,2008-05-15T17:52:47.225Z,https://randomuser.me/api/portraits/women/13.jpg,2024-11-05 10:10:39.267692
998,,آوینا,کامروا,بیرجند,Iran,مرکزی,13.1566,93.6851,58932,"راستوان, 1025",...,1d0a154a-86ca-4cd7-926e-94edea48772c,whitemeercat369,glotest,062-54417867,0950-072-2582,1996-02-29T06:25:10.969Z,28,2018-12-07T10:56:56.997Z,https://randomuser.me/api/portraits/women/87.jpg,2024-11-05 10:10:39.456009
999,SSN 773-20-5816,Marc,Wilson,Las Cruces,United States,Alabama,81.5999,-88.0205,56227,"Hamilton Ave, 2109",...,af735708-e841-4502-bcb6-3dcb24ea510b,sadgorilla272,salsero,(338) 730-0780,(482) 657-7576,1993-03-27T18:21:54.211Z,31,2018-04-29T20:33:40.445Z,https://randomuser.me/api/portraits/men/70.jpg,2024-11-05 10:10:39.588519


In [388]:
# Display a concise summary of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    1000 non-null   object
 1   firstname             1000 non-null   object
 2   lastname              1000 non-null   object
 3   location_city         1000 non-null   object
 4   location_country      1000 non-null   object
 5   location_state        1000 non-null   object
 6   location_latitude     1000 non-null   object
 7   location_longitude    1000 non-null   object
 8   location_postcode     1000 non-null   object
 9   location_street_info  1000 non-null   object
 10  email                 1000 non-null   object
 11  gender                1000 non-null   object
 12  login_uuid            1000 non-null   object
 13  login_username        1000 non-null   object
 14  login_password        1000 non-null   object
 15  phone                 1000 non-null   o

In [389]:
# Select the 'location_latitude' and 'location_longitude' columns from the DataFrame
df[['location_latitude', 'location_longitude']]

Unnamed: 0,location_latitude,location_longitude
0,14.7120,-164.1550
1,-78.3875,155.2252
2,-19.7834,52.1490
3,84.0463,139.9236
4,89.4772,-171.7408
...,...,...
995,54.9202,45.6438
996,-39.2062,41.4535
997,-87.4114,3.2026
998,13.1566,93.6851


In [390]:
# Select the 'location_latitude' and 'location_longitude' columns from the DataFrame using the .get() method
df.get(['location_latitude', 'location_longitude'])

Unnamed: 0,location_latitude,location_longitude
0,14.7120,-164.1550
1,-78.3875,155.2252
2,-19.7834,52.1490
3,84.0463,139.9236
4,89.4772,-171.7408
...,...,...
995,54.9202,45.6438
996,-39.2062,41.4535
997,-87.4114,3.2026
998,13.1566,93.6851


In [391]:
# Select the 'location_latitude' and 'location_longitude' columns from the DataFrame using the .loc() method
df.loc[:, ['location_latitude', 'location_longitude']]

Unnamed: 0,location_latitude,location_longitude
0,14.7120,-164.1550
1,-78.3875,155.2252
2,-19.7834,52.1490
3,84.0463,139.9236
4,89.4772,-171.7408
...,...,...
995,54.9202,45.6438
996,-39.2062,41.4535
997,-87.4114,3.2026
998,13.1566,93.6851


In [392]:
# Convert the 'location_longitude' and 'location_latitude' columns to float type
df['location_longitude'] = df['location_longitude'].astype(float)
df['location_latitude'] = df['location_latitude'].astype(float)

In [393]:
# Get the data types of the 'location_latitude' and 'location_longitude' columns
df.get(['location_latitude', 'location_longitude']).dtypes

Unnamed: 0,0
location_latitude,float64
location_longitude,float64


In [394]:
# Retrieve the column names of the DataFrame 'df'
df.columns

Index(['id', 'firstname', 'lastname', 'location_city', 'location_country',
       'location_state', 'location_latitude', 'location_longitude',
       'location_postcode', 'location_street_info', 'email', 'gender',
       'login_uuid', 'login_username', 'login_password', 'phone', 'cell',
       'date_of_birth', 'age', 'date_of_registration', 'photo_link',
       'extract_time'],
      dtype='object')

Python has a wide range of modules for most typical tasks. While using external modules carries some risk, it is often more productive than "reinventing the wheel". In our dataset, we have a column called `'location_country'`, which currently contains strings that are not very useful. But, if we convert these strings into a standardized format (such as [ISO](https://www.iso.org/)), we can extract some valuable information.

Regarding `'location_country'` column: we can convert the country names to [ISO2](https://www.iban.com/country-codes/) format using the [country-converter](https://pypi.org/project/country-converter/) module.

In [395]:
# Install the 'country_converter' package
!pip install country_converter



In [396]:
# Import 'country_converter' module
import country_converter

In [397]:
# Define a function to convert a country name to ISO2 code
def convert_country_code(country_name: str) -> str:
  iso_code = country_converter.convert(country_name, to='ISO2')
  return iso_code

# Apply the convert_country_code function to the 'location_country' column
# and create a new column 'country_iso2' with the ISO2 country codes
df['country_iso2'] = df['location_country'].apply(convert_country_code)

In [398]:
# Count the occurrences of unique combinations of the values (including NaN)
# in the 'location_country' and 'country_iso2' columns
df.get(['location_country', 'country_iso2']).value_counts(dropna=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
location_country,country_iso2,Unnamed: 2_level_1
United States,US,64
Australia,AU,63
India,IN,56
Iran,IR,55
Mexico,MX,51
United Kingdom,GB,51
Spain,ES,51
Turkey,TR,49
Netherlands,NL,49
France,FR,49


In [399]:
# Count the number of NaN values in the 'phone' and 'cell' columns
df.get(['phone', 'cell']).isna().sum()

Unnamed: 0,0
phone,0
cell,0


In [400]:
# Select the 'phone' and 'cell' columns
df.get(['phone', 'cell'])

Unnamed: 0,phone,cell
0,021-81785565,0966-022-6250
1,(92) 5466-2079,(10) 6903-5933
2,04-15-26-78-47,06-66-43-30-81
3,(854) 819-2025,(855) 844-6892
4,(854) 379-2952,(326) 767-5770
...,...,...
995,(974)-843-4246,(556)-367-5935
996,48888465,27926269
997,05-6748-3231,0489-110-653
998,062-54417867,0950-072-2582


The `'phone'` and `'cell'` columns do not contain missing (`NaN`) values, but the phone number formats differ, making the values unclear. A good approach is [to standardize all phone numbers](https://medium.com/@marc.bolle/fetching-and-formatting-phone-numbers-in-python-794ae16ab198) according to the [E.164](https://en.wikipedia.org/wiki/E.164) format using the [phonenumbers module](https://github.com/daviddrysdale/python-phonenumbers/tree/dev) [(documentation available)](https://daviddrysdale.github.io/python-phonenumbers/#-is_valid_number).

In [401]:
!pip3 install phonenumbers



In [402]:
import phonenumbers as ph
from phonenumbers.phonenumberutil import NumberParseException

The relevant methods of the module do not correctly convert US and Canadian numbers that contain letters and do not fully parse all `'()'`, and `'-'` characters. Here is my simple function to handle this:

In [403]:
# Define a function to convert alpha characters to their corresponding digits
def convert_alpha(phone_number):
  # Mapping of alpha characters to their respective digits on a phone keypad
  letters_to_digit = {
      'A': '2', 'B': '2', 'C': '2',
      'D': '3', 'E': '3', 'F': '3',
      'G': '4', 'H': '4', 'I': '4',
      'J': '5', 'K': '5', 'L': '5',
      'M': '6', 'N': '6', 'O': '6',
      'P': '7', 'Q': '7', 'R': '7', 'S': '7',
      'T': '8', 'U': '8', 'V': '8',
      'W': '9', 'X': '9', 'Y': '9', 'Z': '9'
  }

  # Skips any character that is not a number or in the map
  # Convert the character to its corresponding digit if it is in the map
  return ''.join(letters_to_digit.get(c, c) for c in phone_number.upper()
                   if letters_to_digit.get(c, c).isdigit())

In [404]:
# Define a function to normalize a phone number
def normalize_number(number: str, cc_iso2: str) -> str:
  try:
    # Parse the phone number after converting alphabetic characters using the ISO2 code
    number_obj = ph.parse(convert_alpha(number), region=cc_iso2)
  except NumberParseException as e:
    return 'number_not_parsed'
  # Check if the parsed number is valid
    if not ph.is_valid_number(number_obj):
      return 'number_not_valid'
  # Format the valid number to E.164 format
  normalized = ph.format_number(number_obj, ph.PhoneNumberFormat.E164)
  return normalized

By the way, without ISO2 code the `'parse'` method does not work

In [405]:
# Normalize the 'phone' and 'cell' columns for each row
# Store the results in a new columns 'normalized_phone' and 'normalized_cell'
df['normalized_phone'] = df.apply(lambda row: normalize_number(row['phone'], row['country_iso2']), axis=1)
df['normalized_cell'] = df.apply(lambda row: normalize_number(row['cell'], row['country_iso2']), axis=1)

In [406]:
# Count the occurrences of unique value in the 'normalized_phone', including NaN's
df.get(['normalized_phone']).value_counts(dropna=False)

Unnamed: 0_level_0,count
normalized_phone,Unnamed: 1_level_1
+12129122479,1
+526717940692,1
+526400434466,1
+526416590475,1
+526461900536,1
...,...
+3589390326,1
+3589452734,1
+3589482310,1
+3589560013,1


In [407]:
# Count the occurrences of unique value in the 'normalized_cell', including NaN's
df.get(['normalized_cell']).value_counts(dropna=False)

Unnamed: 0_level_0,count
normalized_cell,Unnamed: 1_level_1
+12066122895,1
+526725957874,1
+526393494960,1
+526400332382,1
+526457145297,1
...,...
+358490435669,1
+358491129409,1
+358492754082,1
+358493019113,1


In [408]:
from datetime import datetime

[Python Datetime](https://www.w3schools.com/python/python_datetime.asp)

[pandas.to_datetime](https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html)

In [409]:
# Convert data-related columns to datetime object
df["date_of_registration"] = pd.to_datetime(df["date_of_registration"])
df["date_of_birth"] = pd.to_datetime(df["date_of_birth"])
df["extract_time"] = pd.to_datetime(df["extract_time"])

In [410]:
# Extract the year, month, and day from 'date_of_registration' and create separate columns
df["year_of_registration"] = df["date_of_registration"].dt.year
df["month_of_registration"] = df["date_of_registration"].dt.month
df["day_of_registration"] = df["date_of_registration"].dt.day

In [411]:
# Standardize the 'gender' column by replacing "male" with "M" and "female" with "F"
df["gender"] = df["gender"].replace({"male": "M", "female": "F"})

In [412]:
# Store in new columns the lenght's values of the 'login_password' and 'login_username'
df["password_length"] = df["login_password"].apply(len)
df["loging_length"] = df["login_username"].apply(len)


It is strongly reccomended to set time stamp for tracking data modifications:

In [413]:
# Add the current timestamp to the new 'transform_timestamp' column
df["transform_timestamp"] = datetime.now()

Here we stop. But even in this simple and plain dataset, there are plenty of opportunities for verifying and converting data, as well as creating new features that can provide deeper insights.

In [414]:
output_path = '/content/drive/MyDrive/42_DS_Club/01_ETL_pipeline/transformed_batch1000users.csv'
df.to_csv(output_path)