# Transform of ETL (Extract-Transform-Load) pipeline
[Link to GitHub](https://github.com/stanislavlia/datascience_club_projects/blob/main/project1_etl_pipeline/transform.py)

In [112]:
!pip3 install pandas



In [113]:
# Import Pandas and json modules
import pandas as pd
import json

In [114]:
# Define the file path for the batch of users (extracted in a previous phase) in JSON format
# Note: Update the path as needed to point to the correct location of your batch file
# BATCH_PATH = '/content/drive/MyDrive/42_DS_Club/01_ETL_pipeline/batch1000users.json'
BATCH_PATH = 'batch1000users.json'

In [115]:
# Open the JSON file at the specified BATCH_PATH in read mode
with open(BATCH_PATH, "r") as file:
    # Load the JSON content from the file into 'batch_json' dictionary
    batch_json = json.load(file)

In [116]:
from random import randint

# Select a random user from the 'users' list in 'batch_json'
batch_json['users'][randint(0, len(batch_json['users']) - 1)]

{'id': ' None',
 'firstname': 'Ceyhan',
 'lastname': 'Poçan',
 'location_city': 'Samsun',
 'location_country': 'Turkey',
 'location_state': 'Tokat',
 'location_latitude': '69.5728',
 'location_longitude': '114.0988',
 'location_postcode': 81130,
 'location_street_info': 'Maçka Cd, 8777',
 'email': 'ceyhan.pocan@example.com',
 'gender': 'female',
 'login_uuid': '3b41ebab-c0cd-4895-903c-0429fb854724',
 'login_username': 'smallbird327',
 'login_password': 'camilla',
 'phone': '(753)-241-3063',
 'cell': '(577)-404-8976',
 'date_of_birth': '1954-10-31T21:25:45.113Z',
 'age': 70,
 'date_of_registration': '2013-08-10T23:30:47.660Z',
 'photo_link': 'https://randomuser.me/api/portraits/women/96.jpg',
 'extract_time': '2024-11-06 19:28:53.390692'}

**[Pandas](https://pandas.pydata.org/)**

Pandas is a powerful, open-source data analysis and manipulation library for Python. It’s widely used in data science, finance, statistics, and any field where data analysis is essential. With Pandas, you can effortlessly work with large datasets, transforming raw data into clean, organized formats.

The primary data structure in Padas is **[DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)**. It is two-dimensional, size-mutable, potentially heterogeneous tabular data.

For an easy introduction to Pandas, it’s **highly** recommended to complete a **[Pandas course on Kaggle](https://www.kaggle.com/learn/pandas)**

In [117]:
# Create a DataFrame from the 'users' data in the dictionary 'batch_json'
df = pd.DataFrame(batch_json["users"])

In [118]:
# Display the first 3 rows of the DataFrame
df.head(3)

Unnamed: 0,id,firstname,lastname,location_city,location_country,location_state,location_latitude,location_longitude,location_postcode,location_street_info,...,login_uuid,login_username,login_password,phone,cell,date_of_birth,age,date_of_registration,photo_link,extract_time
0,,Tyler,Green,Blenheim,New Zealand,Auckland,52.0345,-87.3133,61785,"Harewood Road, 1254",...,f3225009-aff7-40c3-bbf1-a0691aadda5f,whitecat703,quest1,(066)-886-3947,(977)-396-9271,1947-11-19T22:56:37.545Z,76,2009-11-05T21:18:51.984Z,https://randomuser.me/api/portraits/men/57.jpg,2024-11-06 19:25:37.723380
1,AVS 756.6533.3915.87,Rocco,Le Gall,Murgenthal,Switzerland,Graubünden,-57.6771,-86.0432,3461,"Rue de la Mairie, 568",...,d740a95c-bbc3-4905-a9d0-a499471bcf79,ticklishbear663,madrid,077 077 94 68,075 506 56 98,1961-08-25T15:42:41.778Z,63,2014-03-02T00:10:50.552Z,https://randomuser.me/api/portraits/men/61.jpg,2024-11-06 19:25:38.258638
2,,Adem,Keseroğlu,Adıyaman,Turkey,Konya,65.3337,-51.0028,54131,"Fatih Sultan Mehmet Cd, 6502",...,07cac087-535a-4cbb-9e1a-9212470c1933,beautifulostrich658,thing,(240)-806-2105,(427)-505-7703,1953-01-11T22:43:11.092Z,71,2003-09-26T14:13:38.572Z,https://randomuser.me/api/portraits/men/17.jpg,2024-11-06 19:25:38.731692


In [119]:
# Display a random sample of 5 rows from the DataFrame
df.sample(5)

Unnamed: 0,id,firstname,lastname,location_city,location_country,location_state,location_latitude,location_longitude,location_postcode,location_street_info,...,login_uuid,login_username,login_password,phone,cell,date_of_birth,age,date_of_registration,photo_link,extract_time
20,,بیتا,رضایی,آبادان,Iran,قم,21.0535,153.1912,72440,"شهید کبیری طامه, 7328",...,43a3b808-b303-455a-bd7e-fa37281c3c42,whitezebra680,joanne,050-82731538,0902-552-6118,1956-07-26T00:22:39.576Z,68,2011-07-24T22:14:40.045Z,https://randomuser.me/api/portraits/women/18.jpg,2024-11-06 19:25:48.867410
988,BSN 67293686,Buşra,Swagerman,Gendt,Netherlands,Friesland,31.251,78.2662,5342 NT,"Dammaat, 6648",...,1ae8a543-4293-4900-817c-a8aae1e01968,smallmouse553,jessica1,(008) 0054528,(06) 17574400,1983-04-01T23:26:57.747Z,41,2015-11-13T22:52:18.184Z,https://randomuser.me/api/portraits/women/89.jpg,2024-11-06 19:35:35.427764
930,INSEE 2900044111510 65,Gabrielle,Nguyen,Caen,France,Paris,-19.6525,14.3837,30259,"Rue Gasparin, 3657",...,6bbf5e3c-3642-4ff3-becb-27cb2a3d93c2,sadcat541,rockies,04-28-52-81-26,06-10-81-07-85,1990-01-30T16:16:50.316Z,34,2016-03-19T23:40:19.053Z,https://randomuser.me/api/portraits/women/14.jpg,2024-11-06 19:34:57.916572
880,CPF 621.543.459-57,Marivana,Alves,Paranaguá,Brazil,São Paulo,-77.2015,142.3469,84585,"Rua Primeiro de Maio , 1147",...,a0a4189f-d142-43e8-a1d6-5ad17f8c05f1,happyfrog668,splurge,(95) 7039-0899,(99) 5207-5201,1992-01-23T06:29:55.909Z,32,2003-11-16T02:45:04.598Z,https://randomuser.me/api/portraits/women/47.jpg,2024-11-06 19:34:27.806780
240,NSS 89 49 54 8420 4,Reina,Aguirre,Pinal de Amoles,Mexico,Yucatan,22.9805,152.9402,28864,"Continuación Iraq, 23",...,d074079f-392b-4ee8-943e-d206093edab0,tinyswan231,coming,(652) 986 7027,(601) 585 6011,1945-09-11T16:01:39.002Z,79,2010-02-07T22:45:58.818Z,https://randomuser.me/api/portraits/women/2.jpg,2024-11-06 19:28:04.090451


In [120]:
# Display the last 3 rows of the DataFrame
df.tail(3)

Unnamed: 0,id,firstname,lastname,location_city,location_country,location_state,location_latitude,location_longitude,location_postcode,location_street_info,...,login_uuid,login_username,login_password,phone,cell,date_of_birth,age,date_of_registration,photo_link,extract_time
997,SVNR 47 090294 H 449,Edelbert,Henschel,Stadtroda,Germany,Bremen,-52.8726,-71.1584,24549,"Tannenweg, 7053",...,35ebed42-f6bc-48ad-a2be-fda6250543b4,redswan948,bush,0130-3227206,0174-8458563,1994-02-09T18:19:05.342Z,30,2015-07-23T11:44:23.837Z,https://randomuser.me/api/portraits/men/43.jpg,2024-11-06 19:35:40.157342
998,SID 239569126,Slavoljub,Marinković,Bosilegrad,Serbia,Peć,61.2031,-132.7531,66528,"Gorskog Cara, 4793",...,b607427a-ae3f-4ed6-9be9-d3fd6ac23757,whitemeercat866,raider,026-7124-341,066-4919-175,1957-11-15T22:55:19.794Z,66,2021-02-24T06:29:56.808Z,https://randomuser.me/api/portraits/men/84.jpg,2024-11-06 19:35:40.647949
999,,Sedef,Çapanoğlu,Karaman,Turkey,Diyarbakır,69.3068,150.7686,85476,"Tunalı Hilmi Cd, 5290",...,ee188ad4-d739-45e1-b8ee-1984e218cd10,tinybutterfly610,toriamos,(115)-781-0298,(552)-874-1352,1973-09-14T14:39:46.251Z,51,2007-12-14T13:12:01.334Z,https://randomuser.me/api/portraits/women/87.jpg,2024-11-06 19:35:41.123036


In [121]:
# Display a concise summary of the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   id                    1000 non-null   object
 1   firstname             1000 non-null   object
 2   lastname              1000 non-null   object
 3   location_city         1000 non-null   object
 4   location_country      1000 non-null   object
 5   location_state        1000 non-null   object
 6   location_latitude     1000 non-null   object
 7   location_longitude    1000 non-null   object
 8   location_postcode     1000 non-null   object
 9   location_street_info  1000 non-null   object
 10  email                 1000 non-null   object
 11  gender                1000 non-null   object
 12  login_uuid            1000 non-null   object
 13  login_username        1000 non-null   object
 14  login_password        1000 non-null   object
 15  phone                 1000 non-null   o

In [122]:
# Select the 'location_latitude' and 'location_longitude' columns from the DataFrame
df[['location_latitude', 'location_longitude']]

Unnamed: 0,location_latitude,location_longitude
0,52.0345,-87.3133
1,-57.6771,-86.0432
2,65.3337,-51.0028
3,-63.5964,155.6821
4,-38.0238,-55.6996
...,...,...
995,55.6881,-126.4076
996,65.6947,-93.6611
997,-52.8726,-71.1584
998,61.2031,-132.7531


In [123]:
# Select the 'location_latitude' and 'location_longitude' columns from the DataFrame using the .get() method
df.get(['location_latitude', 'location_longitude'])

Unnamed: 0,location_latitude,location_longitude
0,52.0345,-87.3133
1,-57.6771,-86.0432
2,65.3337,-51.0028
3,-63.5964,155.6821
4,-38.0238,-55.6996
...,...,...
995,55.6881,-126.4076
996,65.6947,-93.6611
997,-52.8726,-71.1584
998,61.2031,-132.7531


In [124]:
# Select the 'location_latitude' and 'location_longitude' columns from the DataFrame using the .loc() method
df.loc[:, ['location_latitude', 'location_longitude']]

Unnamed: 0,location_latitude,location_longitude
0,52.0345,-87.3133
1,-57.6771,-86.0432
2,65.3337,-51.0028
3,-63.5964,155.6821
4,-38.0238,-55.6996
...,...,...
995,55.6881,-126.4076
996,65.6947,-93.6611
997,-52.8726,-71.1584
998,61.2031,-132.7531


In [125]:
# Convert the 'location_longitude' and 'location_latitude' columns to float type
df['location_longitude'] = df['location_longitude'].astype(float)
df['location_latitude'] = df['location_latitude'].astype(float)

In [126]:
# Get the data types of the 'location_latitude' and 'location_longitude' columns
df.get(['location_latitude', 'location_longitude']).dtypes

location_latitude     float64
location_longitude    float64
dtype: object

In [127]:
# Retrieve the column names of the DataFrame 'df'
df.columns

Index(['id', 'firstname', 'lastname', 'location_city', 'location_country',
       'location_state', 'location_latitude', 'location_longitude',
       'location_postcode', 'location_street_info', 'email', 'gender',
       'login_uuid', 'login_username', 'login_password', 'phone', 'cell',
       'date_of_birth', 'age', 'date_of_registration', 'photo_link',
       'extract_time'],
      dtype='object')

Python has a wide range of modules for most typical tasks. While using external modules carries some risk, it is often more productive than "reinventing the wheel". In our dataset, we have a column called `'location_country'`, which currently contains strings that are not very useful. But, if we convert these strings into a standardized format (such as [ISO](https://www.iso.org/)), we can extract some valuable information.

Regarding `'location_country'` column: we can convert the country names to [ISO2](https://www.iban.com/country-codes/) format using the [country-converter](https://pypi.org/project/country-converter/) module.

In [128]:
# Install the 'country_converter' package
!pip3 install country_converter



In [129]:
# Import 'country_converter' module
import country_converter

In [130]:
# Define a function to convert a country name to ISO2 code
def convert_country_code(country_name: str) -> str:
  iso_code = country_converter.convert(country_name, to='ISO2')
  return iso_code

# Apply the convert_country_code function to the 'location_country' column
# and create a new column 'country_iso2' with the ISO2 country codes
df['country_iso2'] = df['location_country'].apply(convert_country_code)

In [131]:
# Count the occurrences of unique combinations of the values (including NaN)
# in the 'location_country' and 'country_iso2' columns
df.get(['location_country', 'country_iso2']).value_counts(dropna=False)

location_country  country_iso2
Ireland           IE              60
Netherlands       NL              57
Spain             ES              56
Serbia            RS              54
Ukraine           UA              54
Brazil            BR              53
Turkey            TR              52
New Zealand       NZ              52
United States     US              50
Norway            NO              48
Australia         AU              48
Finland           FI              48
Iran              IR              46
United Kingdom    GB              45
Mexico            MX              44
Germany           DE              42
Canada            CA              41
Switzerland       CH              41
Denmark           DK              40
France            FR              35
India             IN              34
Name: count, dtype: int64

In [132]:
# Count the number of NaN values in the 'phone' and 'cell' columns
df.get(['phone', 'cell']).isna().sum()

phone    0
cell     0
dtype: int64

In [133]:
# Select the 'phone' and 'cell' columns
df.get(['phone', 'cell'])

Unnamed: 0,phone,cell
0,(066)-886-3947,(977)-396-9271
1,077 077 94 68,075 506 56 98
2,(240)-806-2105,(427)-505-7703
3,077 628 04 50,079 341 14 16
4,925-715-104,695-383-194
...,...,...
995,(846)-702-7439,(461)-385-7218
996,(607)-448-4494,(083)-968-5969
997,0130-3227206,0174-8458563
998,026-7124-341,066-4919-175


The `'phone'` and `'cell'` columns do not contain missing (`NaN`) values, but the phone number formats differ, making the values unclear. A good approach is [to standardize all phone numbers](https://medium.com/@marc.bolle/fetching-and-formatting-phone-numbers-in-python-794ae16ab198) according to the [E.164](https://en.wikipedia.org/wiki/E.164) format using the [phonenumbers module](https://github.com/daviddrysdale/python-phonenumbers/tree/dev) [(documentation available)](https://daviddrysdale.github.io/python-phonenumbers/#-is_valid_number).

In [134]:
!pip3 install phonenumbers



In [135]:
import phonenumbers as ph
from phonenumbers.phonenumberutil import NumberParseException

The relevant methods of the module do not correctly convert US and Canadian numbers that contain letters and do not fully parse all `'()'`, and `'-'` characters. Here is my simple function to handle this:

In [136]:
# Define a function to convert alpha characters to their corresponding digits
def convert_alpha(phone_number : str) -> str:
  # Mapping of alpha characters to their respective digits on a phone keypad
  letters_to_digit = {
      'A': '2', 'B': '2', 'C': '2',
      'D': '3', 'E': '3', 'F': '3',
      'G': '4', 'H': '4', 'I': '4',
      'J': '5', 'K': '5', 'L': '5',
      'M': '6', 'N': '6', 'O': '6',
      'P': '7', 'Q': '7', 'R': '7', 'S': '7',
      'T': '8', 'U': '8', 'V': '8',
      'W': '9', 'X': '9', 'Y': '9', 'Z': '9'
  }

  # Skips any character that is not a number or in the map
  # Convert the character to its corresponding digit if it is in the map
  return ''.join(letters_to_digit.get(c, c) for c in phone_number.upper()
                   if letters_to_digit.get(c, c).isdigit())

In [137]:
# Define a function to normalize a phone number
def normalize_number(number: str, cc_iso2: str) -> str:
  try:
    # Parse the phone number after converting alphabetic characters using the ISO2 code
    number_obj = ph.parse(convert_alpha(number), region=cc_iso2)
  except NumberParseException as e:
    return 'number_not_parsed'
  # Check if the parsed number is valid
    if not ph.is_valid_number(number_obj):
      return 'number_not_valid'
  # Format the valid number to E.164 format
  normalized = ph.format_number(number_obj, ph.PhoneNumberFormat.E164)
  return normalized

By the way, without ISO2 code the `'parse'` method does not work

In [138]:
# Normalize the 'phone' and 'cell' columns for each row
# Store the results in a new columns 'normalized_phone' and 'normalized_cell'
df['normalized_phone'] = df.apply(lambda row: normalize_number(row['phone'], row['country_iso2']), axis=1)
df['normalized_cell'] = df.apply(lambda row: normalize_number(row['cell'], row['country_iso2']), axis=1)

In [139]:
# Count the occurrences of unique value in the 'normalized_phone', including NaN's
df.get(['normalized_phone']).value_counts(dropna=False)

normalized_phone 
number_not_parsed    2
+986657982821        1
+986539563284        1
+12608307760         1
+13963925868         1
                    ..
+989084560917        1
+989198460257        1
+989593473441        1
+989869615214        1
+984006359426        1
Name: count, Length: 999, dtype: int64

In [140]:
# Count the occurrences of unique value in the 'normalized_cell', including NaN's
df.get(['normalized_cell']).value_counts(dropna=False)

normalized_cell  
number_not_parsed    1
+12228885462         1
+12255710233         1
+12265280368         1
+12496163711         1
                    ..
+13117578564         1
+13113932972         1
+13112431012         1
+13089535966         1
+13082701373         1
Name: count, Length: 1000, dtype: int64

In [141]:
from datetime import datetime

[Python Datetime](https://www.w3schools.com/python/python_datetime.asp)

[pandas.to_datetime](https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html)

In [142]:
# Convert data-related columns to datetime object
df["date_of_registration"] = pd.to_datetime(df["date_of_registration"])
df["date_of_birth"] = pd.to_datetime(df["date_of_birth"])
df["extract_time"] = pd.to_datetime(df["extract_time"])

In [143]:
# Extract the year, month, and day from 'date_of_registration' and create separate columns
df["year_of_registration"] = df["date_of_registration"].dt.year
df["month_of_registration"] = df["date_of_registration"].dt.month
df["day_of_registration"] = df["date_of_registration"].dt.day

In [144]:
# Standardize the 'gender' column by replacing "male" with "M" and "female" with "F"
df["gender"] = df["gender"].replace({"male": "M", "female": "F"})

In [145]:
# Store in new columns the lenght's values of the 'login_password' and 'login_username'
df["password_length"] = df["login_password"].apply(len)
df["login_length"] = df["login_username"].apply(len)


It is strongly reccomended to set time stamp for tracking data modifications:

In [146]:
# Add the current timestamp to the new 'transform_timestamp' column
df["transform_timestamp"] = datetime.now()

Here we stop. But even in this simple and plain dataset, there are plenty of opportunities for verifying and converting data, as well as creating new features that can provide deeper insights.

In [147]:
output_path = 'batch1000users.csv'
df.to_csv(output_path)

To save a DataFrame along with its data types and structure, use to_pickle(). This method preserves all metadata, including column data types, making it convenient for reloading the data exactly as it was saved.

In [149]:
output_path_pkl = 'batch1000users.pkl'
df.to_pickle(output_path_pkl)
