In [None]:
# # Install the third-party libraries needed for this project
# !pip install pandas
# !pip install glob2

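# Uncomment the install commands above if these libraries are not already installed on this machine.
# Note: the code below imports the standard-library `glob` module, not `glob2`.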

In [1]:
#import libraries
import glob
import os
import xml.etree.ElementTree as et
from datetime import datetime

import pandas as pd

In [2]:
# File paths shared by the whole notebook; both are read as module-level globals.
# log_file: log_progress() appends its timestamped entries to this file.
# target_file: passed to load_data() as the destination of the transformed CSV.
log_file = "log_file.txt" 
target_file = "transformed_data.csv" 

## 1. Extraction

In [3]:
#extract csv files
def extract_from_csv(filepath):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; handed to ``pandas.read_csv`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, using pandas' default parsing options.
    """
    return pd.read_csv(filepath)

In [4]:
#extract json files
def extract_from_json(filepath):
    """Read a JSON Lines file into a DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the file; each line must hold one JSON object because
        the file is read with ``lines=True``.

    Returns
    -------
    pandas.DataFrame
        One row per JSON object in the file.
    """
    return pd.read_json(filepath, lines=True)

In [5]:
#extract xml files
def extract_from_xml(filepath):
    """Parse an XML file of car records into a DataFrame.

    Every direct child of the XML root is read as one record and must
    contain ``car_model``, ``year_of_manufacture``, ``price`` and ``fuel``
    child elements.

    Parameters
    ----------
    filepath : str or path-like
        Location of the XML file to parse.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``car_model``,
        ``year_of_manufacture`` (converted with ``int``), ``price``
        (converted with ``float``) and ``fuel``. A root without children
        yields an empty frame that still has these four columns.

    Raises
    ------
    AttributeError
        If a record lacks one of the four expected child elements.
    ValueError
        If ``year_of_manufacture`` or ``price`` holds non-numeric text.
    """
    columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']
    root = et.parse(filepath).getroot()

    # Collect plain dicts and build the frame once: concatenating a one-row
    # frame per record copies all previous rows each time (quadratic), and
    # concatenating onto an empty frame can degrade the numeric dtypes.
    records = [
        {
            'car_model': car.find('car_model').text,
            'year_of_manufacture': int(car.find('year_of_manufacture').text),
            'price': float(car.find('price').text),
            'fuel': car.find('fuel').text,
        }
        for car in root
    ]
    return pd.DataFrame(records, columns=columns)

In [6]:
#function that calls extraction of file types to one dataframe
def extract():
    """Gather every CSV, JSON and XML file in the working directory into one DataFrame.

    Files are matched with ``*.csv``, ``*.json`` and ``*.xml`` (in that
    order) and read with ``extract_from_csv``, ``extract_from_json`` and
    ``extract_from_xml`` respectively. Within each pattern the paths are
    processed in sorted order so repeated runs produce the same row order.

    The file named by the global ``target_file`` is skipped: the load step
    writes it into the same directory, so without this guard a second run
    would ingest the pipeline's own output as if it were source data.

    Returns
    -------
    pandas.DataFrame
        All extracted rows with a fresh 0..n-1 index. The columns
        ``car_model``, ``year_of_manufacture``, ``price`` and ``fuel`` come
        first (filled with NaN if no input supplied them), followed by any
        additional columns found in the inputs. When no input file exists,
        an empty frame with those four columns is returned.
    """
    columns = ['car_model', 'year_of_manufacture', 'price', 'fuel']
    readers = (
        ("*.csv", extract_from_csv),
        ("*.json", extract_from_json),
        ("*.xml", extract_from_xml),
    )
    # Normalise so "./transformed_data.csv" and "transformed_data.csv" compare equal.
    output_path = os.path.normcase(os.path.abspath(target_file))

    frames = []
    for pattern, reader in readers:
        for path in sorted(glob.glob(pattern)):
            if os.path.normcase(os.path.abspath(path)) == output_path:
                continue  # never re-ingest the file written by the load step
            frames.append(reader(path))

    if not frames:
        return pd.DataFrame(columns=columns)

    # A single concat avoids re-copying the accumulated rows once per file.
    combined = pd.concat(frames, ignore_index=True)
    # Keep the expected columns first, then whatever else the inputs carried.
    ordered = columns + [name for name in combined.columns if name not in columns]
    return combined.reindex(columns=ordered)

## 2. Transformation

In [7]:
def transform(data):
    """Return a copy of ``data`` with ``price`` rounded to 2 decimal places.

    The input frame is left untouched, so the raw extracted values remain
    available to the caller and re-running the step always starts from the
    same input.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``price`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, where ``price`` is rounded to
        two decimals.

    Raises
    ------
    KeyError
        If ``data`` has no ``price`` column.
    """
    # assign() builds a new frame instead of overwriting the caller's column.
    return data.assign(price=data['price'].round(2))

## 3. Loading

In [8]:
def load_data(target_file, transformed_data):
    """Write the transformed frame to a CSV file.

    Parameters
    ----------
    target_file : str or path-like
        Destination of the CSV output.
    transformed_data : pandas.DataFrame
        Frame to persist. ``to_csv`` is called with its defaults, so the
        frame's index is written as the first, unnamed column.
    """
    transformed_data.to_csv(path_or_buf=target_file)

## 4. Logging

In [9]:
def log_progress(message):
    """Append one timestamped entry to the ETL log.

    The entry is written to the file named by the global ``log_file`` as
    ``<timestamp>,<message>`` followed by a newline, where the timestamp is
    formatted as Year-MonthAbbreviation-Day-Hour:Minute:Second in local time.

    Parameters
    ----------
    message : object
        Text to record; non-string values are converted with ``str()``.
    """
    # %b is the documented directive for the abbreviated month name; %h is a
    # C-library extension that Python does not guarantee on every platform.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    # Explicit encoding keeps the log readable regardless of the OS locale.
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(f"{timestamp},{message}\n")

## Testing ETL 

In [10]:
# Run the whole pipeline; every phase is bracketed by a start and an end log entry.
log_progress("ETL Job Started")

# --- Extract: gather all source files into one frame ---
log_progress("Extract phase Started")
extracted_data = extract()
log_progress("Extract phase Ended")

# --- Transform: round prices and show the result ---
log_progress("Transform phase Started")
transformed_data = transform(extracted_data)
print("Transformed Data")
print(transformed_data)
log_progress("Transform phase Ended")

# --- Load: persist the transformed frame to the target CSV ---
log_progress("Load phase Started")
load_data(target_file, transformed_data)
log_progress("Load phase Ended")

# Pipeline finished
log_progress("ETL Job Ended")

Transformed Data
        car_model year_of_manufacture     price    fuel
0            ritz                2014   5000.00  Petrol
1             sx4                2013   7089.55  Diesel
2            ciaz                2017  10820.90  Petrol
3         wagon r                2011   4253.73  Petrol
4           swift                2014   6865.67  Diesel
..            ...                 ...       ...     ...
85          camry                2006   3731.34  Petrol
86   land cruiser                2010  52238.81  Diesel
87  corolla altis                2012   8805.97  Petrol
88     etios liva                2013   5149.25  Petrol
89        etios g                2014   7089.55  Petrol

[90 rows x 4 columns]
