In [84]:
#Step 1: Import packages
import os
import glob
import pandas as pd
import sqlalchemy
from dotenv import load_dotenv
import mysql.connector
import boto3

In [87]:
# Load .env before building the client: boto3 looks up credentials and region
# when the client is created, and this cell sits above the Step 6 cell that
# calls load_dotenv(). Without this, a fresh "Restart & Run All" creates the
# client before any AWS_* variables from .env are present in the environment.
# load_dotenv() does not override variables that are already set, so calling
# it again in Step 6 is harmless.
load_dotenv()
s3 = boto3.client('s3')

In [92]:
# Upload every raw input file (sources 1-3, each in csv/xml/json form) to the
# 'etl-workflows' bucket under the raw_data/ prefix. The local file name is
# reused as the object name, and the upload order matches the original
# one-call-per-file listing (source1 csv, xml, json, then source2, ...).
for source_no in (1, 2, 3):
    for ext in ('csv', 'xml', 'json'):
        filename = f'source{source_no}.{ext}'
        s3.upload_file(f'data/{filename}', 'etl-workflows', f'raw_data/{filename}')

In [71]:
#Step 2: Define function to transform data
def convert_units(df):
    """Return a copy of ``df`` with height and weight converted to metric.

    The ``height`` column is multiplied by 0.0254 (the inch-to-meter factor)
    and the ``weight`` column by 0.453592 (the pound-to-kilogram factor).
    NOTE(review): this presumes the inputs are in inches and pounds -- confirm
    against the source files.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric ``height`` and ``weight`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two columns rescaled; all other columns are
        carried over unchanged.

    Raises
    ------
    KeyError
        If ``height`` or ``weight`` is missing from ``df``.
    """
    # Work on a copy: mutating the caller's frame in place means re-running
    # the calling cell would apply the conversion a second time to values
    # that are already metric.
    converted = df.copy()
    converted['height'] = converted['height'] * 0.0254  # Convert height to meters
    converted['weight'] = converted['weight'] * 0.453592  # Convert weight to kilograms
    return converted


In [72]:
#Step 4: Read the csv / xml / json inputs and stack them into one DataFrame
# NOTE(review): only the source1.* files are read here, although source2.* and
# source3.* are also uploaded to S3 above -- confirm this is intentional.
user = [
    pd.read_csv('data/source1.csv'),
    pd.read_xml('data/source1.xml'),
    pd.read_json('data/source1.json', lines=True),  # one JSON record per line
]

# Keep `df` bound to the last frame read, as the original cell left it.
df = user[-1]

# ignore_index=True renumbers the rows 0..n-1 across the three inputs.
combined_df = pd.concat(user, ignore_index=True)


In [73]:
#Step 5: Apply the unit conversion to the combined data and write it to disk
transformed_df = combined_df.pipe(convert_units)
transformed_df.to_csv(path_or_buf='data/transformed.csv', index=False)  # no index column in the file

In [85]:
#Step 6: Load settings from the .env file (to avoid hard-coding credentials)
load_dotenv()

# Database connection settings; each is None when the variable is not set.
db_host = os.environ.get("DB_HOST")
db_user = os.environ.get("DB_USER")
db_password = os.environ.get("DB_PASSWORD")
db_port = os.environ.get("DB_PORT")
db_name = os.environ.get("DB_NAME")

# AWS settings read from the same environment.
access_key = os.environ.get("AWS_ACCESS_KEY_ID")
secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
default_region = os.environ.get("AWS_DEFAULT_REGION")

In [75]:
# Step 7: Open a MySQL connection to the RDS host and get a cursor for DDL
# No database is selected here; that happens in a later step.
client = mysql.connector.connect(
    host=db_host,
    port=db_port,
    user=db_user,
    password=db_password,
)
mycursor = client.cursor()

In [76]:
# Step 9: Make sure the etldb1 database exists, then select it for this session
for ddl in ("CREATE DATABASE IF NOT EXISTS etldb1;", "use etldb1"):
    mycursor.execute(ddl)
client.commit()

In [77]:
# Step 10: Create the users table if it is not already there
# Adjacent string literals are concatenated into the single DDL statement.
mycursor.execute(
    "CREATE TABLE IF NOT EXISTS users "
    "(Name VARCHAR(255), height FLOAT, weight FLOAT);"
)
client.commit()

In [80]:
# Step 11: Create the engine for SQLAlchemy
# Build the URL from its components instead of an f-string: URL.create escapes
# the user name and password, so characters such as '@', ':' or '/' in the
# credentials no longer corrupt the connection string.
engine_url = sqlalchemy.engine.URL.create(
    drivername="mysql+mysqlconnector",
    username=db_user,
    password=db_password,
    host=db_host,
    # os.getenv returns a string (or None when DB_PORT is unset); with None
    # the URL simply omits the port.
    port=int(db_port) if db_port else None,
    database="etldb1",
)
engine = sqlalchemy.create_engine(engine_url)

In [81]:
# Step 12: (skipped) No need to re-read the CSV from disk -- the transformed
# data is already in memory as transformed_df from Step 5.

# Step 13: Append the transformed rows to the users table
# if_exists='append' adds to the existing table, so each run of this cell
# inserts the rows again.
transformed_df.to_sql(name="users", con=engine, if_exists='append', index=False)

13

In [82]:
# Step 14: Read the users table back through the engine to check the load
print(pd.read_sql(sql="SELECT * FROM users;", con=engine))

     Name   height   weight
0    alex  1.67081  51.2514
1    ajay  1.81661  61.9108
2   alice  1.76276  69.4132
3    ravi  1.73279  64.5643
4     joe  1.72187  65.4533
5   simon  1.72466  50.9701
6   jacob  1.69621  54.7349
7   cindy  1.68885  57.8103
8    ivan  1.71755  51.7730
9    jack  1.74498  55.9279
10    tom  1.77292  64.1787
11  tracy  1.77825  61.8972
12   john  1.72466  50.9701


In [90]:
# Publish the transformed CSV to the bucket under transformed_data/.
s3.upload_file(
    Filename='data/transformed.csv',
    Bucket='etl-workflows',
    Key='transformed_data/transformed.csv',
)