### This notebook ingests data from Google Cloud Storage into a Databricks table.
- The source data is a transactional CSV file stored in a publicly available Google Cloud Storage (GCS) bucket.
- The dataset consists of order transactions, including user details, item details, and purchase information.
- Source data: https://storage.googleapis.com/gyg-store/transaction_data.csv


In [0]:
# Importing the required libraries and functions.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp

In [0]:
# Load the transaction CSV from the public GCS bucket into a pandas DataFrame.
# pandas fetches the HTTPS URL directly, so no GCS client or credentials are used here.
url = "https://storage.googleapis.com/gyg-store/transaction_data.csv"
df = pd.read_csv(filepath_or_buffer=url)

In [0]:
# Hand the pandas data over to Spark.
df_spark = spark.createDataFrame(df)

# Target Spark SQL type for every column that needs an explicit cast.
# Columns not listed here keep the type Spark inferred from pandas.
cast_targets = {
    "TransactionId": "int",
    "NumberOfItemsPurchased": "int",
    "ItemCode": "int",
    "UserId": "int",
    "CostPerItem": "decimal(18,2)",
}

# Apply the casts one column at a time; replacing a column under its own
# name keeps it at its original position in the DataFrame.
df_schema = df_spark
for column_name, spark_type in cast_targets.items():
    df_schema = df_schema.withColumn(column_name, col(column_name).cast(spark_type))

# Stamp each row with the load timestamp (appended as the last column).
df_schema = df_schema.withColumn("load_date", current_timestamp())

root
 |-- UserId: integer (nullable = true)
 |-- TransactionId: integer (nullable = true)
 |-- TransactionTime: string (nullable = true)
 |-- ItemCode: integer (nullable = true)
 |-- ItemDescription: string (nullable = true)
 |-- NumberOfItemsPurchased: integer (nullable = true)
 |-- CostPerItem: decimal(18,2) (nullable = true)
 |-- Country: string (nullable = true)
 |-- load_date: timestamp (nullable = false)



In [0]:
# Destination of the load: the staging schema and the table within it.
schema = "stg_store"
table_name = "stg_orders"

# Overwrite the staging table with the prepared DataFrame, stored as Delta.
df_schema.write.saveAsTable(
    f"{schema}.{table_name}",
    format="delta",
    mode="overwrite",
)