# Import Libraries

In [None]:
import pandas as pd

# Import Dataset

In [None]:
# Read the csv file into a pandas dataframe
df = pd.read_csv("https://raw.githubusercontent.com/uOttawa-Collabs/CSI4142-Winter-2024-Project/master/shopping_trends.csv")

In [None]:
df.head()

In [None]:
df.dtypes

# Data Cleaning

## Normalize Data Types

In [None]:
# Convert the Gender to type string
df["Gender"] = df["Gender"].astype("string")

In [None]:
# Convert the Item Purchased to type string
df["Item Purchased"] = df["Item Purchased"].astype("string")

In [None]:
# Convert the Category to type string
df["Category"] = df["Category"].astype("string")

In [None]:
# Convert the Location to type string
df["Location"] = df["Location"].astype("string")

In [None]:
# Convert the Size to type string
df["Size"] = df["Size"].astype("string")

In [None]:
# Convert the Color to type string
df["Color"] = df["Color"].astype("string")

In [None]:
# Convert the Season to type string
df["Season"] = df["Season"].astype("string")

In [None]:
# Change the Subscription Status to bool type
df["Subscription Status"] = df["Subscription Status"] == "Yes"

In [None]:
# Convert the Shipping Type to type string
df["Shipping Type"] = df["Shipping Type"].astype("string")

In [None]:
# Change the Discount Applied to bool type
df["Discount Applied"] = df["Discount Applied"] == "Yes"

In [None]:
# Change the Promo Code Used to bool type
df["Promo Code Used"] = df["Promo Code Used"] == "Yes"

In [None]:
# Convert the Frequency of Purchases to type string
df["Frequency of Purchases"] = df["Frequency of Purchases"].astype("string")

In [None]:
df.dtypes

## Drop Unrequired Columns

In [None]:
# Drop the Payment Method and Preferred Payment Method columns in one call
df = df.drop(columns=["Payment Method", "Preferred Payment Method"])

In [None]:
df.head()

## Normalize Fixed Point Numbers

In [None]:
# Scale the numeric columns by a fixed factor: x100 for the two purchase
# columns and x10 for the review rating.
df["Purchase Amount (USD)"] = df["Purchase Amount (USD)"] * 100
# NOTE(review): "Previous Purchases" reads like a count rather than a decimal
# amount - confirm that scaling it by 100 is intended.
df["Previous Purchases"] = df["Previous Purchases"] * 100
# Round before casting: astype(int) truncates toward zero, so a floating-point
# product that lands just below the intended whole number would lose one unit.
df["Review Rating"] = (df["Review Rating"] * 10).round().astype(int)

In [None]:
df.head()

## Generate Age Group

In [None]:
# Validate the whole column up front: every age must lie within 18..70
ages = df["Age"]
if ((ages < 18) | (ages > 70)).any():
    raise ValueError("Age exceeded valid range")

# Bucket the ages: 18-20 -> 1, 21-25 -> 2, ..., 66-70 -> 11
age_groups = ((ages - 1) // 5 - 2).tolist()

df["Age Group"] = age_groups

In [None]:
df.head()

In [None]:
df.dtypes

# Check data quality

i.e. null values, duplicates, outliers

In [None]:
# Check if there are null values
df.isnull().sum()

In [None]:
# Check if there are duplicates
df.duplicated().sum()

In [None]:
# Check data types
df.dtypes

In [None]:
# Data Profiling - Get summary statistics of the dataset, to know if there are any outliers
df.describe()

# Generate Tables

## Customer Dimension

In [None]:
# Customer dimension: a plain column selection, one row per source record
customer_columns = [
    "Customer ID",
    "Age",
    "Gender",
    "Subscription Status",
    "Previous Purchases",
    "Frequency of Purchases",
]
df_customer = df[customer_columns]
df_customer.head()

## Shipping Type Dimension

In [None]:
# Distinct shipping types, sorted so the generated IDs are reproducible
list_shipping_type = sorted(df["Shipping Type"].unique().tolist())

# IDs are 1-based positions in the sorted list
df_shipping_type = pd.DataFrame(
    list(enumerate(list_shipping_type, start=1)),
    columns=["Shipping Type ID", "Shipping Type"],
)
df_shipping_type

## Product Dimension

In [None]:
# Product dimension: one row per distinct (item, category, size, color)
product_columns = ["Item Purchased", "Category", "Size", "Color"]
df_product = df[product_columns].drop_duplicates()
df_product = df_product.reset_index(drop=True)

# IDs are 1-based row positions after de-duplication
product_ids = range(1, len(df_product) + 1)
df_product.insert(0, "Product ID", product_ids)
df_product.head()

## Location Dimension

In [None]:
# Distinct locations, sorted so the generated IDs are reproducible
list_location = sorted(df["Location"].unique().tolist())

# IDs are 1-based positions in the sorted list
df_location = pd.DataFrame(
    list(enumerate(list_location, start=1)),
    columns=["Location ID", "Location"],
)
df_location.head()

## Age Group Dimension

In [None]:
# Range of age groups present in the data
age_group_min = df["Age Group"].min()
age_group_max = df["Age Group"].max()

# Group IDs start at 1 (ages 18-20); anything lower is invalid
if age_group_min < 1:
    raise ValueError("Invalid age group")

# Every ID between the smallest and largest group, including groups that
# happen to have no rows
age_group_ids = list(range(age_group_min, age_group_max + 1))

# Calculate total purchase amount per age group; groups without any rows
# are reported with a total of 0
total_purchase_amount_map = (
    df.groupby("Age Group")["Purchase Amount (USD)"]
    .sum()
    .reindex(age_group_ids, fill_value=0)
    .tolist()
)

# Construct intervals directly from the group IDs so the labels always line
# up with the IDs. Group g holds ages 5g + 11 .. 5g + 15, except that group 1
# starts at the minimum valid age of 18. Deriving the labels from the maximum
# age with range(21, max_age, 5) dropped the last interval whenever the
# maximum age was exactly the lower bound of its group.
age_group_intervals = [
    f"[{max(18, 5 * group + 11)}, {5 * group + 15}]"
    for group in age_group_ids
]

# Construct dataframe
df_age_group = pd.DataFrame({
    "Age Group ID": age_group_ids,
    "Interval": age_group_intervals,
    "Total Purchase Amount": total_purchase_amount_map
})
df_age_group

## Season Dimension

In [None]:
# Seasons are a fixed, hand-written list, so the IDs do not depend on the data
list_season = ["Spring", "Summer", "Fall", "Winter"]

# IDs are 1-based positions in the list above
df_season = pd.DataFrame(
    list(enumerate(list_season, start=1)),
    columns=["Season ID", "Season"],
)
df_season.head()

## Fact Table

In [None]:
# Reverse lookup maps: dimension value (or value tuple) -> dimension row ID
product_key_columns = ["Item Purchased", "Category", "Size", "Color"]

rlookup_shipping_type = dict(zip(
    df_shipping_type["Shipping Type"],
    df_shipping_type["Shipping Type ID"],
))
rlookup_product = dict(zip(
    df_product[product_key_columns].itertuples(index=False, name=None),
    df_product["Product ID"],
))
rlookup_location = dict(zip(
    df_location["Location"],
    df_location["Location ID"],
))
rlookup_season = dict(zip(
    df_season["Season"],
    df_season["Season ID"],
))

# Translate every source row into dimension IDs; a value missing from a
# lookup map yields None rather than raising
list_fact_shipping_type_id = list(map(rlookup_shipping_type.get, df["Shipping Type"]))
list_fact_product_id = [
    rlookup_product.get(product_key)
    for product_key in df[product_key_columns].itertuples(index=False, name=None)
]
list_fact_location_id = list(map(rlookup_location.get, df["Location"]))
list_fact_season_id = list(map(rlookup_season.get, df["Season"]))

# Build fact table
fact_columns = {
    # Customer IDs are unique, no need for any processing
    "Customer ID": df["Customer ID"],
    "Shipping Type ID": list_fact_shipping_type_id,
    "Product ID": list_fact_product_id,
    "Location ID": list_fact_location_id,
    "Season ID": list_fact_season_id,
    "Age Group ID": df["Age Group"],
    "Purchase Amount (USD)": df["Purchase Amount (USD)"],
    "Review Rating": df["Review Rating"],
    "Discount Applied": df["Discount Applied"],
    "Promo Code Used": df["Promo Code Used"],
}
df_fact = pd.DataFrame(fact_columns)
df_fact.head()

# Write Processed Data

In [None]:
# Output switch: True writes the tables to CSV files, False loads them into
# the database
config_write_to_csv = False

# File holding the database connection string, which has the form:
# postgresql://username:password@host:port/database
config_database_connection_string_filename = "connection.txt"
# File holding the SQL that is executed once all tables have been loaded
# (e.g. primary and foreign key constraints)
config_database_query_add_constraints_filename = "constraints.sql"

In [None]:
import re


def convert_column_name(name):
    """Turn a column header into a lower-case, underscore-separated name.

    Characters that are neither word characters nor spaces are removed,
    surrounding whitespace is trimmed, the text is lower-cased and every
    remaining space is replaced by an underscore.
    """
    cleaned = re.sub(r"[^\w\ ]", "", name)
    return cleaned.strip().lower().replace(" ", "_")

In [None]:
def write_to_csv(name_dataframe_pairs):
    """Write each dataframe to ``<name>.out.csv`` without its index.

    ``name_dataframe_pairs`` is an iterable of ``(name, dataframe)`` pairs.
    """
    for pair in name_dataframe_pairs:
        table_name, table = pair
        output_path = f"{table_name}.out.csv"
        table.to_csv(output_path, index=False)

In [None]:
import sqlalchemy
import csv
import io


def get_connection_string():
    """Return the contents of the configured connection-string file.

    Leading and trailing whitespace (such as a final newline) is removed.
    """
    with open(config_database_connection_string_filename) as connection_file:
        contents = connection_file.read()
    return contents.strip()


def _quote_identifier(identifier):
    """Return *identifier* double-quoted, with embedded double quotes doubled."""
    return '"{}"'.format(str(identifier).replace('"', '""'))


def postgres_copy_method(table, conn, keys, data_iter, pre_truncate=False, fatal_failure=False):
    """Bulk-load rows with ``COPY ... FROM STDIN``; used as ``to_sql(method=...)``.

    Args:
        table: Object exposing ``schema`` and ``name`` of the target table.
        conn: Connection wrapper whose ``connection`` attribute provides
            ``cursor()``; the cursor must support ``copy_expert``.
        keys: Column names, in the same order as the values of each row.
        data_iter: Iterable of rows to load.
        pre_truncate: Accepted for compatibility; currently ignored.
        fatal_failure: Accepted for compatibility; currently ignored.

    Returns:
        The cursor's ``rowcount`` after the copy.
    """
    # Serialise the rows into an in-memory CSV document
    string_io = io.StringIO()
    writer = csv.writer(string_io, quoting=csv.QUOTE_MINIMAL)
    writer.writerows(data_iter)
    string_io.seek(0)

    # Quote every identifier so reserved words, mixed case and embedded
    # quotes cannot break the statement
    columns = ", ".join(_quote_identifier(key) for key in keys)
    if table.schema:
        table_name = f"{_quote_identifier(table.schema)}.{_quote_identifier(table.name)}"
    else:
        table_name = _quote_identifier(table.name)

    # Bulk load
    sql_query = f"COPY {table_name} ({columns}) FROM STDIN WITH CSV"

    cursor = conn.connection.cursor()
    try:
        cursor.copy_expert(sql=sql_query, file=string_io)
        # Read rowcount before the cursor is closed
        return cursor.rowcount
    finally:
        # Release the cursor even when the copy fails
        cursor.close()


def write_to_database(name_dataframe_pairs):
    """Load every dataframe into the database, then apply the constraint script.

    Args:
        name_dataframe_pairs: Iterable of ``(table_name, dataframe)`` pairs.
            Each dataframe replaces any existing table of the same name in
            the ``public`` schema.
    """
    engine = sqlalchemy.create_engine(get_connection_string())
    try:
        # Copy dataframe to database
        for (name, dataframe) in name_dataframe_pairs:
            dataframe.to_sql(
                name,
                engine,
                schema="public",
                index=False,
                if_exists="replace",
                method=postgres_copy_method
            )
        # Add constraints (e.g. PK and FK)
        with open(config_database_query_add_constraints_filename) as f:
            query_add_constraints = f.read()
        # engine.begin() commits on success and rolls back on error.
        # SQLAlchemy 2.x rejects plain SQL strings, so wrap the script in
        # text(). text() treats ":name" as a bind parameter, so a literal
        # colon in the script must be escaped as "\:".
        with engine.begin() as connection:
            connection.execute(sqlalchemy.text(query_add_constraints))
    finally:
        # Release pooled connections even when loading fails
        engine.dispose()

In [None]:
# Work on copies so that renaming the columns leaves the dataframes built
# earlier untouched
name_dataframe_pairs = [
    ("customer", df_customer.copy()),
    ("shipping_type", df_shipping_type.copy()),
    ("product", df_product.copy()),
    ("location", df_location.copy()),
    ("age_group", df_age_group.copy()),
    ("season", df_season.copy()),
    ("fact", df_fact.copy()),
]

# Normalise every column header of every output table
for _, output_frame in name_dataframe_pairs:
    output_frame.rename(
        columns=lambda header: convert_column_name(str(header)),
        inplace=True,
    )

# Pick the output target according to the configuration flag
write_output = write_to_csv if config_write_to_csv else write_to_database
write_output(name_dataframe_pairs)