#### Lab2 - Data Collection and Pre-processing - Manu Mathew - 8990691

##### Adding the required imports 

In [14]:
import pandas as pd
from dataclasses import dataclass
from dataclasses import field
import re
from typing import Optional

##### Load raw CSV, display first 3 rows

In [15]:
# Read the raw grocery transactions CSV and preview its first three rows.
raw_csv_path = "data/Grocery_Transactions.csv"
df = pd.read_csv(raw_csv_path)
print(df.head(n=3))

         date  customer_id   product  product_id  price  quantity  \
0  2025-05-26         3000    Apples           1  13.23         3   
1  2025-05-04         3001      Milk           4  27.94         1   
2  2025-05-19         3002  Tomatoes           9  13.72         6   

  shipping_city  coupon_id  
0        Ottawa          1  
1     Vancouver          4  
2        Ottawa          5  


##### Justify dict vs namedtuple vs class
- Dictionary: Mutable collection of key-value pairs (insertion-ordered since Python 3.7).
- NamedTuple: Similar to a tuple, but it is an immutable object with named fields.
- Class: A blueprint for creating objects with attributes and behaviors.

I have chosen a class so that I can keep the implementations of the ingest, wrangle, clean, transform, feature-engineering and serialization methods alongside the data, which makes the codebase more scalable.

##### Implement the Transaction class and use it to populate an object-oriented data structure

In [16]:
@dataclass
class Transaction:
    """One grocery transaction record plus the values derived from it.

    The first eight fields are supplied by the caller when the record is
    built; ``discount_percentage`` and ``finalPrice`` default to ``0.0``
    until they are assigned later.
    """

    date: str
    customer_id: int  # numeric in the raw data (e.g. 3000)
    product: str
    product_id: int  # numeric in the raw data (e.g. 1)
    price: float
    quantity: int
    shipping_city: str
    coupon_id: int
    discount_percentage: float = field(default=0.0)
    finalPrice: float = field(default=0.0)

    def clean_dataset(self) -> None:
        """Coerce ``price`` to a non-negative float in place.

        Values that cannot be converted with ``float()`` (for example
        ``"N/A"``, ``"?"`` or ``None``), negative values and NaN are all
        replaced by ``0.0``.
        """
        try:
            price = float(self.price)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures mark a dirty value. A bare ``except``
            # would also swallow KeyboardInterrupt/SystemExit.
            price = 0.0
        # ``price >= 0`` is False for negatives *and* for NaN (every
        # comparison with NaN is False), so both fall back to 0.0.
        self.price = price if price >= 0 else 0.0


#### Create a method load_transactions() which takes the dataset path and returns a list of Transaction objects

In [6]:
def load_transactions(path: str) -> list[Transaction]:
    """Read the CSV at ``path`` and build one ``Transaction`` per row.

    The resulting list is printed before it is returned.
    """
    rows = pd.read_csv(path).to_dict(orient="records")
    transactions = [
        Transaction(
            date=row["date"],
            customer_id=row["customer_id"],
            product=row["product"],
            product_id=row["product_id"],
            price=row["price"],
            quantity=row["quantity"],
            shipping_city=row["shipping_city"],
            coupon_id=row["coupon_id"],
        )
        for row in rows
    ]
    print(transactions)
    return transactions




#### Create a method called quick_profiling() which calculates the min price, mean price, max price and unique city count. This method takes the list of transactions as input.

In [7]:
def quick_profiling(transactionList: list) -> None:
    """Print min/mean/max of the numeric prices and the unique city count.

    Only prices that are already ``int`` or ``float`` are profiled; any other
    value (such as a string placeholder) is skipped. When no numeric price is
    available, the three price statistics are reported as ``N/A`` instead of
    raising on the empty list.

    Args:
        transactionList: Objects exposing ``price`` and ``shipping_city``.
    """
    prices = [
        float(tx.price)
        for tx in transactionList
        if isinstance(tx.price, (int, float))
    ]
    cities = {tx.shipping_city for tx in transactionList}

    if prices:
        print(f"  - Min: {min(prices)}")
        print(f"  - Mean: {sum(prices)/len(prices):.2f}")
        print(f"  - Max: {max(prices)}")
    else:
        # min()/max() raise ValueError and the mean divides by zero when
        # there is nothing to aggregate.
        print("  - Min: N/A")
        print("  - Mean: N/A")
        print("  - Max: N/A")
    print(f"Unique Shipping Cities Count: {len(cities)}\n")



#### Create a method to insert dirty data into the dataset

In [8]:
def inject_messy(transactionList: list) -> None:
    """Overwrite the first three prices with non-numeric placeholders.

    When the list holds at least three items, their prices become ``"N/A"``,
    ``"?"`` and ``"?"``. Shorter lists are left untouched, and the printed
    message says so instead of claiming values were injected. The list is
    printed afterwards in both cases.

    Args:
        transactionList: Objects with a writable ``price`` attribute.
    """
    dirty_values = ("N/A", "?", "?")
    if len(transactionList) >= len(dirty_values):
        # zip stops after the three placeholders, so only the first three
        # items are modified.
        for tx, dirty in zip(transactionList, dirty_values):
            tx.price = dirty
        print("Injected dirty values to three transaction objects.\n")
    else:
        print("Fewer than three transactions; no dirty values injected.\n")
    print(transactionList)


#### Create a method to clean the dataset and count the unclean records before and after cleaning

In [9]:
def frequencyOfMessyOrCleanDataset(tx: list):
    """Return how many items in ``tx`` have a ``price`` that is not a ``float``."""
    return sum(1 for item in tx if not isinstance(item.price, float))


def clean_all(transactionList: list) -> None:
    """Clean every item's price and print the messy counts.

    The number of non-float prices is measured before and after calling
    ``clean_dataset()`` on each item, and both counts are printed.
    """
    count_messy = frequencyOfMessyOrCleanDataset
    before = count_messy(transactionList)
    for transaction in transactionList:
        transaction.clean_dataset()
    after = count_messy(transactionList)
    print(f"Cleaned transactions — before: {before}, after: {after}\n")


##### Create a method which calculates the discounts based on the coupon code

In [None]:
def feature_engineering(transactionsList: list) -> None:
    """Compute each transaction's discounted total in place.

    Sets ``finalPrice`` to ``price * quantity * (1 - discount_percentage)``.
    The result is written to ``finalPrice``, the field declared on
    ``Transaction``. Writing to ``final_price`` would only create an
    undeclared attribute and leave ``finalPrice`` at its default.

    Args:
        transactionsList: Objects exposing ``price``, ``quantity`` and
            ``discount_percentage`` and a writable ``finalPrice``.
    """
    for transaction in transactionsList:
        gross = transaction.price * transaction.quantity
        transaction.finalPrice = gross * (1 - transaction.discount_percentage)


In [11]:
def load_coupons(path: str) -> dict[int, str]:
    """Read the coupons CSV at ``path`` into a ``coupon_id -> coupon_code`` map."""
    coupons = pd.read_csv(path)
    ids = coupons["coupon_id"]
    codes = coupons["coupon_code"]
    return {coupon_id: code for coupon_id, code in zip(ids, codes)}

def transform_transactions(transactions: list, coupon_map: dict[int, str]) -> None:
    """Derive ``discount_percentage`` for each transaction from its coupon code.

    The first run of digits found in the coupon code is read as a percentage
    and stored as a fraction (a code containing ``10`` yields ``0.1``). The
    discount is ``0.0`` when the coupon id is not in ``coupon_map``, when the
    code contains no digits, or when the code is not a string.

    Args:
        transactions: Objects exposing ``coupon_id`` and a writable
            ``discount_percentage``.
        coupon_map: Mapping from coupon id to coupon code.
    """
    first_number = re.compile(r"(\d+)")
    for t in transactions:
        code = coupon_map.get(t.coupon_id)
        # A blank CSV cell loaded through pandas arrives as NaN, a truthy
        # float; searching it would raise TypeError, so only strings are
        # parsed.
        match = first_number.search(code) if isinstance(code, str) else None
        t.discount_percentage = int(match.group(1)) / 100 if match else 0.0
    print("Transformation applied on coupon_id and generated a discount_percentage.\n")

##### Create a method for feature engineering and calculate the final price based on the discounts

In [None]:
def featureEngineering(transactions: list) -> None:
    """Set ``finalPrice`` on every transaction and print the results.

    ``finalPrice`` is ``price * quantity * (1 - discount_percentage)``. One
    line with the final price and its inputs is printed per transaction,
    followed by the whole list.
    """
    for tx in transactions:
        price, quantity, discount = tx.price, tx.quantity, tx.discount_percentage
        tx.finalPrice = price * quantity * (1 - discount)
        print(tx.finalPrice, price, quantity, discount)
    print(transactions)

#### Main method

In [13]:
if __name__ == "__main__":
    # Run the pipeline end to end. Every step works on the same list of
    # transactions, so the call order below matters.
    dataset_path = "data/Grocery_Transactions.csv"
    second_dataset_path = "data/Grocery_Coupons.csv"
    # Load the transactions and profile the prices as loaded.
    listOfTransactions = load_transactions(dataset_path)
    quick_profiling(listOfTransactions)
    # Inject dirty prices, then clean them again.
    inject_messy(listOfTransactions)
    clean_all(listOfTransactions)
    # Discounts must be assigned before the final price is computed from them.
    coupon_dict=load_coupons(second_dataset_path)
    transform_transactions(listOfTransactions,coupon_dict)
    featureEngineering(listOfTransactions)


[Transaction(date='2025-05-26', customer_id=3000, product='Apples', product_id=1, price=13.23, quantity=3, shipping_city='Ottawa', coupon_id=1, discount_percentage=0.0, finalPrice=0.0), Transaction(date='2025-05-04', customer_id=3001, product='Milk', product_id=4, price=27.94, quantity=1, shipping_city='Vancouver', coupon_id=4, discount_percentage=0.0, finalPrice=0.0), Transaction(date='2025-05-19', customer_id=3002, product='Tomatoes', product_id=9, price=13.72, quantity=6, shipping_city='Ottawa', coupon_id=5, discount_percentage=0.0, finalPrice=0.0), Transaction(date='2025-05-16', customer_id=3003, product='Tomatoes', product_id=9, price=44.79, quantity=4, shipping_city='Halifax', coupon_id=2, discount_percentage=0.0, finalPrice=0.0), Transaction(date='2025-05-07', customer_id=3004, product='Tomatoes', product_id=9, price=2.21, quantity=1, shipping_city='Winnipeg', coupon_id=3, discount_percentage=0.0, finalPrice=0.0), Transaction(date='2025-05-04', customer_id=3005, product='Orange 