In [1]:
# Cell 1: Imports and Seed Initialization
import random
import string
import math
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

# One seed value shared by both generators: the stdlib `random` module drives
# the row generation and NumPy's global generator is seeded alongside it, so
# changing SEED here re-seeds both consistently.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [2]:
# Cell 2: Constants and Data Lists
# NOTE: element order matters. The generator picks from these lists with
# random.choice() under a fixed seed, so reordering entries changes the
# generated dataset.

# Product names an order can reference.
PRODUCTS = [
    "Laptop", "Keyboard", "Monitor", "Mouse", "Charger",
    "Headphones", "USB Cable", "Webcam", "Router", "Phone Case",
    "Power Bank", "SSD", "Speaker", "Smartwatch", "Microphone",
    "Tablet", "Graphic Card", "HDMI Cable", "Printer", "Ink Cartridge",
]

# Cities used for the Region column.
REGIONS = [
    "Mumbai", "Delhi", "Kolkata", "Chennai", "Bengaluru",
    "Hyderabad", "Ahmedabad", "Pune", "Lucknow", "Jaipur",
]

# Possible values of the DeliveryStatus column.
DELIVERY_STATUSES = [
    "Delivered", "Pending", "In Transit", "Out for delivery",
    "Returned", "Cancelled", "Delayed",
]

# Possible values of the PaymentStatus column.
PAYMENT_STATUSES = ["Paid", "Unpaid", "COD", "Refunded"]

# Customer names are built as "<first> <last>" from these two pools.
first_names = [
    "Rahul", "Sita", "Amit", "Priya", "Ramesh", "Anita",
    "Deepak", "Neha", "Sourav", "Meena", "Akhil", "Jaya",
    "Rohit", "Sana", "Vikas", "Pooja", "Karan", "Rita",
    "Kamal", "Suresh", "Imran", "Latika", "Mohan", "Sahil",
    "Geeta", "Arjun", "Ritika", "Sneha", "Vijay",
]
last_names = [
    "Kumar", "Devi", "Sharma", "Singh", "Patel",
    "Khan", "Gupta", "Kaur", "Reddy", "Kapoor",
    "Nair", "Verma", "Bose", "Chowdhury", "Iyer",
]


In [3]:
# Cell 3: Helper Functions - random_date and format_date_mixed
def random_date(start, end):
    """Return `start` moved forward by a random whole number of days.

    The offset is drawn uniformly from 0 up to and including the number of
    whole days in ``end - start``, so both endpoints can be returned. Only the
    ``.days`` part of the difference is used; any time-of-day on `start` is
    carried over unchanged.
    """
    span_days = (end - start).days
    offset = timedelta(days=random.randrange(span_days + 1))
    return start + offset

def format_date_mixed(dt):
    """Render `dt` as text using one randomly chosen date layout.

    Used to make the OrderDate column inconsistent: the same date can come out
    as ISO, day-first, dotted, or with an abbreviated/full month name.
    """
    patterns = (
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%d-%m-%Y",
        "%d %b %Y",
        "%b %d %Y",
        "%Y/%m/%d",
        "%d.%m.%Y",
        "%d %B %Y",
        "%d-%b-%Y",
    )
    return dt.strftime(random.choice(patterns))


In [4]:
# Cell 4: Email Generator and Quantity Word Converter
def make_email(name):
    """Derive an e-mail address from a customer name, sometimes malformed.

    The local part is the lower-cased name with spaces turned into dots and
    every character that is neither alphanumeric nor a dot removed. The domain
    is picked at random from a fixed list.

    Three corruption rolls are always drawn, in this order:
      * 8%: drop the first "." of the address,
      * 4%: drop the "@",
      * 3%: replace ".com" in the domain with "con".
    When the last one fires the address is rebuilt from scratch, so it
    discards the effect of the first two.
    """
    dotted = name.lower().replace(" ", ".")
    local_part = "".join(filter(lambda ch: ch.isalnum() or ch == ".", dotted))
    provider = random.choice(["gmail.com","yahoo.com","hotmail.com","outlook.com","mail.com"])
    address = local_part + "@" + provider

    # Draw all three rolls up front so the random stream is consumed the same
    # way no matter which corruptions end up being applied.
    drop_first_dot = random.random() < 0.08
    drop_at_sign = random.random() < 0.04
    mangle_tld = random.random() < 0.03

    if mangle_tld:
        return local_part + "@" + provider.replace(".com", "con")
    if drop_first_dot:
        address = address.replace(".", "", 1)
    if drop_at_sign:
        address = address.replace("@", "")
    return address

def qty_word(n):
    """Spell out the quantities 1 to 5 in English; anything else becomes ``str(n)``."""
    spelled = dict(enumerate(("one", "two", "three", "four", "five"), start=1))
    try:
        return spelled[n]
    except KeyError:
        return str(n)


In [5]:
# Cell 5: Generate Random Order with Product Pricing and Date
# Reference price per product before the per-order +/-15% jitter is applied.
# Built once instead of on every call. "HDMI Cable" is listed in PRODUCTS but
# had no entry here, so it used to fall through to the 200-10000 random
# fallback and got a wildly different price on every order.
_BASE_PRICES = {
    "Laptop": 45000, "Graphic Card": 25000, "Tablet": 15000, "Monitor": 9000, "Phone Case": 299,
    "Charger": 499, "Headphones": 1500, "USB Cable": 199, "Webcam": 2200, "Router": 3500,
    "Power Bank": 1200, "SSD": 4200, "Keyboard": 800, "Mouse": 500, "Speaker": 1800,
    "Smartwatch": 7000, "Microphone": 3000, "Printer": 8000, "Ink Cartridge": 800,
    "HDMI Cable": 399,
}


def generate_random_order(idx, start_date, end_date):
    """Build one clean (noise-free) synthetic order record.

    Parameters
    ----------
    idx : int
        Sequence number of the order; the OrderID is ``"ORD" + str(1000 + idx)``.
    start_date, end_date : datetime
        Inclusive range passed to ``random_date`` for the order date.

    Returns
    -------
    dict
        Keys: OrderID, CustomerName, Email, Region, Product, Quantity,
        UnitPrice, Total, OrderDate (``YYYY-MM-DD`` string), DeliveryStatus,
        PaymentStatus. ``Total`` is always ``UnitPrice * Quantity`` here;
        inconsistencies are added later by the noise step.
    """
    name = f"{random.choice(first_names)} {random.choice(last_names)}"
    email = make_email(name)
    region = random.choice(REGIONS)
    product = random.choice(PRODUCTS)

    # The fallback price is drawn on every call, even when the product has a
    # listed price, so the seeded random sequence (and with it every other
    # generated field) does not depend on which product was picked.
    fallback_price = random.randint(200, 10000)
    jitter = 1 + random.uniform(-0.15, 0.15)
    unit_price = int(_BASE_PRICES.get(product, fallback_price) * jitter)

    # Quantities range from 1 to 5 and are heavily weighted toward 1.
    qty = random.choices([1,1,1,2,2,3,4,5], weights=[40,40,40,20,10,6,3,2])[0]
    order_date = random_date(start_date, end_date)
    delivery = random.choice(DELIVERY_STATUSES)
    payment = random.choice(PAYMENT_STATUSES)
    total = unit_price * qty
    return {
        "OrderID": f"ORD{1000+idx}",
        "CustomerName": name,
        "Email": email,
        "Region": region,
        "Product": product,
        "Quantity": qty,
        "UnitPrice": unit_price,
        "Total": total,
        "OrderDate": order_date.strftime("%Y-%m-%d"),
        "DeliveryStatus": delivery,
        "PaymentStatus": payment
    }


In [6]:
# Cell 6: Function to Introduce Noise in Data Row
def introduce_noise(row):
    """Return a copy of `row` with randomly injected data-quality problems.

    The input dict is not modified. Each corruption is rolled independently,
    always in this order:

      * 6%  - Email blanked
      * 3%  - CustomerName cut down to its first word
      * 8%  - one character of Region replaced by a random lowercase letter
              (only for names longer than 4 characters)
      * 90% - OrderDate re-rendered in a random layout
      * 3%  - Quantity replaced by its English word
      * 2%  - Quantity made negative, Total recomputed to match
      * 3%  - UnitPrice blanked
      * 6%  - Total scaled by 0.9 / 1.1 / 1.25 / 0.5 (floored, never below 0);
              blanked instead when UnitPrice or Quantity is no longer numeric
      * 5%  - a suffix appended to Product

    The numeric quantity is captured before any corruption, so the "negative"
    step still works on a row whose Quantity was just turned into a word
    (previously that combination raised ``ValueError`` from ``int("two")``).
    """
    r = row.copy()

    # Remember the numeric quantity up front; later steps may overwrite the
    # field with a word such as "two".
    try:
        numeric_qty = int(r["Quantity"])
    except (TypeError, ValueError):
        numeric_qty = None

    if random.random() < 0.06:
        r["Email"] = ""
    if random.random() < 0.03:
        r["CustomerName"] = r["CustomerName"].split()[0]
    if random.random() < 0.08:
        typo = r["Region"]
        if len(typo) > 4:
            i = random.randint(0, len(typo)-1)
            typo = typo[:i] + random.choice(string.ascii_lowercase) + typo[i+1:]
        r["Region"] = typo
    if random.random() < 0.9:
        dt = datetime.strptime(r["OrderDate"], "%Y-%m-%d")
        r["OrderDate"] = format_date_mixed(dt)
    # The roll is on the left of `and` so it is always drawn, keeping the
    # random sequence independent of the row's contents.
    if random.random() < 0.03 and numeric_qty is not None:
        r["Quantity"] = qty_word(numeric_qty)
    if random.random() < 0.02 and numeric_qty is not None:
        # Overrides a word quantity from the previous step, if any.
        r["Quantity"] = -abs(numeric_qty)
        r["Total"] = r["Quantity"] * int(r["UnitPrice"])
    if random.random() < 0.03:
        r["UnitPrice"] = ""
    if random.random() < 0.06:
        change = random.choice([0.9, 1.1, 1.25, 0.5])
        try:
            r["Total"] = int(max(0, math.floor(int(r["UnitPrice"]) * int(r["Quantity"]) * change)))
        except (TypeError, ValueError):
            # UnitPrice was blanked or Quantity is a word: no total can be
            # computed, so leave it empty as one more kind of dirty value.
            r["Total"] = ""
    if random.random() < 0.05:
        r["Product"] = r["Product"] + random.choice([" "," Pro"," Plus"," - refurbished","(used)"])
    return r


In [7]:
# Cell 7: Create Raw Dataframe with Noisy Data and Duplicates
def create_raw_dataframe(n_rows=600, start_date_str="2015-01-31", end_date_str="2024-12-31"):
    """Generate a shuffled DataFrame of noisy synthetic orders with duplicates.

    Parameters
    ----------
    n_rows : int
        Number of distinct orders to generate (before duplicates are added).
    start_date_str, end_date_str : str
        Inclusive order-date range, both in ``YYYY-MM-DD`` form.

    Returns
    -------
    pandas.DataFrame
        ``n_rows`` noisy orders plus exact copies of some of them, shuffled
        with a fixed ``random_state`` and re-indexed from 0. The number of
        copies is ``max(5, n_rows // 50)``, capped at the number of generated
        rows so that small requests do not ask ``random.sample`` for more
        rows than exist (which raised ``ValueError`` for ``n_rows < 5``).
    """
    start_date = datetime.strptime(start_date_str, "%Y-%m-%d")
    end_date = datetime.strptime(end_date_str, "%Y-%m-%d")

    # Generate and corrupt one row at a time so the random stream is consumed
    # in the same interleaved order as a plain loop would.
    rows = [
        introduce_noise(generate_random_order(i + 1, start_date, end_date))
        for i in range(n_rows)
    ]

    n_duplicates = min(len(rows), max(5, n_rows // 50))
    duplicates = [rows[i].copy() for i in random.sample(range(len(rows)), k=n_duplicates)]
    rows.extend(duplicates)

    df = pd.DataFrame(rows)
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    return df


In [8]:
# Cell 8: Generate raw data and save to CSV or display (optional)
# Build the 600-order raw dataset; the injected duplicate rows come on top.
df_raw = create_raw_dataframe(600)
print("Raw data sample:")
display(df_raw.iloc[:10])  # first ten rows of the shuffled frame

# Optional: persist the raw frame so later steps can reload it from disk.
df_raw.to_csv(path_or_buf="raw_order_data.csv", index=False)
print("Raw data saved as 'raw_order_data.csv'")


Raw data sample:


Unnamed: 0,OrderID,CustomerName,Email,Region,Product,Quantity,UnitPrice,Total,OrderDate,DeliveryStatus,PaymentStatus
0,ORD1082,Rahul Kaur,rahul.kaur@hotmail.com,Lucknow,Ink Cartridge,1,889.0,889,13.10.2015,Cancelled,Unpaid
1,ORD1219,Suresh Nair,suresh.nair@gmail.com,Pune,Ink Cartridge,2,785.0,1570,13 Sep 2017,Pending,COD
2,ORD1056,Rita Verma,rita.verma@yahoo.com,Delhi,Speaker,1,1765.0,1765,15/12/2015,Delayed,COD
3,ORD1495,Ramesh Gupta,,Chennai,Keyboard,1,740.0,740,Nov 14 2018,Delayed,Unpaid
4,ORD1265,Vijay Sharma,vijay.sharma@mail.com,Lucknow,Tablet,2,15412.0,15412,19-03-2018,Delivered,Refunded
5,ORD1156,Neha Singh,neha.singh@hotmail.com,Bengaluru,Headphones,1,1645.0,1645,2018-05-06,Out for delivery,COD
6,ORD1484,Meena Iyer,meena.iyer@hotmail.com,Bengaluru,Charger,two,,910,19 Jul 2023,Pending,Paid
7,ORD1322,Geeta Kumar,geeta.kumar@yahoo.com,Lucknow,Phone Case,1,290.0,290,16-Apr-2021,Delayed,COD
8,ORD1583,Imran Khan,imran.khan@yahoo.com,Pune,Microphone,1,2967.0,2967,18 Jul 2022,Out for delivery,Unpaid
9,ORD1292,Latika Singh,latika.singh@hotmail.com,Kolkata,Ink Cartridge,1,767.0,767,27.08.2020,Cancelled,Paid


Raw data saved as 'raw_order_data.csv'
