<a href="https://colab.research.google.com/github/yaswanthkillampalli/cyber-threat-prediction/blob/main/data_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Installing faker**

In [None]:
!pip install faker



**Importing necessary modules**

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

**Initialise Faker**

In [None]:
# Fix every random source used by this notebook so that a fresh
# "Restart & Run All" regenerates the same dataset.
SEED = 42
random.seed(SEED)     # stdlib `random`: IDs, prices, quantities, statuses, ...
np.random.seed(SEED)  # NumPy global state, used by DataFrame.sample when no random_state is given
Faker.seed(SEED)      # shared seed for all Faker generators

# Faker instance used to generate the order timestamps.
# NOTE: date ranges expressed relative to "now" still move with the wall clock,
# so timestamps are only reproducible relative to the time of the run.
fake = Faker()

**Number of records to generate**

In [None]:
# Number of synthetic order rows to generate (before any duplicates are added).
num_records = 100_000

**Lists for categorical data**

In [None]:
# Possible lifecycle states of an order.
order_statuses = [
    "Delivered",
    "Shipped",
    "Processing",
    "Cancelled",
    "Returned",
]

# Categories a product can belong to.
product_categories = [
    "Electronics",
    "Clothing",
    "Home & Kitchen",
    "Books",
    "Sports",
    "Beauty",
    "Toys",
]

# Payment methods. The list contains lower-case variants of two entries
# ("paypal", "credit card") -- presumably deliberate, to give the dataset
# inconsistent labels; confirm before normalising.
payment_methods = [
    "Credit Card",
    "PayPal",
    "Debit Card",
    "Gift Card",
    "Bank Transfer",
    "paypal",
    "credit card",
]

# Available shipping speeds.
shipping_methods = [
    "Standard",
    "Express",
    "Next-Day",
]

# Reasons attached to cancelled orders; None stands for a missing reason.
cancellation_reasons = [
    "Out of stock",
    "Customer request",
    "Payment failed",
    "Fraud detected",
    None,
]

**Generate product data**

In [None]:
# Build the product catalogue: a dict mapping product ID -> {"category", "price"}.
#
# The numeric parts of the IDs are drawn WITHOUT replacement. Drawing each ID
# independently can produce the same ID twice, in which case the later entry
# silently overwrites the earlier one and the catalogue ends up with fewer
# than 200 products.
products = {}
for id_number in random.sample(range(1000, 10000), 200):
    product_id = f"PID{id_number}"
    products[product_id] = {
        "category": random.choice(product_categories),
        # Unit price between 10 and 500, rounded to 2 decimals.
        "price": round(random.uniform(10, 500), 2)
    }

**Generate order records**

In [None]:
# Generate `num_records` synthetic orders as a list of dicts (one dict per row).
data = []

# Materialise the product IDs once: the catalogue does not change inside the
# loop, so rebuilding this list on every iteration is wasted work.
product_ids = list(products)

for _ in range(num_records):
    # NOTE(review): order and customer IDs are drawn independently, so the same
    # OrderID can appear on unrelated rows -- confirm whether that is intended.
    order_id = f"ORD{random.randint(100000, 999999)}"
    customer_id = f"CUS{random.randint(1000, 9999)}"

    # Order timestamp within the last two years, split into date and time columns.
    order_date_dt = fake.date_time_between(start_date='-2y', end_date='now')
    order_date = order_date_dt.strftime('%Y-%m-%d')
    order_time = order_date_dt.strftime('%H:%M:%S')

    # Pick a product and derive the order total from its unit price.
    product_id = random.choice(product_ids)
    product_info = products[product_id]
    quantity = random.randint(1, 5)
    total_amount = round(product_info["price"] * quantity, 2)

    order_status = random.choice(order_statuses)
    # Cancelled orders carry no payment method.
    payment_method = random.choice(payment_methods) if order_status != "Cancelled" else None

    # Only shipped/delivered orders get a shipping date, 1-3 days after the order.
    # NOTE(review): for very recent orders this date can lie after today.
    if order_status in ["Shipped", "Delivered"]:
        shipping_date = (order_date_dt + timedelta(days=random.randint(1, 3))).strftime('%Y-%m-%d')
    else:
        shipping_date = None

    # Only cancelled orders get a reason (which may itself be None, i.e. missing).
    cancellation_reason = random.choice(cancellation_reasons) if order_status == "Cancelled" else None

    # Only delivered orders are rated; every other status gets NaN.
    customer_rating = round(random.uniform(1.0, 5.0), 1) if order_status == "Delivered" else np.nan

    data.append({
        "OrderID": order_id,
        "CustomerID": customer_id,
        "ProductID": product_id,
        "OrderDate": order_date,
        "OrderTime": order_time,
        "Quantity": quantity,
        "ProductPrice": product_info["price"],
        "TotalAmount": total_amount,
        "OrderStatus": order_status,
        "PaymentMethod": payment_method,
        "ShippingMethod": random.choice(shipping_methods),
        "ShippingDate": shipping_date,
        "ProductCategory": product_info["category"],
        "CustomerRating": customer_rating,
        "CancellationReason": cancellation_reason,
    })


**Build the DataFrame, introduce some duplicates, and save to CSV**

In [None]:
# Assemble the generated records into a DataFrame.
df = pd.DataFrame(data)

# Re-append a random sample of existing rows so the dataset contains exact
# duplicate rows. The sample size is capped at the number of rows available:
# sampling without replacement raises a ValueError when more rows are
# requested than exist (e.g. when num_records is lowered for a quick run).
num_duplicates = min(500, len(df))
duplicate_rows = df.sample(n=num_duplicates)
df = pd.concat([df, duplicate_rows]).reset_index(drop=True)

# Write the final dataset (without the index column) to the working directory.
df.to_csv("synthetic_ecommerce_orders.csv", index=False)

print("synthetic_ecommerce_orders.csv created successfully.")

synthetic_ecommerce_orders.csv created successfully.
