In [1]:
import os

from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_openai import ChatOpenAI
from markdown_it.rules_block import reference

# Progress marker emitted once the imports in this cell have completed.
print("Generating data")

Generating data


In [2]:
from dotenv import load_dotenv

# Load key/value pairs from a .env file into the process environment so the
# API clients created later can read their credentials from it.
load_dotenv()

True

In [3]:
import os
import pandas as pd

# Read the OpenAI key from the environment; os.getenv returns None when the variable is unset.
# NOTE(review): no visible cell uses OPENAI_API_KEY (the model built later is ChatGroq) — confirm it is still needed.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


In [4]:
# Load the raw delivery dataset and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an index saved
# into the CSV — confirm whether it should be kept as a regular column.
df = pd.read_csv('public/economy_delivery_data/real_data.csv')
df.head()

Unnamed: 0,Order ID,Customer ID,Platform,Order Date & Time,Delivery Time (Minutes),Product Category,Order Value (INR),Customer Feedback,Service Rating,Delivery Delay,Refund Requested
0,ORD000001,CUST2824,JioMart,19:29.5,30,Fruits & Vegetables,382,"Fast delivery, great service!",5,No,No
1,ORD000002,CUST1409,Blinkit,54:29.5,16,Dairy,279,Quick and reliable!,5,No,No
2,ORD000003,CUST5506,JioMart,21:29.5,25,Beverages,599,Items missing from order.,2,No,Yes
3,ORD000004,CUST5012,JioMart,19:29.5,42,Beverages,946,Items missing from order.,2,Yes,Yes
4,ORD000005,CUST4657,Blinkit,49:29.5,30,Beverages,334,"Fast delivery, great service!",5,No,No


In [5]:
def gen_product_name():
    """Placeholder for product-name generation.

    Not implemented: the function takes no arguments, performs no work and
    returns ``None``.
    """
    return None

In [6]:
import random
from faker import Faker

# Loyalty tiers a customer can be assigned to.
customer_types = ["Vip", "Bronze", "Silver", "Gold", "Diamond"]

# Vietnamese-locale generator shared by every fake field below.
fake = Faker('vi_VN')

# Seed both random sources exactly once so the generated customers are
# reproducible across runs. Re-seeding Faker on every iteration from a value in
# 0..100000 meant that any two customers drawing the same seed received
# identical fake profiles.
CUSTOMER_SEED = 42
random.seed(CUSTOMER_SEED)
Faker.seed(CUSTOMER_SEED)

# Maps each unique Customer ID to its dictionary of generated attributes.
customer_info = {}

for cid in df['Customer ID'].unique():
    # Country and country code come from independent Faker calls, so they are
    # not guaranteed to describe the same country.
    country = fake.country()
    country_code = fake.country_code()
    # Faker does not tie the city to the generated country; it is simply a random city.
    city = fake.city()
    # Collapse the multi-line address into a single line.
    address = fake.address().replace("\n", ", ")
    # Names follow the generator's locale rather than the generated country.
    name = fake.name()
    ctype = random.choice(customer_types)
    post_code = fake.postcode()

    # Key order here determines the column order of the resulting DataFrame.
    customer_info[cid] = {
        "CustomerCountry": country,
        "CustomerAddress": address,
        "CustomerCity": city,
        "CustomerName": name,
        "CustomerType": ctype,
        "CustomerPostCode": post_code,
        "CustomerCountryCode": country_code,
        "Currency": fake.currency_code(),
        "JobCustomer": fake.job(),
        "Phone": fake.phone_number(),
    }


In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# Pool of 100 fake company names, drawn from the shared Faker instance.
list_company: list[str] = [fake.company() for _ in range(100)]

# Product categories a generated order can be tagged with.
list_category: list[str] = [
    "Appliances", "Baby", "Wallets", "Beauty", "Fashion",
    "Mobile", "Laptop", "Tech", "Other", "Food",
]

# Despite its name, this client is a Groq-hosted Llama model; replies are capped at 30 tokens.
openAI = ChatGroq(model='llama-3.1-8b-instant', max_tokens=30)

# Product names are formed as "<item> <brand>" from these two lists.
household_items = [
    "Shoe", "Phone", "Laptop", "Pillow",
    "Blanket", "Lamp", "Television", "Clothes",
]
brands = ["Nike", "Apple", "Samsung", "Sony", "Adidas", "Microsoft"]

# Filled by random_product_description(): product name -> generated description.
product_description = {}

def random_product_name():
    """Return a random product name of the form ``"<item> <brand>"``.

    The item is picked from ``household_items`` first, then the brand from
    ``brands``; the two are joined with a single space.
    """
    item = random.choice(household_items)
    brand = random.choice(brands)
    return f"{item} {brand}"

def random_product_description():
    """Fill ``product_description`` with an LLM-written blurb for every product.

    A product is every ``"<item> <brand>"`` combination of ``household_items``
    and ``brands``. Each combination triggers one call to the ``openAI`` chat
    model, and the reply text is stored in the module-level
    ``product_description`` dict under the product name. Returns ``None``.
    """
    # The prompt and the chain do not depend on the product, so they are built
    # once instead of being rebuilt for every item/brand pair.
    message = [('human', """
                give me the product description in **max 15 words** of {product_name}. If the product is not real, could you self write fake description? return as text only.
                """
    )]
    chain = ChatPromptTemplate.from_messages(message) | openAI

    for item in household_items:
        for brand in brands:
            # Same "<item> <brand>" format that random_product_name() produces,
            # so its results can be used as lookup keys.
            product = item + " " + brand
            response = chain.invoke({
                "product_name": product
            })
            product_description[product] = response.content

# Populate product_description up front; this issues one LLM request per item/brand pair.
random_product_description()


In [8]:
# Maps each unique Order ID to its dictionary of generated order/product attributes.
order_info = {}

# Seed both random sources exactly once so the generated orders are
# reproducible. Re-seeding Faker on every iteration from a value in 0..100000
# made every pair of orders that drew the same seed share the same price,
# productId and image URL.
ORDER_SEED = 43
random.seed(ORDER_SEED)
Faker.seed(ORDER_SEED)

for oid in df['Order ID'].unique():
    order_price = fake.pricetag()
    order_payment_methods = random.choice(['Visa', 'MasterCard', 'Exchange', 'Money'])
    order_delivery_methods = random.choice(['Fast', 'Standard', 'VIP'])
    order_total_items = random.randint(1, 10)
    product_name = random_product_name()
    # Direct indexing on purpose: a missing description means the description
    # cell has not been run, and should fail loudly rather than write blanks.
    product_desc = product_description[product_name]

    # Key order here determines the column order of the resulting DataFrame.
    order_info[oid] = {
        'OrderValue': order_price,
        'OrderPaymentMethod': order_payment_methods,
        'OrderDeliveryMethod': order_delivery_methods,
        'OrderTotalItems': order_total_items,
        'productName': product_name,
        'productInfo': product_desc,
        'productId': fake.bothify('PROD#####'),
        # Brand and type are picked independently of productName.
        'productBrand': random.choice(list_company),
        'productType': random.choice(list_category),
        "productImageUrl": fake.image_url(),
    }


In [9]:
# One row per customer: the dict keys become the 'Customer ID' column.
customer_df = (
    pd.DataFrame.from_dict(customer_info, orient='index')
    .rename_axis('Customer ID')
    .reset_index()
)

In [10]:
# One row per order: the dict keys become the 'Order ID' column.
order_df = (
    pd.DataFrame.from_dict(order_info, orient='index')
    .rename_axis('Order ID')
    .reset_index()
)

In [11]:
# Columns contributed by the generated frames (everything except the join keys).
generated_cols = (
    [col for col in customer_df.columns if col != 'Customer ID']
    + [col for col in order_df.columns if col != 'Order ID']
)

# Drop any generated columns left over from a previous run of this cell, so
# re-running it does not pile up '_x' / '_y' suffixed duplicates. On a fresh
# frame none of these columns exist and this is a no-op.
df = df.drop(columns=generated_cols, errors='ignore')

# Left joins keep every original row; validate makes pandas raise if a key is
# duplicated on the generated side, which would otherwise multiply rows silently.
df = df.merge(customer_df, on='Customer ID', how='left', validate='many_to_one')
df = df.merge(order_df, on='Order ID', how='left', validate='many_to_one')

In [12]:
# Preview the merged frame to check that the generated columns were attached.
df.head()

Unnamed: 0,Order ID,Customer ID,Platform,Order Date & Time,Delivery Time (Minutes),Product Category,Order Value (INR),Customer Feedback,Service Rating,Delivery Delay,...,OrderValue,OrderPaymentMethod,OrderDeliveryMethod,OrderTotalItems,productName,productInfo,productId,productBrand,productType,productImageUrl
0,ORD000001,CUST2824,JioMart,19:29.5,30,Fruits & Vegetables,382,"Fast delivery, great service!",5,No,...,₫695.04,Money,Fast,2,Laptop Sony,"**Sony Zephyr X1: Sleek, powerful laptop with ...",PROD50435,Lê Hoàng Công ty Trách nhiệm hữu hạn,Food,https://picsum.photos/752/387
1,ORD000002,CUST1409,Blinkit,54:29.5,16,Dairy,279,Quick and reliable!,5,No,...,"₫29,032.82",Exchange,Fast,6,Blanket Sony,Soft and cozy Sony Blanket with wireless charg...,PROD74377,Vũ Lê Công ty TNHH,Wallets,https://picsum.photos/478/795
2,ORD000003,CUST5506,JioMart,21:29.5,25,Beverages,599,Items missing from order.,2,No,...,₫6.76,MasterCard,Fast,5,Clothes Sony,"Sony does not make clothes, it's a technology ...",PROD00518,Vũ và đối tác Công ty TNHH MTV,Tech,https://dummyimage.com/697x850
3,ORD000004,CUST5012,JioMart,19:29.5,42,Beverages,946,Items missing from order.,2,Yes,...,₫52.85,Exchange,Fast,7,Television Nike,"Nike doesn't manufacture televisions, however ...",PROD64075,Lê và Dương Doanh nghiệp tư nhân,Baby,https://placekitten.com/675/924
4,ORD000005,CUST4657,Blinkit,49:29.5,30,Beverages,334,"Fast delivery, great service!",5,No,...,"₫4,393.46",Exchange,VIP,6,Blanket Samsung,"**Samsung Cozyblanket**: Premium, ultra-soft m...",PROD46803,Mai Công ty TNHH,Appliances,https://dummyimage.com/721x374


In [14]:
# Only the first 60 rows are written to the generated-data file; the
# full-frame export on the next line is left commented out.
# df.to_csv('public/economy_delivery_data/real_data_generate.csv', index=False)
first_60_rows = df.head(60)
first_60_rows.to_csv('public/economy_delivery_data/real_data_generate.csv', index=False)

In [18]:
# Draw a reproducible random sample of 20 rows from the generated file and
# save it as the reference dataset.
df1 = pd.read_csv('public/economy_delivery_data/real_data_generate.csv').sample(n=20, random_state=42)
df1.to_csv('public/economy_delivery_data/reference_data.csv', index=False)

In [19]:
# For every low-cardinality column, record the distinct non-null values it holds.
reference_data = []
for column in df.columns:
    distinct = df[column].dropna().unique()
    # Columns with more than 50 distinct values are left out of the reference file.
    if len(distinct) <= 50:
        reference_data.append({
            "column_name": column,
            # All distinct values, stringified and joined into one comma-separated cell.
            "reference_value": ", ".join(str(value) for value in distinct),
        })

reference_df = pd.DataFrame(reference_data)
reference_df.to_csv('public/economy_delivery_data/reference_data_value.csv', index=False)