In [31]:
!pip install -U google-genai
!pip install ctgan sdv
!pip install kagglehub
!pip install openai



In [2]:
import pandas as pd
import numpy as np
import os

from dotenv import load_dotenv

In [6]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/online-retail-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'online-retail-dataset' dataset.
Path to dataset files: /kaggle/input/online-retail-dataset


In [9]:
# List files to find the correct CSV name
files = os.listdir(path)
print("Files in directory:", files)

Files in directory: ['online_retail_II.xlsx']


In [11]:
df = pd.read_excel(path + "/online_retail_II.xlsx")

In [12]:
df.shape

(525461, 8)

In [13]:
df_sample = df.sample(n=5000)
df_sample.shape

(5000, 8)

In [14]:
df_sample.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

In [15]:
df_sample = df_sample.dropna(subset=['Customer ID', 'Price', 'Quantity'])

In [17]:
import os
import pandas as pd
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer

# Define file paths
METADATA_FILE = 'metadata.json'
MODEL_FILE = 'my_ctgan_model.pkl'
cols_to_model = ['Quantity', 'Price', 'Country']

# 1. Handle Metadata
if os.path.exists(METADATA_FILE):
    metadata = Metadata.load_from_json(METADATA_FILE)
    print("✓ Metadata loaded from file.")
else:
    # Detect and save if it doesn't exist
    df_for_ctgan = df_sample[cols_to_model].copy()
    metadata = Metadata.detect_from_dataframe(data=df_for_ctgan, table_name='retail_patterns')
    metadata.save_to_json(METADATA_FILE)
    print("! Metadata detected and saved.")

# 2. Handle the Trained Model
if os.path.exists(MODEL_FILE):
    # Load the pre-trained synthesizer
    synthesizer = CTGANSynthesizer.load(MODEL_FILE)
    print("Trained model loaded from file. Skipping training phase.")
else:
    # Initialize and train if no model is found
    print("No model found. Starting training (this may take a few minutes)...")
    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=100,
        cuda=True,
        verbose=True
    )
    synthesizer.fit(df_sample[cols_to_model])
    # Save the model so you don't have to train again
    synthesizer.save(MODEL_FILE)
    print(f"Training complete. Model saved to {MODEL_FILE}.")

# 3. Generate 1,000 numerical/country records
# This works instantly once the model is loaded or trained
df_numerical_sim = synthesizer.sample(num_rows=1000)

Gen. (-0.47) | Discrim. (0.00): 100%|██████████| 100/100 [00:15<00:00,  6.25it/s]


In [35]:
import os
import json
import time
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
# Ensure your .env has OPENAI_API_KEY
client = OpenAI(api_key=os.getenv("OPENAI_API"))

def process_in_batches_openai(df, batch_size=20):
    all_results = []

    for i in range(0, len(df), batch_size):
        batch = df.iloc[i : i + batch_size]
        context_list = [f"Qty {row['Quantity']}, Price {row['Price']}" for _, row in batch.iterrows()]

        # System instructions are crucial for OpenAI JSON mode
        messages = [
            {"role": "system", "content": "You are a retail data simulator. Output a JSON list of objects."},
            {"role": "user", "content": f"Simulate retail transactions for these context points: {context_list}. Each object must have keys: 'Description' and 'Review'."}
        ]

        try:
            print(f"Processing batch {i//batch_size + 1}...")
            response = client.chat.completions.create(
                model="gpt-4o-mini", # More cost-effective for simulation
                messages=messages,
                response_format={"type": "json_object"} # FORCES valid JSON
            )

            # OpenAI returns the JSON as a string inside a content field
            content = response.choices[0].message.content
            batch_data = json.loads(content)

            # OpenAI often wraps the list in a key (e.g., {"transactions": [...]})
            # This line ensures we extract just the list regardless of the key name
            if isinstance(batch_data, dict):
                key = list(batch_data.keys())[0]
                batch_data = batch_data[key]

            all_results.extend(batch_data)

            # OpenAI Tier 1/Free limits are usually higher, but 2-5 sec is safe
            time.sleep(2)

        except Exception as e:
            print(f"Error in batch {i}: {e}")
            all_results.extend([{'Description': 'Error', 'Review': 'N/A'}] * len(batch))

    return pd.DataFrame(all_results)

# 1. Generate text
df_text = process_in_batches_openai(df_numerical_sim)

# 2. Join
df_final = pd.concat([df_numerical_sim.reset_index(drop=True), df_text], axis=1)

Processing batch 1...
Error in batch 0: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************WRa-. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'code': 'invalid_api_key', 'param': None}, 'status': 401}
Processing batch 2...
Error in batch 20: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************WRa-. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'code': 'invalid_api_key', 'param': None}, 'status': 401}
Processing batch 3...
Error in batch 40: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-proj-********************************************WRa-. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'code': 'invalid_api_key', 'param': None}, 'status'

In [None]:
# 3. Save as the final simulated dataset (Satisfies Deliverable 2)
df_final.to_csv("simulated_business_records.csv", index=False)