In [1]:
import pandas as pd
import numpy as np
import os

from dotenv import load_dotenv

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/online-retail-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\User\.cache\kagglehub\datasets\lakshmi25npathi\online-retail-dataset\versions\1


In [4]:
# List files to find the correct CSV name
files = os.listdir(path)
print("Files in directory:", files)

Files in directory: ['online_retail_II.xlsx']


In [5]:
df = pd.read_excel(path + "/online_retail_II.xlsx")

In [6]:
df.shape

(525461, 8)

In [7]:
df_sample = df.sample(n=5000)
df_sample.shape

(5000, 8)

In [8]:
df_sample.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')

In [9]:
df_sample = df_sample.dropna(subset=['Customer ID', 'Price', 'Quantity'])

In [10]:
import os
import pandas as pd
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer

# Define file paths
METADATA_FILE = 'metadata.json'
MODEL_FILE = 'my_ctgan_model.pkl'
cols_to_model = ['Quantity', 'Price', 'Country']

# 1. Handle Metadata
if os.path.exists(METADATA_FILE):
    metadata = Metadata.load_from_json(METADATA_FILE)
    print("✓ Metadata loaded from file.")
else:
    # Detect and save if it doesn't exist
    df_for_ctgan = df_sample[cols_to_model].copy()
    metadata = Metadata.detect_from_dataframe(data=df_for_ctgan, table_name='retail_patterns')
    metadata.save_to_json(METADATA_FILE)
    print("! Metadata detected and saved.")

# 2. Handle the Trained Model
if os.path.exists(MODEL_FILE):
    # Load the pre-trained synthesizer
    synthesizer = CTGANSynthesizer.load(MODEL_FILE)
    print("Trained model loaded from file. Skipping training phase.")
else:
    # Initialize and train if no model is found
    print("No model found. Starting training (this may take a few minutes)...")
    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=100,
        cuda=True,
        verbose=True
    )
    synthesizer.fit(df_sample[cols_to_model])
    # Save the model so you don't have to train again
    synthesizer.save(MODEL_FILE)
    print(f"Training complete. Model saved to {MODEL_FILE}.")

# 3. Generate 1,000 numerical/country records
# This works instantly once the model is loaded or trained
df_numerical_sim = synthesizer.sample(num_rows=1000)

✓ Metadata loaded from file.
Trained model loaded from file. Skipping training phase.



The 'load' function will be deprecated in future versions of SDV. Please use 'utils.load_synthesizer' instead.



In [13]:
df_numerical_sim.head()
len(df_numerical_sim)

1000

In [11]:
import os
import json
import time
import pandas as pd
import google.generativeai as genai
from dotenv import load_dotenv

load_dotenv()

# Configure Gemini API
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

def process_in_batches_gemini(df, batch_size=20):
    all_results = []
    
    # Use Gemini 2.5 Flash - the latest and most capable Flash model
    model = genai.GenerativeModel(
        'gemini-2.5-flash',  # Latest Flash model
        generation_config=genai.types.GenerationConfig(
            temperature=0.7,
            max_output_tokens=8192,
            response_mime_type="application/json"  # Strict JSON mode
        )
    )
    
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i : i + batch_size]
        context_list = [
            {"index": idx, "quantity": row['Quantity'], "price": row['Price']} 
            for idx, (_, row) in enumerate(batch.iterrows())
        ]
        
        # Structured prompt for Gemini 2.5
        prompt = f"""Generate exactly {len(batch)} retail transaction records.

Input context: {json.dumps(context_list)}

Return a JSON array with this structure:
[
  {{"Description": "product name and details", "Review": "customer review"}},
  ...
]

Requirements:
- Exactly {len(batch)} objects in the array
- Each Description: 30-60 characters
- Each Review: 50-100 characters
- Make reviews realistic and varied
- Base quantity and price mentions on the input context"""
        
        try:
            print(f"Processing batch {i//batch_size + 1} (rows {i}-{i+len(batch)-1})...")
            
            response = model.generate_content(prompt)
            content = response.text.strip()
            
            # Gemini 2.5 with JSON MIME type should return clean JSON
            if content.startswith('```'):
                content = content.split('```')[1]
                if content.startswith('json'):
                    content = content[4:]
                content = content.strip()
            
            batch_data = json.loads(content)
            
            # Handle wrapped responses
            if isinstance(batch_data, dict):
                if 'transactions' in batch_data:
                    batch_data = batch_data['transactions']
                elif 'records' in batch_data:
                    batch_data = batch_data['records']
                else:
                    # Get first list value
                    for value in batch_data.values():
                        if isinstance(value, list):
                            batch_data = value
                            break
            
            # Validate count
            if len(batch_data) != len(batch):
                print(f"⚠ Warning: Expected {len(batch)}, got {len(batch_data)} records")
                # Pad or trim
                while len(batch_data) < len(batch):
                    batch_data.append({
                        'Description': 'Generated placeholder', 
                        'Review': 'Additional record needed'
                    })
                batch_data = batch_data[:len(batch)]
            
            all_results.extend(batch_data)
            print(f"✓ Successfully processed {len(batch_data)} records")
            
            # Gemini 2.5 Flash has improved rate limits
            time.sleep(1)
            
        except json.JSONDecodeError as e:
            print(f"✗ JSON parsing error in batch {i}: {e}")
            print(f"Response length: {len(content)} chars")
            print(f"First 300 chars: {content[:300]}")
            print(f"Last 300 chars: {content[-300:]}")
            
            # Try to salvage
            try:
                last_complete = content.rfind('}')
                if last_complete > 0:
                    fixed_content = content[:last_complete + 1] + ']'
                    batch_data = json.loads(fixed_content)
                    
                    if isinstance(batch_data, dict):
                        batch_data = list(batch_data.values())[0] if batch_data.values() else []
                    
                    recovered = len(batch_data)
                    print(f"  ↳ Recovered {recovered}/{len(batch)} records")
                    all_results.extend(batch_data)
                    
                    # Fill missing
                    missing = len(batch) - recovered
                    if missing > 0:
                        all_results.extend([
                            {'Description': 'Recovery incomplete', 'Review': 'Partial data'}
                        ] * missing)
                else:
                    raise ValueError("No complete objects found")
                    
            except Exception as recovery_error:
                print(f"  ↳ Recovery failed: {recovery_error}")
                all_results.extend([
                    {'Description': 'Parse error', 'Review': 'JSON incomplete'}
                ] * len(batch))
                
        except Exception as e:
            print(f"✗ Unexpected error in batch {i}: {type(e).__name__}: {e}")
            all_results.extend([
                {'Description': 'Generation error', 'Review': 'Request failed'}
            ] * len(batch))
    
    return pd.DataFrame(all_results)

# Process with Gemini 2.5 Flash
print("Starting batch processing with Gemini 2.5 Flash...")
df_text = process_in_batches_gemini(df_numerical_sim, batch_size=20)

# Join results
df_final = pd.concat([df_numerical_sim.reset_index(drop=True), df_text], axis=1)

print(f"\n{'='*60}")
print(f"Processing Complete!")
print(f"{'='*60}")
print(f"Total rows: {len(df_final)}")
print(f"Successful: {len(df_final[~df_final['Description'].str.contains('error|Error|incomplete|Incomplete', case=False, na=False)])}")
print(f"Errors: {len(df_final[df_final['Description'].str.contains('error|Error|incomplete|Incomplete', case=False, na=False)])}")
print(f"\nSample output:")
print(df_final.head(3))

Starting batch processing with Gemini 2.5 Flash...
Processing batch 1 (rows 0-19)...
✓ Successfully processed 20 records
Processing batch 2 (rows 20-39)...
✓ Successfully processed 20 records
Processing batch 3 (rows 40-59)...
✓ Successfully processed 20 records
Processing batch 4 (rows 60-79)...
✓ Successfully processed 20 records
Processing batch 5 (rows 80-99)...
✓ Successfully processed 20 records
Processing batch 6 (rows 100-119)...
✓ Successfully processed 20 records
Processing batch 7 (rows 120-139)...
✓ Successfully processed 20 records
Processing batch 8 (rows 140-159)...
✓ Successfully processed 20 records
Processing batch 9 (rows 160-179)...
✓ Successfully processed 20 records
Processing batch 10 (rows 180-199)...
✓ Successfully processed 20 records
Processing batch 11 (rows 200-219)...
✓ Successfully processed 20 records
Processing batch 12 (rows 220-239)...
✓ Successfully processed 20 records
Processing batch 13 (rows 240-259)...
✓ Successfully processed 20 records
Process

In [14]:
# 3. Save as the final simulated dataset (Satisfies Deliverable 2)
df_final.to_csv("simulated_business_records.csv", index=False)