In [None]:
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import torch
from transformers import pipeline, set_seed

In [None]:
# Read a capped sample of each dataset so the notebook stays quick to run.
ROW_LIMIT = 1000  # number of rows read from each CSV
products_df = pd.read_csv("product_asin.csv", nrows=ROW_LIMIT)
reviews_df = pd.read_csv("reviews_supplements.csv", nrows=ROW_LIMIT)

In [None]:
# Initialize the text generation pipeline with distilgpt2 (a small model).
# The device is passed explicitly: when `device` is omitted the pipeline leaves
# the model on CPU even if a GPU is available (transformers logs a warning
# saying so). 0 selects the first CUDA device, -1 selects the CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1
generator = pipeline('text-generation', model='distilgpt2', device=pipeline_device)
set_seed(42)  # for reproducibility

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Generate the synthetic review
def generate_synthetic_review(product_info, max_length=100):  # Reduced max_length
    """Generate a synthetic review text for one product using the global `generator`.

    Args:
        product_info: Mapping with at least a 'title' and a 'category' entry.
            Only the first 30 characters of the title are put into the prompt;
            a missing (None/NaN) title is treated as an empty string.
        max_length: Forwarded to the generator as its `max_length` argument.

    Returns:
        The text the generator produced after the prompt, with surrounding
        whitespace stripped.
    """
    raw_title = product_info['title']
    # Titles read from a CSV can be NaN (a float), which cannot be sliced.
    title = "" if pd.isna(raw_title) else str(raw_title)
    prompt = f"Product: {title[:30]}. Category: {product_info['category']}. Review:"
    generated_text = generator(
        prompt,
        max_length=max_length,
        num_return_sequences=1,
        # Setting the pad token explicitly avoids the "Setting `pad_token_id` to
        # `eos_token_id`" log line that is otherwise emitted on every call.
        pad_token_id=generator.tokenizer.eos_token_id,
    )[0]['generated_text']

    # The output echoes the prompt, so drop exactly that prefix. Splitting on
    # "Review:" and taking element 1 returns the wrong piece when the title
    # itself contains "Review:", and cuts the review short when the model
    # writes "Review:" again.
    if generated_text.startswith(prompt):
        return generated_text[len(prompt):].strip()

    # Fallback if the prompt is not echoed verbatim: keep everything after the
    # first marker, or the whole text when the marker is absent.
    _, marker, remainder = generated_text.partition("Review:")
    return (remainder if marker else generated_text).strip()

# Generate random date within a range
def random_date(start, end):
    """Return `start` shifted forward by a random whole number of seconds.

    The shift is drawn with `random.randint` from 0 up to the number of whole
    seconds between `start` and `end`, both bounds included.
    """
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset

# Generate synthetic reviews (500 reviews)
synthetic_reviews = []
num_reviews_to_generate = 500  # Number of reviews to generate

# The date window (the last year) is the same for every review, so compute it
# once instead of on every iteration.
end_date = datetime.now()
start_date = end_date - timedelta(days=365)

# A named index is used instead of `_`: the value is consumed below to build
# user_id, and in IPython `_` is the variable that holds the previous output.
for review_index in range(num_reviews_to_generate):
    # Randomly select a product
    product = products_df.sample(1).iloc[0]

    # A title read from the CSV can be missing (NaN, a float); slicing it below
    # would raise TypeError, so fall back to an empty string.
    product_title = "" if pd.isna(product["title"]) else str(product["title"])
    product_info = {
        "title": product_title,
        "category": product["cat1"],
        # Fall back to the top-level category when no subcategory is present.
        "subcategory": product["cat2"] if pd.notna(product["cat2"]) else product["cat1"]
    }

    # Generate a synthetic review
    review_text = generate_synthetic_review(product_info)

    # Generate random date and time (within the last year)
    review_date = random_date(start_date, end_date)

    # Generate other random attributes
    rating = random.randint(1, 5)
    helpful_vote = random.randint(0, 5)  # Reduced max helpful votes
    verified_purchase = random.choice([True, False])

    synthetic_reviews.append({
        "rating": rating,
        "title": f"Review for {product_info['title'][:20]}...",  # Shortened the title to make it simpler
        "text": review_text,
        # Both ASIN fields are filled from the product's parent_asin.
        "asin": product["parent_asin"],
        "parent_asin": product["parent_asin"],
        "user_id": f"user_{review_index}",
        "timestamp": review_date.strftime("%Y-%m-%d %H:%M:%S"),
        "helpful_vote": helpful_vote,
        "verified_purchase": verified_purchase,
        "date": review_date.strftime("%Y-%m-%d"),
        "time": review_date.strftime("%H:%M:%S")
    })

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for 

In [None]:
# Turn the list of review records into a table
synthetic_df = pd.DataFrame(data=synthetic_reviews)

# Write the table to disk without the index column and report how many
# reviews were produced
synthetic_df.to_csv(path_or_buf="synthetic_reviews_small.csv", index=False)
review_count = len(synthetic_reviews)
print(f"Generated {review_count} synthetic reviews and saved to synthetic_reviews_small.csv")