# Leveraging Large Language Models (LLMs) to enhance product descriptions and SEO

- Develop a system using an LLM to generate optimized product descriptions.
- Analyze the impact of AI-generated content on search rankings and conversion rates.


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
from datasets import load_dataset, DatasetDict

# Read the product CSV into a Hugging Face dataset object.
amazon_dataset = load_dataset("csv", data_files='./amazon.csv')

# A single checkpoint name drives both the tokenizer and the model weights.
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding so inputs can be padded later.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)



In [None]:
def combine(row):
    """Build one space-separated 'text' string per product from a batch.

    Args:
        row: A batch mapping column name -> list of values (one entry per
            product), as supplied by a batched ``map`` call.

    Returns:
        A dict with a single key ``'text'`` holding one string per product,
        made of the selected column values joined by single spaces.
    """
    # Order matters: it fixes the position of each field in the output text.
    columns = (
        'product_id', 'product_name', 'category', 'discounted_price',
        'actual_price', 'discount_percentage', 'rating', 'rating_count',
        'about_product',
    )
    merged = []
    # zip(*) walks the selected columns in lockstep, one product at a time.
    for record in zip(*(row[column] for column in columns)):
        merged.append(" ".join(f"{value}" for value in record))
    return {'text': merged}


# Add the combined 'text' column; batched=True hands combine() lists of
# column values rather than one record at a time.
amazon_dataset = amazon_dataset.map(combine, batched=True)

Map: 100%|██████████| 1465/1465 [00:00<00:00, 19756.72 examples/s]


In [None]:
def tokenize_function(row):
    """Tokenize a batch of combined product texts for causal-LM fine-tuning.

    Args:
        row: A batch mapping whose ``'text'`` entry is a list of strings
            (the shape produced by a batched ``map`` call).

    Returns:
        The tokenizer output with an added ``'labels'`` entry: a per-example
        copy of ``input_ids`` in which every padded position (attention mask
        of 0) is replaced by ``-100``.
    """
    tokenized_inputs = tokenizer(
        row['text'], truncation=True, padding='max_length', max_length=524)
    # Every example is padded to max_length with the EOS token id, so copying
    # input_ids verbatim would make most of the loss come from predicting
    # padding. -100 is the label value the language-model loss skips, so only
    # real tokens contribute to training.
    tokenized_inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized_inputs['input_ids'],
                             tokenized_inputs['attention_mask'])
    ]
    return tokenized_inputs


# Tokenize every example in batches; tokenize_function adds the model inputs
# and the training labels.
tokenized_dataset = amazon_dataset.map(tokenize_function, batched=True)

# Hold out 10% of the rows for validation. The fixed seed keeps the split
# identical across kernel restarts, so evaluation results stay comparable
# between runs.
train_test_split = tokenized_dataset['train'].train_test_split(
    test_size=0.1, seed=42)

# Re-label the held-out 'test' part as 'validation' for the Trainer.
tokenized_dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test']
})

Map: 100%|██████████| 1465/1465 [00:01<00:00, 1150.08 examples/s]


In [None]:
# Display the train/validation splits to check their columns and row counts.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['product_id', 'product_name', 'category', 'discounted_price', 'actual_price', 'discount_percentage', 'rating', 'rating_count', 'about_product', 'user_id', 'user_name', 'review_id', 'review_title', 'review_content', 'img_link', 'product_link', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1318
    })
    validation: Dataset({
        features: ['product_id', 'product_name', 'category', 'discounted_price', 'actual_price', 'discount_percentage', 'rating', 'rating_count', 'about_product', 'user_id', 'user_name', 'review_id', 'review_title', 'review_content', 'img_link', 'product_link', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 147
    })
})

In [None]:
# Hyperparameters for the fine-tuning run, gathered in one place so they are
# easy to inspect and tweak.
hyperparameters = dict(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)
training_args = TrainingArguments(**hyperparameters)

# Wire the model to the train/validation splits prepared earlier.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

# Fine-tune, then report metrics on the validation split as the cell output.
trainer.train()
trainer.evaluate()



KeyboardInterrupt: 

In [None]:
# Model weights and tokenizer files go to the same directory so they can be
# reloaded together from one path.
fine_tuned_dir = './fine_tuned_distilgpt2'
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

('./fine_tuned_distilgpt2/tokenizer_config.json',
 './fine_tuned_distilgpt2/special_tokens_map.json',
 './fine_tuned_distilgpt2/vocab.json',
 './fine_tuned_distilgpt2/merges.txt',
 './fine_tuned_distilgpt2/added_tokens.json')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine_tuned_distilgpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine_tuned_distilgpt2')

# Use the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Move model to the appropriate device (GPU or CPU)
device = torch.device(
    'cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.eval()

# Sampling is random; fix the seed so re-running the cell gives the same text
torch.manual_seed(42)

# Input text for generation
input_text = "What is the cheapest phone that right now?"
inputs = tokenizer(input_text, return_tensors='pt',
                   padding=True, truncation=True)

input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

# Generate text with diversity controls and the attention mask
generated_text_ids = model.generate(
    input_ids,
    attention_mask=attention_mask,  # Pass attention mask
    max_length=100,
    num_return_sequences=1,
    no_repeat_ngram_size=2,          # Prevent repetition of 2-grams
    # Sampling must be switched on explicitly: without do_sample=True,
    # generate() decodes greedily and top_k / top_p / temperature have no
    # effect.
    do_sample=True,
    top_k=50,                        # Use top-k sampling
    top_p=0.95,                      # Use nucleus sampling
    temperature=0.7,                 # Lower temperature for more focused sampling
    pad_token_id=tokenizer.eos_token_id  # Use EOS token as the pad token
)

# Decode the generated text
generated_text = tokenizer.decode(
    generated_text_ids[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)



What is the cheapest phone that right now? It's a good phone for the price range, but it's not a perfect phone, it has a bit of a lag, and it doesn't support Voice Assistant. It has some issues with the phone itself, which is why it is not available in the US. But it does support voice assistant, so you can get it for your family. So, you have to go for it. You can buy it in India, for example. And it
