In [None]:
import gzip
import json
import re
from collections import Counter
import torch
from torch import nn, optim
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

In [None]:
# download the dataset
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Electronics.json.gz

--2024-12-16 21:15:18--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Electronics.json.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3322874357 (3.1G) [application/x-gzip]
Saving to: ‘Electronics.json.gz’


2024-12-16 21:16:40 (39.1 MB/s) - ‘Electronics.json.gz’ saved [3322874357/3322874357]



In [None]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
# Download the 'punkt_tab' resource
nltk.download('punkt_tab')

# Step 1: Load the review texts from the downloaded archive
file_path = "Electronics.json.gz"
# Optional cap on the number of review texts to read. None reads the whole
# archive (the original behaviour); set an integer to stop early and avoid
# holding every review of the multi-gigabyte file in memory.
max_reviews = None
reviews = []

# Each line of the file is parsed as one JSON document. JSON text is UTF-8, so
# decode explicitly rather than relying on the platform's default encoding.
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    for line in f:
        review = json.loads(line)
        # Records without a 'reviewText' field are skipped.
        if 'reviewText' in review:
            reviews.append(review['reviewText'])
            if max_reviews is not None and len(reviews) >= max_reviews:
                break

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Downsample dataset: keep only the first N reviews.
# (The 'punkt_tab' tokenizer data is already fetched by the NLTK download calls
# in the cell that loads `reviews`, so it is not downloaded a second time here.)
N = 100000
selected_reviews = reviews[:N]

# Define stopwords (a set gives constant-time membership tests during filtering)
stop_words = set(stopwords.words('english'))

# Text preprocessing function
def preprocess_text(text):
    """Turn a raw review string into a list of lowercase content tokens.

    The text is lowercased, punctuation is replaced by spaces, the result is
    split with NLTK's ``word_tokenize``, and tokens are kept only if they are
    purely alphanumeric and not in the module-level ``stop_words`` set.

    Args:
        text: Raw review text (str).

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    text = text.lower()  # Convert to lowercase
    # Replace punctuation with a space instead of deleting it. Deleting glues
    # the neighbouring pieces together ("don't" -> "dont", "well-made" ->
    # "wellmade"), producing junk tokens; the fused contractions also no longer
    # match their stop-word entries and so slip through the filter below.
    text = re.sub(r'[^\w\s]', ' ', text)
    tokens = word_tokenize(text)  # Tokenize
    # isalnum() additionally rejects tokens containing "_" (which \w keeps).
    return [token for token in tokens if token.isalnum() and token not in stop_words]

# Preprocess all reviews
processed_reviews = [preprocess_text(review) for review in selected_reviews]

# Step 2: Vocabulary creation
min_word_count = 5  # words occurring fewer times than this are left out
word_counts = Counter(word for review in processed_reviews for word in review)
# Filter by frequency first, then number the survivors, so that ids are
# contiguous in [0, vocab_size). Enumerating before filtering leaves gaps, and
# any id >= vocab_size is out of range for nn.Embedding(vocab_size, ...).
frequent_words = [word for word, count in word_counts.items() if count >= min_word_count]
vocab = {word: idx for idx, word in enumerate(frequent_words)}
vocab_size = len(vocab)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Lookup tables in both directions; ids follow the iteration order of `vocab`
word_to_index = dict(zip(vocab, range(len(vocab))))
index_to_word = dict(enumerate(word_to_index))

# Step 3: Generate training data (center-context pairs)
window_size = 2
def generate_training_data(processed_reviews, word_to_index, window_size):
    """Build (center, context) index pairs from tokenized reviews.

    Tokens missing from ``word_to_index`` are dropped before windowing, so the
    window spans the remaining in-vocabulary tokens only.

    Args:
        processed_reviews: Iterable of token lists.
        word_to_index: Mapping from token to integer id.
        window_size: Number of neighbours considered on each side of a center.

    Returns:
        list[tuple[int, int]]: ``(center_id, context_id)`` pairs, ordered by
        review, then center position, then context position.
    """
    pairs = []
    for tokens in processed_reviews:
        ids = [word_to_index[tok] for tok in tokens if tok in word_to_index]
        n_ids = len(ids)
        for pos, center in enumerate(ids):
            # Clip the window to the bounds of the review.
            first = max(0, pos - window_size)
            stop = min(n_ids, pos + window_size + 1)
            # A word is never paired with its own position.
            pairs.extend((center, ids[ctx]) for ctx in range(first, stop) if ctx != pos)
    return pairs


In [None]:
# Materialise every (center, context) index pair for the preprocessed reviews
training_pairs = generate_training_data(
    processed_reviews=processed_reviews,
    word_to_index=word_to_index,
    window_size=window_size,
)

# Step 4: Simplified Word2Vec model
class Word2Vec(nn.Module):
    """Embedding lookup followed by a linear layer over the vocabulary.

    Args:
        vocab_size: Number of distinct word ids (rows of the embedding table
            and width of the output layer).
        embed_size: Dimensionality of each word vector.
    """

    def __init__(self, vocab_size, embed_size):
        super().__init__()
        # Creation order (embedding, then output layer) is kept so parameter
        # initialisation consumes the RNG in the same sequence.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.output_layer = nn.Linear(in_features=embed_size, out_features=vocab_size)

    def forward(self, center_words):
        """Return one raw score per vocabulary entry for each input word id."""
        return self.output_layer(self.embedding(center_words))

# Hyperparameters
embed_size = 50  # dimensionality of each word embedding vector
batch_size = 1024  # number of (center, context) pairs per DataLoader batch
epochs = 5  # number of full passes over the DataLoader during training

# DataLoader for batching
def create_dataloader(training_pairs, batch_size):
    """Wrap (center, context) index pairs in a shuffling DataLoader.

    Args:
        training_pairs: Sequence of pairs; element 0 is the center word id and
            element 1 is the context word id.
        batch_size: Number of pairs per batch.

    Returns:
        torch.utils.data.DataLoader: Yields ``(center, context)`` batches of
        int64 tensors, reshuffled on every pass.
    """
    center_ids = torch.tensor([item[0] for item in training_pairs], dtype=torch.long)
    context_ids = torch.tensor([item[1] for item in training_pairs], dtype=torch.long)
    pair_dataset = torch.utils.data.TensorDataset(center_ids, context_ids)
    loader = torch.utils.data.DataLoader(pair_dataset, batch_size=batch_size, shuffle=True)
    return loader

# Batched, shuffled view over the training pairs
dataloader = create_dataloader(training_pairs=training_pairs, batch_size=batch_size)


In [None]:
# Seed the global torch RNG before the model is built so that weight
# initialisation and the DataLoader's shuffling repeat from run to run
# (results on GPU may still differ slightly between runs).
SEED = 42
torch.manual_seed(SEED)

# Model, optimizer, and loss function
model = Word2Vec(vocab_size, embed_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = optim.Adam(model.parameters())  # Adam with its default settings
# CrossEntropyLoss expects raw, unnormalised scores and integer class targets;
# here the targets are the context word ids.
criterion = nn.CrossEntropyLoss()

# Training loop
model.train()  # make training mode explicit
num_batches = len(dataloader)
for epoch in range(epochs):
    total_loss = 0
    for center, context in dataloader:
        center, context = center.to(device), context.to(device)
        optimizer.zero_grad()
        output = model(center)
        loss = criterion(output, context)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # The summed loss scales with the number of batches, so also report the
    # per-batch mean, which stays comparable across dataset and batch sizes.
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}, Avg batch loss: {avg_loss:.4f}")

Epoch 1/5, Loss: 87897.6557
Epoch 2/5, Loss: 84426.3095
Epoch 3/5, Loss: 83602.1010
Epoch 4/5, Loss: 83138.6846
Epoch 5/5, Loss: 82829.3236
