In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import re
from collections import Counter
import random
import numpy as np
from itertools import islice
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from pathlib import Path
from torch.utils.data import Dataset

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Puts cuDNN into deterministic mode with benchmark auto-tuning disabled,
    then seeds PyTorch, Python's ``random`` module and NumPy's global
    generator with the same value.

    Args:
        seed: Integer seed shared by all three libraries. Defaults to 42.
    """
    # cuDNN flags are independent of the seeds, so set them up front.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)


set_seed(42)

In [2]:

class NewsGroupDataset(Dataset):
    """Text-classification dataset backed by a directory of per-class folders.

    Expected layout is ``root_dir/<class_name>/<document_file>``: every
    sub-directory of ``root_dir`` is one class and every regular file inside
    it is one sample. Documents are read lazily, on item access.

    Args:
        root_dir: Directory that contains one sub-directory per class.
        transform: Optional callable applied to the raw document text.
        tokenizer: Optional callable applied after ``transform``.
    """

    def __init__(self, root_dir, transform=None, tokenizer=None):
        self.root_dir = Path(root_dir)
        self.transform = transform
        self.tokenizer = tokenizer

        # Class names in alphabetical order; the position is the integer label.
        self.classes = sorted(d.name for d in self.root_dir.iterdir() if d.is_dir())
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}

        # List of (file path, label) pairs. ``iterdir()`` yields entries in an
        # arbitrary, filesystem-dependent order, so the files are sorted to
        # make sample indices -- and therefore any seeded shuffle or split of
        # this dataset -- reproducible from one machine to the next.
        self.samples = [
            (file, label)
            for cls_name, label in self.class_to_idx.items()
            for file in sorted((self.root_dir / cls_name).iterdir())
            if file.is_file()
        ]

    def __len__(self):
        """Return the number of documents found under ``root_dir``."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Read document ``idx`` and return ``(text, label)``.

        ``text`` is the file content after the optional ``transform`` and
        ``tokenizer`` have been applied (in that order); ``label`` is the
        integer class index.
        """
        filepath, label = self.samples[idx]
        # latin1 maps every byte to a character, so decoding cannot fail.
        text = filepath.read_text(encoding='latin1')

        if self.transform:
            text = self.transform(text)
        if self.tokenizer:
            text = self.tokenizer(text)

        return text, label

# Saatnya Word Embeddings beraksi

In [3]:

class RNNClassifier(nn.Module):
    """Single-layer GRU classifier over batches of token ids.

    Token ids are embedded, run through a GRU, and the GRU state at each
    sequence's last real (non-padding) token is projected to class logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the GRU hidden state.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        # Index 0 is reserved for padding; its embedding stays a zero vector.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of padded token-id sequences.

        Args:
            x: Long tensor of shape ``(batch, seq_len)``. Sequences are
                assumed to be right-padded with the embedding's padding
                index.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised logits.
        """
        # Count real tokens per row. Using the GRU's final hidden state would
        # also fold in the trailing padding steps, making the prediction for a
        # document depend on how long the other documents in its batch are.
        lengths = (x != self.embedding.padding_idx).sum(dim=1)
        # Rows that are entirely padding fall back to time step 0.
        last_step = (lengths - 1).clamp(min=0)

        output, _ = self.rnn(self.embedding(x))

        # For a single-layer, unidirectional GRU, output[:, t] is the hidden
        # state after step t, so this picks the state at the last real token.
        batch_idx = torch.arange(x.size(0), device=x.device)
        return self.fc(output[batch_idx, last_step])

In [4]:
def collate_fn(batch):
    """Turn a list of ``(tokens, label)`` samples into padded batch tensors.

    Each token is mapped to its index through the module-level ``vocab``;
    the resulting id sequences are padded to the longest one in the batch
    using the index of ``'<pad>'``.

    Args:
        batch: Iterable of ``(tokens, label)`` pairs, where ``tokens`` is a
            sequence of strings and ``label`` is an integer.

    Returns:
        Tuple ``(padded, labels)``: a ``(batch, max_len)`` long tensor of
        token ids and a 1-D tensor holding the labels.
    """
    pad_id = vocab['<pad>']

    sequences = []
    targets = []
    for tokens, label in batch:
        ids = [vocab[tok] for tok in tokens]
        sequences.append(torch.tensor(ids, dtype=torch.long))
        targets.append(label)

    padded = pad_sequence(sequences, batch_first=True, padding_value=pad_id)
    return padded, torch.tensor(targets)

# Membangun Dataset 20 Newsgroups

In [5]:
from torch.utils.data import random_split, DataLoader
from torchtext.vocab import build_vocab_from_iterator

# Each document is split on whitespace when it is read from disk.
dataset = NewsGroupDataset("data/20_newsgroups", tokenizer=str.split)


def yield_tokens(dataset):
    """Yield the token list of every sample in ``dataset``, dropping the labels."""
    yield from (tokens for tokens, _label in dataset)


# Build the vocabulary from every document in the dataset.
# NOTE(review): the model's padding_idx=0 relies on "<pad>" receiving index 0
# here -- confirm against build_vocab_from_iterator's handling of `specials`.
vocab = build_vocab_from_iterator(
    yield_tokens(dataset),
    specials=["<pad>", "<unk>"],
)
# Tokens that are not in the vocabulary are looked up as "<unk>".
vocab.set_default_index(vocab["<unk>"])

# Load Pre-Trained GloVe Word Embedding

# Preparing Word Embedding

In [6]:
# Split into train/test (80% / 20%).
train_len = int(len(dataset) * 0.8)
test_len = len(dataset) - train_len

# Draw the split from a dedicated, explicitly seeded generator instead of the
# global RNG. Otherwise re-running this cell (for example after training has
# consumed random numbers) produces a different partition, and samples the
# model was trained on can end up in the test set.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_len, test_len], generator=split_generator
)
assert len(train_dataset) + len(test_dataset) == len(dataset)

# Only the training data is shuffled; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)


# Let's Train !!!

In [None]:
from torch.optim import Adam

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = RNNClassifier(
    vocab_size=len(vocab),
    embed_dim=128,
    hidden_dim=256,
    # Derived from the dataset instead of hard-coding 20, so the output layer
    # always matches the label range produced by the class folders on disk.
    num_classes=len(dataset.classes),
).to(device)


criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-3)

for epoch in range(5):
    model.train()
    total_loss = 0.0
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # .item() detaches the scalar so the graph is not kept alive.
        total_loss += loss.item()

    # Mean loss per batch; max(..., 1) avoids dividing by zero on an empty loader.
    print(f"Epoch {epoch+1}, Loss: {total_loss / max(len(train_loader), 1):.4f}")
