In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
df = pd.read_csv('IMDB_dataset.csv')
y=[1 if label=='positive' else 0 for label in df['sentiment']]
x=df["review"].tolist()
lengths = [len(sentence.split()) for sentence in x]
max_len = int(np.percentile(lengths, 95))
# torch.save((encoded_sentences, labels), "dataset.pt")
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    return text.split()
word2id = {"<pad>": 0, "<unk>": 1}
tokenized_text = [tokenize(t) for t in x]
x_train,x_test,y_train,y_test=train_test_split(tokenized_text,y,test_size=0.2,random_state=42)
for sentence in tokenized_text:
    for word in sentence:
        if word not in word2id:
            word2id[word] = len(word2id)

def load_glove_embeddings(filepath="glove.6B.100d.txt", cache_path="glove_embeddings.pt"):
    if os.path.exists(cache_path):
        print("Loading cached glove embeddings...")
        embeddings = torch.load(cache_path)
        return embeddings

    embeddings = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vector = torch.tensor([float(x) for x in values[1:]], dtype=torch.float)
            embeddings[word] = vector

    torch.save(embeddings, cache_path)
    print("Glove embeddings loaded and cached.")
    return embeddings
glove = load_glove_embeddings()
embedding_dim = 100
vocab_size = len(word2id)
embedding_matrix = torch.zeros(vocab_size,embedding_dim)
for word, id in word2id.items():
    embedding = glove.get(word)
    if embedding is not None:
        embedding_matrix[id] = embedding
    else:
        embedding_matrix[id] = torch.randn(embedding_dim) * 0.01
print("Embeddings created")
class IMDBdataset(Dataset):
    def __init__(self,tokenized_sentences,labels,word2id):
        self.data = tokenized_sentences
        self.labels = labels
        self.word2id = word2id
    def __len__(self):
        return len(self.data)
    def encode(self,sentence):
        ids = [self.word2id.get(token,self.word2id["<unk>"]) for token in sentence]
        return torch.tensor(ids,dtype=torch.long)
    def __getitem__(self,idx):
        return self.encode(self.data[idx]), torch.tensor(self.labels[idx])
def collate_fn(batch):
    texts, labels = zip(*batch)
    padded = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded, torch.tensor(labels)
train_dataset = IMDBdataset(x_train, y_train, word2id)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn,num_workers=2)
test_dataset = IMDBdataset(x_test, y_test, word2id)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn, num_workers=2)
class GRUClassifier(nn.Module):
    def __init__(self,vocab,embedding_dim,hidden_dim,output_dim,embedding_matrix):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix,freeze=False)
        self.gru = nn.GRU(embedding_dim,hidden_dim,batch_first=True)
        self.fc = nn.Linear(hidden_dim,output_dim)
    def forward(self,x):
        embedded = self.embedding(x)
        output,h = self.gru(embedded)
        h = h.squeeze(0)
        return self.fc(h)
model = GRUClassifier(vocab_size, embedding_dim, hidden_dim=128, output_dim=2, embedding_matrix=embedding_matrix)
model.to(device)
loss_f = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
def validate(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            logits = model(x_batch)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())

    return accuracy_score(all_labels, all_preds)
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    total_loss=0
    for x_batch,y_batch in train_dataloader:
        x,y=x_batch.to(device),y_batch.to(device)
        optimizer.zero_grad()
        logits = model(x)
        loss = loss_f(logits,y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    val_acc = validate(model,test_dataloader,device)
    print(f"EPOCH {epoch+1} - total loss {total_loss}, validation accuracy: {val_acc}")

EPOCH 1 - total loss 866.2888071537018, validation accuracy: 0.5099


KeyboardInterrupt: 