<a href="https://colab.research.google.com/github/syedmahmoodiagents/NLP/blob/main/Semantic_Analysis_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Optional one-time setup for a fresh environment (e.g. Colab); uncomment to run.
# The spaCy model downloaded here is the one loaded later with spacy.load("en_core_web_md").
# !python -m spacy download en_core_web_md --q
# NOTE(review): gensim is not imported by any cell visible in this notebook — confirm it is still needed.
# !pip install gensim --q

In [None]:
import spacy
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [None]:
# Toy corpus: four pre-tokenized sentences, lowercased token by token so that
# e.g. "Machine" and "machine" end up as the same vocabulary entry.
sentences = [
    [token.lower() for token in tokens]
    for tokens in (
        ["I", "love", "natural", "language", "processing"],
        ["You", "can", "use", "Word2Vec", "for", "word", "embeddings"],
        ["Machine", "learning", "is", "fun"],
        ["Deep", "learning", "is", "a", "subset", "of", "machine", "learning"],
    )
]

# One integer class label per sentence, aligned by position with `sentences`.
labels = [1, 0, 1, 0]

In [None]:
# Distinct tokens in alphabetical order, so the id assignment is deterministic.
vocab_words = sorted({token for tokens in sentences for token in tokens})

# Zero-based position of every word in the sorted list.
old_vocab = dict(zip(vocab_words, range(len(vocab_words))))

# Shift all ids up by one so that no word maps to index 0
# (collate_batch pads with 0 and the model uses padding_idx=0).
vocab = {word: position + 1 for word, position in old_vocab.items()}

print("Vocab:", vocab)

Vocab: {'a': 1, 'can': 2, 'deep': 3, 'embeddings': 4, 'for': 5, 'fun': 6, 'i': 7, 'is': 8, 'language': 9, 'learning': 10, 'love': 11, 'machine': 12, 'natural': 13, 'of': 14, 'processing': 15, 'subset': 16, 'use': 17, 'word': 18, 'word2vec': 19, 'you': 20}


In [None]:
# Pretrained spaCy pipeline that supplies the word vectors
# (the original note on this line states they are 300-dimensional).
nlp = spacy.load("en_core_web_md")

# Number of distinct words; ids run from 1 to vocab_size, index 0 is not a word.
vocab_size = len(vocab)

# Width of a single word vector, read from the loaded pipeline.
embed_dim = nlp.vocab.vectors_length

In [None]:
# Build the (vocab_size + 1, embed_dim) lookup table: row i holds the vector of
# the word whose id is i. Row 0 stays all-zero because word ids start at 1.
embedding_array = np.zeros((vocab_size + 1, embed_dim))

# Seeded generator so the rows for words without a pretrained vector are
# identical on every Restart & Run All.
oov_rng = np.random.default_rng(0)

for word, idx in vocab.items():
    # Query the vector table directly. `word in nlp.vocab` only reports whether
    # a lexeme entry exists for the string, which is not the same thing as a
    # pretrained vector being available: a known word can be reported missing,
    # and a lexeme without a vector would silently yield an all-zero row.
    if nlp.vocab.has_vector(word):
        embedding_array[idx] = nlp.vocab.get_vector(word)
    else:
        # Small random vector for words the pretrained table does not cover.
        embedding_array[idx] = oov_rng.standard_normal(embed_dim) * 0.1

# float32 matches the default dtype of the nn.Embedding weights it is copied into.
embedding_matrix = torch.tensor(embedding_array, dtype=torch.float32)

In [None]:
# Replace every token by its integer id; raises KeyError for a token that is
# not in `vocab` (cannot happen here, since `vocab` was built from `sentences`).
indexed_sentences = [
    [vocab[token] for token in tokens]
    for tokens in sentences
]

In [None]:
class TextDataset(Dataset):
    """Map-style dataset pairing token-id sequences with class labels.

    Args:
        sequences: indexable collection; item i is the list of token ids of
            example i.
        labels: indexable collection; item i is the label of example i.
    """

    def __init__(self, sequences, labels):
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        """Return the number of examples (taken from `sequences`)."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return example `idx` as a `(token_ids, label)` pair of int64 tensors."""
        token_ids = torch.tensor(self.sequences[idx], dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return token_ids, target

In [None]:
def collate_batch(batch):
    """Merge `(sequence, label)` examples into one padded mini-batch.

    Args:
        batch: non-empty iterable of `(sequence, label)` pairs, where each
            sequence is a 1-D tensor of token ids.

    Returns:
        A `(padded, lengths, labels)` tuple: the sequences right-padded with 0
        to the longest one (batch dimension first), the original length of
        every sequence, and the labels gathered into a single tensor.
    """
    token_seqs, targets = zip(*batch)
    # Record the true lengths before padding hides them.
    seq_lens = torch.tensor(list(map(len, token_seqs)))
    padded_ids = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    return padded_ids, seq_lens, torch.tensor(targets)

In [None]:
# Wrap the indexed corpus and serve it in shuffled mini-batches of two;
# collate_batch pads each batch to its longest sequence.
dataset = TextDataset(sequences=indexed_sentences, labels=labels)
loader = DataLoader(
    dataset=dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_batch,
)

In [None]:
class LSTMClassifier(nn.Module):
    """Sequence classifier: frozen pretrained embeddings -> LSTM -> linear head.

    Args:
        emb_matrix: 2-D tensor of shape `(num_embeddings, embedding_dim)` whose
            rows are copied into the embedding layer. Row 0 is the padding row.
        hidden_dim: size of the LSTM hidden state.
        num_classes: number of output logits per sequence.
    """

    def __init__(self, emb_matrix, hidden_dim=128, num_classes=2):
        super().__init__()

        num_rows, vector_dim = emb_matrix.shape

        # Index 0 is reserved for padding.
        self.embedding = nn.Embedding(num_rows, vector_dim, padding_idx=0)

        # Load the pretrained vectors and keep them fixed during training.
        with torch.no_grad():
            self.embedding.weight.copy_(emb_matrix)
        self.embedding.weight.requires_grad = False

        self.lstm = nn.LSTM(vector_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, padded, lengths):
        """Return class logits for a padded batch.

        Args:
            padded: token-id tensor with the batch dimension first.
            lengths: tensor with the unpadded length of every sequence.

        Returns:
            Tensor with one row of `num_classes` logits per sequence.
        """
        vectors = self.embedding(padded)
        # Packing makes the LSTM stop at each sequence's true end, so padded
        # positions never reach the recurrent state. Lengths must be on the CPU.
        packed = pack_padded_sequence(
            vectors, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (last_hidden, _) = self.lstm(packed)
        # last_hidden[-1] is the final hidden state of the last LSTM layer.
        return self.fc(last_hidden[-1])

In [None]:
# Classifier on top of the pretrained embedding table, trained with
# cross-entropy on the class logits and Adam at a learning rate of 1e-3.
model = LSTMClassifier(emb_matrix=embedding_matrix)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)


In [None]:
# Select training mode explicitly: the inference cell calls model.eval(), so
# re-running this cell afterwards would otherwise keep training in eval mode.
model.train()

for epoch in range(10):
    epoch_loss = 0.0
    num_batches = 0

    # The label batch is named `batch_labels` so the loop does not overwrite
    # the module-level `labels` list that the dataset cell reads.
    for padded, lengths, batch_labels in loader:
        optimizer.zero_grad()
        logits = model(padded, lengths)
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean over all mini-batches of the epoch rather than only the
    # last (randomly drawn) batch; NaN signals that the loader yielded nothing.
    mean_loss = epoch_loss / num_batches if num_batches else float("nan")
    print(f"Epoch {epoch+1} Loss: {mean_loss:.4f}")

Epoch 1 Loss: 0.7059
Epoch 2 Loss: 0.6578
Epoch 3 Loss: 0.6368
Epoch 4 Loss: 0.6255
Epoch 5 Loss: 0.5799
Epoch 6 Loss: 0.5457
Epoch 7 Loss: 0.4439
Epoch 8 Loss: 0.3821
Epoch 9 Loss: 0.3903
Epoch 10 Loss: 0.2496


In [None]:
# Switch the model to inference mode before predicting.
model.eval()

test_sentence = ["machine", "learning", "is", "cool"]

# Words missing from `vocab` fall back to index 0, which is also the padding
# index of the embedding layer. The nested list gives the batch dimension of 1.
test_ids = torch.tensor([[vocab.get(token, 0) for token in test_sentence]])
test_len = torch.tensor([len(test_sentence)])

In [None]:
# Gradients are not needed for a forward-only prediction.
with torch.no_grad():
    logits = model(test_ids, test_len)
    # Index of the largest logit of the single sentence in the batch.
    pred = logits.argmax(dim=1).item()

print("\nPredicted class:", pred)


Predicted class: 1
