<a href="https://colab.research.google.com/github/syedmahmoodiagents/NLP/blob/main/NextWord.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !python -m spacy download en_core_web_md --q
# !pip install gensim --q

In [None]:
import spacy
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [None]:
# Toy corpus: every sentence is already tokenised; tokens are lower-cased on the way in.
sentences = [
    [token.lower() for token in sentence]
    for sentence in (
        ["I", "love", "natural", "language", "processing"],
        ["You", "can", "use", "Word2Vec", "for", "word", "embeddings"],
        ["Machine", "learning", "is", "fun"],
        ["Deep", "learning", "is", "a", "subset", "of", "machine", "learning"],
    )
]

In [None]:
# Vocabulary: word ids run 1..N in sorted-word order, leaving 0 free for padding.
vocab_words = sorted({token for sentence in sentences for token in sentence})
old_vocab = dict(zip(vocab_words, range(len(vocab_words))))                  # zero-based ids
vocab = {token: zero_based + 1 for token, zero_based in old_vocab.items()}   # one-based ids

# Inverse lookup: id -> word
idx2word = {token_id: token for token, token_id in vocab.items()}

In [None]:
# Pretrained spaCy pipeline; its vector table supplies the word embeddings used below.
nlp = spacy.load("en_core_web_md")
vocab_size = len(vocab)                 # number of distinct words in the corpus
embed_dim = nlp.vocab.vectors_length    # width of one spaCy word vector

In [None]:
# One row per id: row i is the spaCy vector of the word whose id is i.
# The extra row (index 0) is never written and therefore stays all-zero.
embedding_array = np.zeros((vocab_size + 1, embed_dim))
for token, row in vocab.items():
    embedding_array[row] = nlp.vocab[token].vector

# Hand the table to PyTorch as float32.
embedding_matrix = torch.tensor(embedding_array, dtype=torch.float32)

In [None]:
# Every proper prefix of a sentence becomes one sample; its label is the word
# that immediately follows the prefix.
train_X, train_y = [], []

for sentence in sentences:
    token_ids = [vocab[token] for token in sentence]
    for cut, next_id in enumerate(token_ids[1:], start=1):
        train_X.append(token_ids[:cut])    # ids before position `cut`
        train_y.append(next_id)            # id at position `cut`

In [None]:
# Dataset
class NextWordDataset(Dataset):
    """Index-aligned (sequence, target) samples served as long tensors.

    In this notebook `X` holds prefixes as lists of word ids and `y` holds the
    id of the word following each prefix.
    """

    def __init__(self, X, y):
        # Stored as given; conversion to tensors happens lazily per item.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from `X`."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return `(X[idx], y[idx])`, each converted to a `torch.long` tensor."""
        prefix_ids, next_id = self.X[idx], self.y[idx]
        return (
            torch.tensor(prefix_ids, dtype=torch.long),
            torch.tensor(next_id, dtype=torch.long),
        )

In [None]:
def collate_fn(batch):
    """Merge `(sequence, target)` samples into one batch.

    Sequences are right-padded with 0 to the longest sequence in the batch and
    returned batch-first; the scalar targets are combined into a 1-D tensor.
    """
    sequences = [sample[0] for sample in batch]
    labels = [sample[1] for sample in batch]
    return (
        pad_sequence(sequences, batch_first=True, padding_value=0),
        torch.stack(labels),
    )

# Wrap the (prefix, next-word) pairs and serve them in shuffled, padded mini-batches.
dataset = NextWordDataset(X=train_X, y=train_y)
loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# LSTM Language Model
class NextWordLSTM(nn.Module):
    """Next-word classifier: frozen pretrained embeddings -> LSTM -> linear layer.

    Args:
        emb_matrix: 2-D float tensor of shape (vocab_size, embed_dim). Row 0 is
            used as the padding row (`padding_idx=0`).
        hidden_dim: size of the LSTM hidden state.
    """

    def __init__(self, emb_matrix, hidden_dim=128):
        super().__init__()
        vocab_size, embed_dim = emb_matrix.shape

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Load the pretrained vectors and keep them fixed during training.
        with torch.no_grad():
            self.embedding.weight.copy_(emb_matrix)
        self.embedding.weight.requires_grad = False

        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, lengths=None):
        """Return next-word logits of shape (batch, vocab_size).

        Args:
            x: long tensor of shape (batch, seq_len) holding word ids,
                right-padded with the padding id (0).
            lengths: optional per-row count of real tokens. When omitted it is
                derived from `x` as the position of the last non-padding id,
                so trailing padding ids are ignored while padding ids in the
                middle of a row still count as timesteps.

        The logits are computed from the LSTM output at each row's last real
        token. Reading `out[:, -1]` instead would, for every row shorter than
        the batch maximum, use a state that has already consumed padding
        vectors, so the same prefix would be scored differently depending on
        what it happens to be batched with.
        """
        emb = self.embedding(x)
        out, _ = self.lstm(emb)

        if lengths is None:
            # 1-based position of the last non-padding id in every row
            # (0 when the whole row is padding).
            positions = torch.arange(1, x.size(1) + 1, device=x.device)
            is_token = x != self.embedding.padding_idx
            lengths = (is_token * positions).max(dim=1).values
        # An all-padding row falls back to its first timestep instead of
        # producing the invalid index -1.
        lengths = torch.as_tensor(lengths, device=x.device).clamp(min=1)

        rows = torch.arange(x.size(0), device=x.device)
        last_hidden = out[rows, lengths - 1]  # output at the last real token
        logits = self.fc(last_hidden)
        return logits

In [None]:
# Fix the global RNG so weight initialisation and DataLoader shuffling are
# repeatable under Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

model = NextWordLSTM(embedding_matrix)
criterion = nn.CrossEntropyLoss()
# The embedding table is frozen, so only hand the optimizer the weights that
# actually receive gradients.
optimizer = torch.optim.Adam(
    [p for p in model.parameters() if p.requires_grad],
    lr=1e-3,
)

In [None]:
NUM_EPOCHS = 20

for epoch in range(NUM_EPOCHS):
    # predict_next_word() switches the model to eval mode; switch back so this
    # cell behaves the same when it is re-run after a prediction.
    model.train()
    loss_sum, n_samples = 0.0, 0

    for batch_x, batch_y in loader:
        optimizer.zero_grad()
        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # (possibly smaller) final batch does not skew the epoch average.
        loss_sum += loss.item() * batch_y.size(0)
        n_samples += batch_y.size(0)

    # Report the mean loss over the whole epoch rather than whichever
    # mini-batch happened to come last; max() guards against an empty loader.
    print(f"Epoch {epoch+1} Loss: {loss_sum / max(n_samples, 1):.4f}")

Epoch 1 Loss: 3.0613
Epoch 2 Loss: 2.9622
Epoch 3 Loss: 2.8323
Epoch 4 Loss: 2.8853
Epoch 5 Loss: 2.8102
Epoch 6 Loss: 2.7702
Epoch 7 Loss: 2.3576
Epoch 8 Loss: 2.1370
Epoch 9 Loss: 2.2021
Epoch 10 Loss: 2.1496
Epoch 11 Loss: 1.6133
Epoch 12 Loss: 1.8841
Epoch 13 Loss: 1.7137
Epoch 14 Loss: 0.9329
Epoch 15 Loss: 1.1575
Epoch 16 Loss: 1.3580
Epoch 17 Loss: 0.6462
Epoch 18 Loss: 0.4881
Epoch 19 Loss: 0.7325
Epoch 20 Loss: 0.2913


In [None]:
def predict_next_word(model, prefix_words):
    """Return the most likely word to follow `prefix_words`.

    Args:
        model: network mapping a (1, seq_len) tensor of word ids to logits of
            shape (1, n_classes), where class 0 is the padding id.
        prefix_words: sequence of words; each is lower-cased and looked up in
            the notebook-level `vocab`. Unknown words are encoded as id 0.

    Returns:
        The predicted word, or "<unk>" when `prefix_words` is empty or the
        predicted id has no entry in `idx2word`.

    Side effect: the model is left in eval mode.
    """
    # An empty prefix would build a float tensor of shape (1, 0), which the
    # embedding lookup rejects; there is nothing to condition on anyway.
    if not prefix_words:
        return "<unk>"

    model.eval()
    with torch.no_grad():
        ids = [vocab.get(w.lower(), 0) for w in prefix_words]
        # Build the input on the same device as the model's weights.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else None
        x = torch.tensor(ids, dtype=torch.long, device=device).unsqueeze(0)

        logits = model(x)
        # Id 0 is padding, never a real word: exclude it from the argmax.
        logits[:, 0] = float("-inf")
        next_id = torch.argmax(logits, dim=1).item()
        return idx2word.get(next_id, "<unk>")

In [None]:
# Test it: query the trained model with a few prefixes taken from the corpus.
for prefix in (["machine", "learning", "is"], ["i", "love"]):
    print(f"\nNext word after: {prefix} →")
    print("Prediction:", predict_next_word(model, prefix))



Next word after: ['machine', 'learning', 'is'] →
Prediction: fun

Next word after: ['i', 'love'] →
Prediction: natural
