In [83]:
from collections import Counter

# Sample corpus and tokenization
corpus = [
    "hello world program",
    "hello world",
    "programming in python",
    "hello programming",
    "world of programming"
]


In [84]:
def calculate_frequencies_and_vocab(corpus, n=3, word_freq_threshold=3, subword_freq_threshold=4):
    word_counter = Counter()
    subword_counter = Counter()
    vocab = {}
    idx = 1

    for text in corpus:
        words = text.split()
        word_counter.update(words)
        for word in words:
            for i in range(len(word)):
                for j in range(i + n , len(word) + 1):
                    subword = word[i:j]
                    subword_counter[subword] += 1
    
    # Add words to vocab
    for word in word_counter:
        if word_counter[word] >= word_freq_threshold:
            vocab[word] = idx
            idx += 1

    # Add subwords to vocab
    for subword in subword_counter:
        if subword_counter[subword] >= subword_freq_threshold:
            if subword not in vocab:
                vocab[subword] = idx
                idx += 1
                
    return word_counter, subword_counter, vocab

word_counter, subword_counter, vocab = calculate_frequencies_and_vocab(corpus)

print(vocab)

{'hello': 1, 'world': 2, 'programming': 3, 'pro': 4, 'prog': 5, 'progr': 6, 'progra': 7, 'program': 8, 'rog': 9, 'rogr': 10, 'rogra': 11, 'rogram': 12, 'ogr': 13, 'ogra': 14, 'ogram': 15, 'gra': 16, 'gram': 17, 'ram': 18}


In [85]:
def tokenization_based_on_vocab(text, vocab, n=3):
    """
    Returns:
    - List[str]: A list of tokens (whole words or subwords) present in the vocabulary.
    """
    words = text.split()
    tokens = []

    for word in words:
        if word in vocab:
            # If the whole word is in the vocabulary, keep it
            tokens.append(word)
        else:
            # Break the word into subwords and keep those that are in the vocabulary
            subwords = []
            word_length = len(word)
            for i in range(word_length):
                for j in range(i + 1, min(i + n + 1, word_length + 1)):
                    subword = word[i:j]
                    if subword in vocab:
                        subwords.append(subword)
            
            if subwords:
                # If any meaningful subwords are found, add them
                tokens.extend(subwords)
            else:
                # If no meaningful subwords are found, keep the whole word (fallback)
                tokens.append(word)

    return tokens

In [86]:
text = "hello world program roggg rogrr"
tokens = tokenization_based_on_vocab(
    text,
    vocab,
    n=3
)
tokens

['hello', 'world', 'program', 'rog', 'rog', 'ogr']

In [87]:


def numericalize(tokens, vocab):
    """Convert tokens to numerical format using the vocabulary."""
    return [vocab[token] for token in tokens if token in vocab]

# Tokenize and convert to numerical format
tokenized_texts = [tokenization_based_on_vocab(text, vocab,3) for text in corpus]
numerical_data = [numericalize(tokens, vocab) for tokens in tokenized_texts]



In [88]:
numerical_data

[[1, 2, 8], [1, 2], [3], [1, 3], [2, 3]]

In [89]:
def pad_sequence(seq, max_len, pad_idx):
    """Pad sequences to ensure they have the same length."""
    return seq + [pad_idx] * (max_len - len(seq))

max_len = max(len(x) for x in numerical_data)
padded_data = [pad_sequence(x, max_len, 0) for x in numerical_data]

In [90]:
padded_data

[[1, 2, 8], [1, 2, 0], [3, 0, 0], [1, 3, 0], [2, 3, 0]]

In [91]:
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [92]:
X_tensor = torch.tensor(padded_data)
y_tensor = torch.tensor([0, 0, 1, 1, 1])  # Example labels

# Create a DataLoader
dataset = TensorDataset(X_tensor, y_tensor)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [93]:
print(dataset)

<torch.utils.data.dataset.TensorDataset object at 0x111b4f730>


In [94]:
class ClassificationModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super(ClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)  # [batch size, sent_len, emb_dim]
        pooled = embedded.mean(dim=1)    # Average pooling
        return self.fc(pooled)

# Model hyperparameters
vocab_size = len(vocab)
embedding_dim = 50
output_dim = 2  # For binary classification
pad_idx = 0  # Padding index set to the last index

# Initialize the model
model = ClassificationModel(vocab_size, embedding_dim, output_dim, pad_idx)

# Step 6: Train the model

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    epoch_loss = 0
    model.train()
    
    for batch in data_loader:
        text, labels = batch
        
        # Zero gradients
        optimizer.zero_grad()
        
        # Forward pass
        predictions = model(text)
        
        # Compute loss
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        
        # Update loss
        epoch_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {epoch_loss/len(data_loader)}")

Epoch 1, Loss: 0.594802975654602
Epoch 2, Loss: 0.5171931783358256
Epoch 3, Loss: 0.5002211729685465
Epoch 4, Loss: 0.4825428823630015
Epoch 5, Loss: 0.49384214480717975
Epoch 6, Loss: 0.532062957684199
Epoch 7, Loss: 0.466621736685435
Epoch 8, Loss: 0.4489479462305705
Epoch 9, Loss: 0.4357536832491557
Epoch 10, Loss: 0.4169475535551707


In [78]:
model

ClassificationModel(
  (embedding): Embedding(19, 50, padding_idx=0)
  (fc): Linear(in_features=50, out_features=2, bias=True)
)

In [95]:
test_corpus = [
    "hello world",
    "python programming",
    "new unseen text",
    "programming world"
]

# Tokenize the test data using the same vocabulary and function
test_tokenized_texts = [tokenization_based_on_vocab(text, vocab) for text in test_corpus]
test_numerical_data = [numericalize(tokens, vocab) for tokens in test_tokenized_texts]

# Pad sequences in the test data
test_max_len = max(len(x) for x in test_numerical_data)
test_padded_data = [pad_sequence(x, test_max_len, len(vocab)) for x in test_numerical_data]

# Convert to PyTorch tensors
X_test_tensor = torch.tensor(test_padded_data)
y_test_tensor = torch.tensor([0, 1, 0, 1])  # Ex

In [96]:
def evaluate_model_no_batch(model, X_test_tensor, y_test_tensor, criterion):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0  # Initialize total loss
    correct_predictions = 0  # Initialize correct predictions count
    total_predictions = 0  # Initialize total predictions count

    with torch.no_grad():  # Disable gradient computation
        # Forward pass on the entire test data
        predictions = model(X_test_tensor)
        
        # Compute loss for the entire batch
        loss = criterion(predictions, y_test_tensor)
        total_loss += loss.item()

        # Get the predicted classes
        predicted_labels = torch.argmax(predictions, dim=1)

        # Count correct predictions
        correct_predictions = (predicted_labels == y_test_tensor).sum().item()
        total_predictions = y_test_tensor.size(0)

    # Compute average loss and accuracy
    avg_loss = total_loss
    accuracy = correct_predictions / total_predictions

    return avg_loss, accuracy


In [97]:
test_loss, test_accuracy = evaluate_model(model, test_data_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


Test Loss: 0.5434, Test Accuracy: 0.7500
