In [1]:
"""
Neural Probabilistic Language Model, as described in Bengio et al. (2003)
"""
import torch
import torch.nn as nn

class NPLM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, context_size):
        super(NPLM, self).__init__()
        # Embedding layer
        print(f'vocab_size: {vocab_size}')
        print(f'embed_size: {embed_size}')
        self.embeddings = nn.Embedding(vocab_size, embed_size)
        print(f'embeddings: {self.embeddings}')
        # Hidden layer
        self.linear1 = nn.Linear(context_size * embed_size, hidden_size)
        # Output layer
        self.linear2 = nn.Linear(hidden_size, vocab_size)
        # Activation functions
        self.tanh = nn.Tanh()
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x):
        # Look up the embeddings for each context word and concatenate them
        x = self.embeddings(x)
        # Reshape the input to concatenate the embeddings
        x = x.view(x.size(0), -1)
        # Pass the concatenated embeddings through the first linear layer
        x = self.linear1(x)
        # Apply tanh activation function to introduce non-linearity
        x = self.tanh(x)
        # Pass the output through the second linear layer to get the scores for each word in the vocabulary
        x = self.linear2(x)
        # Apply softmax to get log-probabilities
        x = self.softmax(x)
        return x


## EXAMPLE USAGE:
## Assuming `model_params` is a dictionary containing the model hyperparameters
# model = NPLM(**model_params)
# example_input = torch.tensor([[1, 2, 3, 4], [4, 3, 2, 1]], dtype=torch.long)
# log_probs = model(example_input)
# print(f'Output shape: {log_probs.shape}\n')
# print(f'Output: {log_probs}')

In [10]:
import load_arxiv_data
import torch.optim as optim
from torch.utils.data import DataLoader

# Load the dataset
data = load_arxiv_data.load_arxiv_data(sample=True, columns=['title'])

In [3]:
len(data)

200000

In [11]:
data.head(10)

Unnamed: 0,title
0,Double bit in-plane magnetic skyrmions on a track
1,DDM: A Demand-based Dynamic Mitigation for SMT...
2,Some new results in O(a) improved lattice QCD
3,The behavior of magnetic ordering and the KOnd...
4,The deepest X-ray view of high-redshift galaxi...
5,Decomposed Linear Dynamical Systems (dLDS) for...
6,The Magnetic Structure of Light Nuclei from La...
7,Beyond NGS data sharing and towards open science
8,Precipitation of Energetic Neutral Atoms and I...
9,Orthros: Non-autoregressive End-to-end Speech ...


In [12]:
titles = data['title'].tolist()

# Create a vocabulary
vocab = set([word for title in titles for word in title])
vocab_size = len(vocab)

# Create a mapping from words to indices
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Convert the titles to sequences of word indices
titles = [[word_to_idx[word] for word in title] for title in titles]

In [60]:
# ArXivDataset class
from torch.utils.data import Dataset

class ArXivDataset(Dataset):
    pass

def collate_fn(batch):
    pass

training_data = ArXivDataset(titles, context_size=2)
data_loader = DataLoader(training_data, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [64]:
# Hyperparameters
model_params = {
    'vocab_size': len(vocab),  # Vocabulary size
    'embed_size': 300,  # Dimension of word embeddings
    'context_size': 2,  # Number of context words to consider
    'hidden_size': 128  # Number of hidden units
}

In [65]:
# Initialize the model
model = NPLM(**model_params)

vocab_size: 97
embed_size: 300
embeddings: Embedding(97, 300)
