# Lecture 3: Language Modeling Fundamentals

Lecture 3 | CMU ANLP Spring 2025 | Instructor: Sean Welleck

#### Part 2: Feedforward neural language model

This is a notebook for [CMU CS11-711 Advanced NLP](https://cmu-l3.github.io/anlp-spring2025/) that trains a feedforward neural language model, i.e., one based on [Bengio et al. (2003), A Neural Probabilistic Language Model](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf).

In [1]:
# Read the corpus: one name per line. The context manager guarantees the
# file handle is closed as soon as the contents have been read.
with open('names.txt') as names_file:
    data = names_file.read().splitlines()
data[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [2]:
# Character-level vocabulary: the 26 lowercase letters get indices 0-25 and
# the boundary marker '[S]' takes the next free index (26).
alphabet = 'abcdefghijklmnopqrstuvwxyz'
token_to_index = dict(zip(alphabet, range(len(alphabet))))
token_to_index['[S]'] = len(alphabet)

# Inverse lookup (index -> token), used when decoding sampled indices.
index_to_token = dict(zip(token_to_index.values(), token_to_index.keys()))

#### Build the dataset

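Our dataset consists of $(x, y)$ pairs, where $x$ is an $(n-1)$-token context (`context_size` tokens in the code below) and $y$ is the token that follows it. Each name is left-padded with `[S]` tokens and terminated by a final `[S]`.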

In [3]:
import torch

context_size = 5

def build_dataset(data):
    """Turn a list of names into (context, next-token) training pairs.

    Each name is read character by character with a trailing '[S]' marker
    appended. The context starts as `context_size` copies of the '[S]' index
    and slides one position to the right after every token.

    Reads the module-level `context_size` and `token_to_index`.

    Args:
        data: iterable of strings whose characters are all keys of
            `token_to_index`.

    Returns:
        (X, Y): `X` is an int64 tensor of shape (num_pairs, context_size)
        holding context indices; `Y` is an int64 tensor of shape (num_pairs,)
        holding the index of the token that follows each context. Empty
        input gives shapes (0, context_size) and (0,).

    Raises:
        KeyError: if a character is not in `token_to_index`.
    """
    boundary = token_to_index['[S]']
    contexts, targets = [], []
    for item in data:
        context = [boundary] * context_size
        for token in list(item) + ['[S]']:
            target = token_to_index[token]
            contexts.append(context)
            targets.append(target)
            # Build a new list (no in-place mutation) so the context that was
            # just appended is not altered by the slide.
            context = context[1:] + [target]

    # Explicit dtype and shape: without them an empty split would come back
    # as a 1-D float32 tensor, which nn.Embedding cannot index with.
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), context_size)
    Y = torch.tensor(targets, dtype=torch.long)
    return X, Y

# Split into train, dev, test
import random
# Fixed seed so the shuffle, and therefore the split, is the same on a fresh run.
random.seed(123)
random.shuffle(data)  # shuffles `data` in place

# 80% train / 10% dev / 10% test, split at the name level.
num_names = len(data)
n1, n2 = int(0.8 * num_names), int(0.9 * num_names)

train_names, dev_names, test_names = data[:n1], data[n1:n2], data[n2:]
X_train, Y_train = build_dataset(train_names)
X_dev, Y_dev = build_dataset(dev_names)
X_test, Y_test = build_dataset(test_names)

X_train.shape, Y_train.shape

(torch.Size([182427, 5]), torch.Size([182427]))

In [4]:
# Debug trace: print, for the first few names, the context window and target
# index at every step, mirroring what build_dataset does.
#
# This cell deliberately reuses the notebook-wide `context_size` instead of
# rebinding it, so the trace always matches the dataset and the model built
# from it.
num_traced_names = 9  # names that receive a complete token-by-token trace

# Slicing up front means no header is printed for a name whose tokens are
# never traced.
for item in data[:num_traced_names]:
    print("THE ITEM IS............",item)
    context = [token_to_index['[S]']] * context_size
    tokens = list(item) + ['[S]']
    print("BEFORE ...........",context,"   ",tokens)
    for token in tokens:
        print("THE TOKEN IS......",token)
        print(context)
        print(token_to_index[token])
        # Slide the window: drop the oldest index, append the current one.
        context = context[1:] + [token_to_index[token]]
        print("******************************")
            


THE ITEM IS............ luann
BEFORE ........... [26, 26, 26, 26, 26]     ['l', 'u', 'a', 'n', 'n', '[S]']
THE TOKEN IS...... l
[26, 26, 26, 26, 26]
11
******************************
THE TOKEN IS...... u
[26, 26, 26, 26, 11]
20
******************************
THE TOKEN IS...... a
[26, 26, 26, 11, 20]
0
******************************
THE TOKEN IS...... n
[26, 26, 11, 20, 0]
13
******************************
THE TOKEN IS...... n
[26, 11, 20, 0, 13]
13
******************************
THE TOKEN IS...... [S]
[11, 20, 0, 13, 13]
26
******************************
THE ITEM IS............ shain
BEFORE ........... [26, 26, 26, 26, 26]     ['s', 'h', 'a', 'i', 'n', '[S]']
THE TOKEN IS...... s
[26, 26, 26, 26, 26]
18
******************************
THE TOKEN IS...... h
[26, 26, 26, 26, 18]
7
******************************
THE TOKEN IS...... a
[26, 26, 26, 18, 7]
0
******************************
THE TOKEN IS...... i
[26, 26, 18, 7, 0]
8
******************************
THE TOKEN IS...... n
[26, 18, 7, 0

### Define the model

In [5]:
import torch.nn as nn

class MLPLM(nn.Module):
    """Feedforward language model over a fixed-size context window.

    Token indices are embedded, the embeddings of all context positions are
    concatenated, passed through one ReLU hidden layer, and projected to
    unnormalized scores (logits) over the vocabulary.

    Args:
        vocab_size: number of distinct tokens (rows of the embedding table
            and width of the output layer).
        context_size: number of context tokens per example.
        embedding_size: dimensionality of each token embedding.
        hidden_size: width of the hidden layer.
    """

    def __init__(self, vocab_size, context_size, embedding_size, hidden_size):
        super().__init__()
        concat_size = context_size * embedding_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.fc1 = nn.Linear(concat_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Map a (batch_size, context_size) index tensor to (batch_size, vocab_size) logits."""
        embedded = self.embedding(x)                    # (batch_size, context_size, embedding_size)
        flat = embedded.view(embedded.size(0), -1)      # (batch_size, context_size * embedding_size)
        hidden = torch.relu(self.fc1(flat))             # (batch_size, hidden_size)
        logits = self.fc2(hidden)                       # (batch_size, vocab_size)
        return logits


In [6]:
# Untrained model for a quick shape check: 64-dimensional embeddings and a
# 64-unit hidden layer.
model = MLPLM(
    vocab_size=len(token_to_index),
    context_size=context_size,
    embedding_size=64,
    hidden_size=64,
)

# The first two training contexts form a tiny batch.
x = X_train[:2]
x

tensor([[26, 26, 26, 26, 26],
        [26, 26, 26, 26, 11]])

In [8]:
# Call the module itself rather than `.forward()`: nn.Module.__call__ runs
# any registered hooks before and after dispatching to forward().
model(x)

tensor([[ 0.2176,  0.0954,  0.2361, -0.1520, -0.4687,  0.0696,  0.2823,  0.1976,
         -0.0093, -0.3868,  0.0382,  0.0110, -0.0343,  0.0773,  0.0686,  0.1706,
          0.0908, -0.1641,  0.0681,  0.0134,  0.1676, -0.0883, -0.3497, -0.0940,
         -0.0150,  0.0009,  0.1949],
        [ 0.2027,  0.0857,  0.0941, -0.0693, -0.3242, -0.1245,  0.1153,  0.0473,
         -0.0010, -0.3164, -0.0733,  0.1544, -0.0319,  0.0842,  0.0415,  0.2588,
          0.0507, -0.2100, -0.0416, -0.0168,  0.1277,  0.1366, -0.2911, -0.0395,
          0.0083,  0.0684,  0.1101]], grad_fn=<AddmmBackward0>)

### Training

In [9]:
import torch.optim as optim

# Seed torch's global RNG so weight initialization and the per-epoch
# shuffles are repeatable on a fresh kernel.
torch.manual_seed(123)

model = MLPLM(len(token_to_index), context_size, 64, 64)
print(f"Model parameters: {sum(p.numel() for p in model.parameters())}")

# Hyperparameters
learning_rate = 0.001
num_epochs = 10
batch_size = 32

# Loss function and optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Training loop
num_examples = len(X_train)
for epoch in range(num_epochs):
    # Draw a fresh visiting order each epoch. The order is applied through
    # indexing rather than by rebinding X_train / Y_train, so the training
    # tensors keep their original order and this cell can be re-run safely.
    perm = torch.randperm(num_examples)

    model.train()
    total_loss = 0.0
    for i in range(0, num_examples, batch_size):
        batch_idx = perm[i:i+batch_size]
        X_batch = X_train[batch_idx]
        Y_batch = Y_train[batch_idx]

        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, Y_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so
        # the final (possibly smaller) batch is counted correctly.
        total_loss += loss.item() * X_batch.size(0)

    # Per-example average over the epoch. Dividing by
    # len(X_train) // batch_size would undercount batches whenever the last
    # batch is partial, and would be zero for fewer than batch_size examples.
    avg_loss = total_loss / max(num_examples, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

Model parameters: 24027
Epoch [1/10], Loss: 2.2235
Epoch [2/10], Loss: 2.1249
Epoch [3/10], Loss: 2.0985
Epoch [4/10], Loss: 2.0841
Epoch [5/10], Loss: 2.0747
Epoch [6/10], Loss: 2.0674
Epoch [7/10], Loss: 2.0616
Epoch [8/10], Loss: 2.0570
Epoch [9/10], Loss: 2.0534
Epoch [10/10], Loss: 2.0509


### Generation

In [10]:
# Sample from the model
def sample(model, context, max_length=100):
    """Draw one sequence from the model, one token at a time.

    Switches `model` to eval mode, then repeatedly samples the next token
    from the softmax over the model's logits and slides the context window
    forward by one position.

    Args:
        model: callable mapping a (1, context_len) index tensor to logits.
        context: initial token indices (passed to `torch.tensor`); presumably
            `context_size` indices, as the model expects.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated tokens joined into a string. Generation stops right
        after '[S]' is sampled, and that marker is included in the result;
        otherwise it stops after `max_length` tokens.
    """
    model.eval()
    pieces = []
    with torch.no_grad():
        window = torch.tensor(context).unsqueeze(0)  # add a batch dimension
        for _ in range(max_length):
            next_probs = model(window).softmax(dim=-1)
            next_id = torch.multinomial(next_probs, num_samples=1)
            # Drop the oldest position and append the sampled index.
            window = torch.cat([window[:, 1:], next_id], dim=1)

            next_token = index_to_token[next_id.item()]
            pieces.append(next_token)
            if next_token == '[S]':
                break
    return ''.join(pieces)

In [11]:
# Unconditional generation: every sample starts from an all-'[S]' context.
start_context = [token_to_index['[S]']] * context_size
for i in range(10):
    generated = sample(model, start_context)
    print(generated)

eliphan[S]
ceevy[S]
buint[S]
nuhiba[S]
hilil[S]
nelay[S]
aadir[S]
braylan[S]
caarleya[S]
myla[S]


### Conditional generation

In [13]:
prompt = 's'

# Encode the prompt once. Keep only its last `context_size` tokens so that a
# prompt longer than the window still yields a context of the width the model
# expects, then left-pad with '[S]' up to that width.
prompt_ids = [token_to_index[c] for c in prompt][-context_size:]
prompt_context = [token_to_index['[S]']] * (context_size - len(prompt_ids)) + prompt_ids

for i in range(10):
    print("INPUT HERE...........",prompt_context)
    out = sample(model, prompt_context)
    print(prompt + out)

INPUT HERE........... [26, 26, 26, 26, 18]
sidden[S]
INPUT HERE........... [26, 26, 26, 26, 18]
shav[S]
INPUT HERE........... [26, 26, 26, 26, 18]
sandre[S]
INPUT HERE........... [26, 26, 26, 26, 18]
stontin[S]
INPUT HERE........... [26, 26, 26, 26, 18]
solerh[S]
INPUT HERE........... [26, 26, 26, 26, 18]
sybitolani[S]
INPUT HERE........... [26, 26, 26, 26, 18]
sjaicobe[S]
INPUT HERE........... [26, 26, 26, 26, 18]
siger[S]
INPUT HERE........... [26, 26, 26, 26, 18]
shayla[S]
INPUT HERE........... [26, 26, 26, 26, 18]
syren[S]
