## A2 Language Model

In [3]:
#import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext, datasets, math
from tqdm import tqdm
from sklearn.model_selection import train_test_split


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Dataset Acquisition

For the dataset, I used data beased on popular novels, "Sherlock Holmes". Sherlock Holmes is a detective novels written by Sir Arthur Conan Doyle. Everyone can access the novels for free on public domain https://www.gutenberg.org
<br>For the Sherlock Holmes dataset, it consist of story of 5 novels in Sherlock Holmes series as follows;</br>
<br>1. A Study in Scarlet</br>
<br>2. The Adventures of Sherlock Holmes</br>
<br>3. The Hound of the Baskervilles</br>
<br>4. The Sign of the Four</br>
<br>5. The Memoirs of Sherlock Holmes</br>

In [5]:
#I used this code to download novels from projectgutenberg into .txt file
import requests

def download_text_file(url, local_filename):
    response = requests.get(url)
    
    if response.status_code == 200:
        
        with open(local_filename, 'w', encoding='utf-8') as file:
            file.write(response.text)
        print(f"File downloaded successfully as {local_filename}")
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

url = 'https://www.gutenberg.org/cache/epub/834/pg834.txt'
local_filename = 'sherlockholmes.txt'

# download_text_file(url, local_filename)

In [6]:
#read sherlock_data
with open('sherlockholmes.txt', 'r', encoding='utf-8') as file:
    sherlock_data = file.read()
    # print(sherlock_data)

## Preprocessing

In [7]:
#split all passage into sentence
import nltk
# nltk.download('punkt')

from nltk.tokenize import sent_tokenize

sherlock_corpus = sent_tokenize(sherlock_data)
# sherlock_corpus


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
from sklearn.model_selection import train_test_split

# Split into training and temporary set 
temp_set, test_set = train_test_split(sherlock_corpus, test_size=0.2, random_state=43)

# Split temp_set into training and validation sets
train_set, val_set = train_test_split(temp_set, test_size=0.25, random_state=43)

# Print the sizes of the sets
print(f"Number of sentences in training set: {len(train_set)}")
print(f"Number of sentences in validation set: {len(val_set)}")
print(f"Number of sentences in test set: {len(test_set)}")


Number of sentences in training set: 10108
Number of sentences in validation set: 3370
Number of sentences in test set: 3370


### Tokenizing

Simply tokenize the sherlock_data to tokens.

In [9]:
from nltk.tokenize import word_tokenize

#split sentence in train set into word token
train_set_token = [word_tokenize(sentence) for sentence in train_set]

In [10]:
# train_set_token

In [11]:
#split sentences of validation set and test set into word tokens
val_set_token = [word_tokenize(sentence) for sentence in val_set]
test_set_token = [word_tokenize(sentence) for sentence in test_set]

### Numericalizing

We will tell torchtext to add any word that has occurred at least three times in the dataset to the vocabulary because otherwise it would be too big.  Also we shall make sure to add `unk` and `eos`.

In [12]:
#create vocab and add <unk> and <eos> into vocab
vocab = torchtext.vocab.build_vocab_from_iterator(train_set_token, min_freq=3)
vocab.insert_token('<unk>', 0)
vocab.insert_token('<eos>', 1)
vocab.set_default_index(vocab['<unk>'])

In [13]:
print(len(vocab)) #len vocab = 5614

5614


In [14]:
# print(vocab.get_itos()[:10])

## Prepare the batch loader

### Prepare data


In [15]:
def get_data(dataset, vocab, batch_size):
    data = []
    for i in range (len(train_set_token)):
        if train_set_token[i]:
            tokens = train_set_token[i].append('<eos>')
            tokens = [vocab[token] for token in train_set_token[i]]
            data.extend(tokens)
    data = torch.LongTensor(data)
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]
    data = data.view(batch_size, num_batches) #view vs. reshape (whether data is contiguous)
    return data #[batch size, seq len]

In [16]:
#set batch size =128
batch_size = 128
train_data = get_data(train_set_token, vocab, batch_size)
valid_data = get_data(val_set_token, vocab, batch_size)
test_data  = get_data(test_set_token,  vocab, batch_size)

In [17]:
# train_data.shape

## Modeling 
I used LSTM model for the assignment. The LSTM model is a type of recurrent neural network. It employs a unique architecture with cell and hidden states, along with forget, input, and output gates. The cell state preserves long-term memory, while the hidden state retains short-term memory. During training, the model undergoes forward and backward passes, adjusting weights through backpropagation to minimize the loss. LSTM mitigates vanishing gradient issues by using gates and addresses exploding gradients through gradient clipping. Hyperparameters are tuned, and the model is validated on a separate set before being tested for generalization on unseen data. LSTM's proficiency in capturing long-term dependencies makes it well-suited for sequential data tasks.

In [18]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim    = hid_dim
        self.emb_dim    = emb_dim
        
        self.embedding  = nn.Embedding(vocab_size, emb_dim)
        self.lstm       = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, dropout=dropout_rate, batch_first=True)
        self.dropout    = nn.Dropout(dropout_rate)
        self.fc         = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
    
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_other)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                self.hid_dim).uniform_(-init_range_other, init_range_other) #We
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim,   
                self.hid_dim).uniform_(-init_range_other, init_range_other) #Wh
    
    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
        
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #not to be used for gradient computation
        cell   = cell.detach()
        return hidden, cell
        
    def forward(self, src, hidden):
        #src: [batch_size, seq len]
        embedding = self.dropout(self.embedding(src)) #harry potter is
        #embedding: [batch-size, seq len, emb dim]
        output, hidden = self.lstm(embedding, hidden)
        #ouput: [batch size, seq len, hid dim]
        #hidden: [num_layers * direction, seq len, hid_dim]
        output = self.dropout(output)
        prediction =self.fc(output)
        #prediction: [batch_size, seq_len, vocab_size]
        return prediction, hidden

## Training 

In [19]:
vocab_size = len(vocab)
emb_dim = 1024                
hid_dim = 1024                
num_layers = 2                
dropout_rate = 0.65              
lr = 1e-3                     

In [20]:
model      = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer  = optim.Adam(model.parameters(), lr=lr)
criterion  = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 28,296,686 trainable parameters


In [21]:
def get_batch(data, seq_len, idx):
    #data #[batch size, bunch of tokens]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [22]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    # data #[batch size, seq len]
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]  #we need to -1 because we start at 0
    num_batches = data.shape[-1]
    
    #reset the hidden every epoch
    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        
        #hidden does not need to be in the computational graph for efficiency
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        #need to reshape because criterion expects pred to be 2d and target to be 1d
        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [23]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [25]:
n_epochs = 50
seq_len  = 50 
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best-val-lstm_lm.pt')

    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                         

	Train Perplexity: 538.508
	Valid Perplexity: 336.945


                                                         

	Train Perplexity: 387.955
	Valid Perplexity: 290.508


                                                         

	Train Perplexity: 249.127
	Valid Perplexity: 193.834


                                                         

	Train Perplexity: 168.781
	Valid Perplexity: 148.193


                                                         

	Train Perplexity: 133.051
	Valid Perplexity: 124.738


                                                         

	Train Perplexity: 112.318
	Valid Perplexity: 107.448


                                                         

	Train Perplexity: 98.127
	Valid Perplexity: 95.667


                                                         

	Train Perplexity: 88.561
	Valid Perplexity: 87.887


                                                         

	Train Perplexity: 81.711
	Valid Perplexity: 82.273


                                                         

	Train Perplexity: 76.152
	Valid Perplexity: 79.030


                                                         

	Train Perplexity: 71.282
	Valid Perplexity: 74.757


                                                         

	Train Perplexity: 67.162
	Valid Perplexity: 69.783


                                                         

	Train Perplexity: 63.718
	Valid Perplexity: 65.871


                                                         

	Train Perplexity: 60.318
	Valid Perplexity: 63.450


                                                         

	Train Perplexity: 57.628
	Valid Perplexity: 60.058


                                                         

	Train Perplexity: 55.026
	Valid Perplexity: 57.335


                                                         

	Train Perplexity: 52.619
	Valid Perplexity: 53.787


                                                         

	Train Perplexity: 50.684
	Valid Perplexity: 51.775


                                                         

	Train Perplexity: 48.693
	Valid Perplexity: 49.887


                                                         

	Train Perplexity: 46.771
	Valid Perplexity: 47.728


                                                         

	Train Perplexity: 45.217
	Valid Perplexity: 46.193


                                                         

	Train Perplexity: 43.976
	Valid Perplexity: 44.625


                                                         

	Train Perplexity: 42.544
	Valid Perplexity: 42.962


                                                         

	Train Perplexity: 41.175
	Valid Perplexity: 41.343


                                                         

	Train Perplexity: 40.051
	Valid Perplexity: 39.938


                                                         

	Train Perplexity: 38.862
	Valid Perplexity: 38.775


                                                         

	Train Perplexity: 37.672
	Valid Perplexity: 37.484


                                                         

	Train Perplexity: 36.689
	Valid Perplexity: 36.276


                                                         

	Train Perplexity: 35.973
	Valid Perplexity: 35.661


                                                         

	Train Perplexity: 34.861
	Valid Perplexity: 35.058


                                                         

	Train Perplexity: 33.993
	Valid Perplexity: 33.607


                                                         

	Train Perplexity: 33.133
	Valid Perplexity: 32.409


                                                         

	Train Perplexity: 32.440
	Valid Perplexity: 31.713


                                                         

	Train Perplexity: 31.511
	Valid Perplexity: 31.225


                                                         

	Train Perplexity: 30.704
	Valid Perplexity: 30.061


                                                         

	Train Perplexity: 29.992
	Valid Perplexity: 29.730


                                                         

	Train Perplexity: 29.437
	Valid Perplexity: 28.940


                                                         

	Train Perplexity: 28.701
	Valid Perplexity: 28.224


                                                         

	Train Perplexity: 28.111
	Valid Perplexity: 27.437


                                                         

	Train Perplexity: 27.602
	Valid Perplexity: 26.392


                                                         

	Train Perplexity: 26.927
	Valid Perplexity: 25.284


                                                         

	Train Perplexity: 26.479
	Valid Perplexity: 25.117


                                                         

	Train Perplexity: 25.838
	Valid Perplexity: 24.940


                                                         

	Train Perplexity: 25.306
	Valid Perplexity: 24.529


                                                         

	Train Perplexity: 24.763
	Valid Perplexity: 24.095


                                                         

	Train Perplexity: 24.328
	Valid Perplexity: 23.879


                                                         

	Train Perplexity: 23.797
	Valid Perplexity: 23.298


                                                         

	Train Perplexity: 23.312
	Valid Perplexity: 22.880


                                                         

	Train Perplexity: 22.797
	Valid Perplexity: 22.371


                                                         

	Train Perplexity: 22.460
	Valid Perplexity: 22.016


## Testing

In [26]:
#save the model
model.load_state_dict(torch.load('best-val-lstm_lm.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 32.477


## Real-world inference


In [27]:
def generate(prompt, max_seq_len, temperature, model, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = word_tokenize(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [29]:
prompt = 'Sherlock Holmes is '
max_seq_len = 30
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
Sherlock Holmes is walking to his wife , and the Colonel had gone down to his own hands .

0.7
Sherlock Holmes is walking to his son ’ s end .

0.75
Sherlock Holmes is walking to his son ’ s end .

0.8
Sherlock Holmes is walking to his son ’ s end .

1.0
Sherlock Holmes is walking to his bedroom .

