<a href="https://colab.research.google.com/github/sunmyeonglee/2025-1-NLP/blob/main/3_language_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm.auto import tqdm


# Language modeling

In [2]:
!wget "https://raw.githubusercontent.com/karpathy/makemore/master/names.txt"

--2025-05-13 04:46:46--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2025-05-13 04:46:46 (44.6 MB/s) - ‘names.txt’ saved [228145/228145]



In [3]:
def read_txt(txt_path, encoding='utf-8'):
  """Read a text file and return its lines.

  Args:
    txt_path: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    A list with one string per line. Each line keeps its trailing newline
    character if it has one.
  """
  with open(txt_path, 'r', encoding=encoding) as f:
    return f.readlines()

# Despite the name, this is a list of lines (as returned by readlines()),
# not a single string.
txt_string = read_txt('names.txt')

In [4]:
# Remove the newline characters so every entry is a bare name.
names_list = [line.replace('\n', '') for line in txt_string]

# Number of names in the file.
len(names_list)

32033

# N-Gram
- Start with bi-gram (2-gram)

In [5]:
from collections import defaultdict

# defaultdict(int) creates a missing key on first access with the value
# int() == 0, so counts can be incremented without an existence check.
bigram_dict, unigram_dict = defaultdict(int), defaultdict(int)

# RNN
- $h_t = \tanh(\mathbf{W}_{hh}h_{t-1} + \mathbf{W}_{xh}x_t + b)$
  - $\mathbf{W}_{hh}$, $\mathbf{W}_{xh}$: weight matrices (hidden-to-hidden and input-to-hidden)
  - $b$: bias
  - $x_t$: input vector at time step $t$
  - $h_t$: hidden state (and also the output) at time step $t$


In [6]:
# Fix the seed so the randomly initialised layers and inputs are the same on
# every run.
torch.manual_seed(0)

sequence_length = 7
input_dim = 3
hidden_dim = 5

# The two layers and the input all draw from the seeded generator, so they are
# created in this exact order to keep the values reproducible.
weight_hh = nn.Linear(hidden_dim, hidden_dim)  # hidden -> hidden
weight_xh = nn.Linear(input_dim, hidden_dim)   # input -> hidden
x = torch.randn(sequence_length, input_dim)    # one input vector per time step

h0 = torch.zeros(hidden_dim)  # initial hidden state

# Use the first time step as the current input and display it.
t = 0
x_t = x[t]
x_t

tensor([ 1.0554,  0.1778, -0.2303])

In [7]:
# One RNN step by hand: add the two linear maps, then squash with tanh.
h_t = (weight_hh(h0) + weight_xh(x_t)).tanh()
h_t

tensor([-0.3031,  0.4942, -0.3826, -0.1671, -0.0307], grad_fn=<TanhBackward0>)

In [8]:
def run_rnn_cell(weight_hh, weight_xh, prev_h, x_t):
  """Compute one RNN step: tanh(weight_hh(prev_h) + weight_xh(x_t)).

  Args:
    weight_hh: Callable applied to the previous hidden state.
    weight_xh: Callable applied to the current input.
    prev_h: Hidden state from the previous time step.
    x_t: Input for the current time step.

  Returns:
    The new hidden state.
  """
  recurrent_term = weight_hh(prev_h)
  input_term = weight_xh(x_t)
  return torch.tanh(recurrent_term + input_term)

# Unroll the cell over the whole sequence, feeding each hidden state back in
# as the previous state of the next step.
hidden_states = []
prev_h = h0
for x_step in x:
  print(f'x: {x_step}')
  h = prev_h = run_rnn_cell(weight_hh, weight_xh, prev_h, x_step)
  print(f'h: {h}')
  hidden_states.append(h)

# Stack the per-step hidden states into one (time step, hidden) tensor.
output = torch.stack(hidden_states)
output

x: tensor([ 1.0554,  0.1778, -0.2303])
h: tensor([-0.3031,  0.4942, -0.3826, -0.1671, -0.0307], grad_fn=<TanhBackward0>)
x: tensor([-0.3918,  0.5433,  0.3356])
h: tensor([ 0.2949,  0.2907,  0.5566, -0.6004, -0.4537], grad_fn=<TanhBackward0>)
x: tensor([1.5091, 2.0820, 1.7067])
h: tensor([-0.0504, -0.8319,  0.6891, -0.0811, -0.9549], grad_fn=<TanhBackward0>)
x: tensor([ 2.3804, -1.1256, -0.3170])
h: tensor([-0.9035,  0.7153, -0.9110,  0.4101,  0.4610], grad_fn=<TanhBackward0>)
x: tensor([-1.0925,  0.8058,  0.3276])
h: tensor([ 0.5157,  0.1567,  0.7691, -0.8519, -0.4661], grad_fn=<TanhBackward0>)
x: tensor([-0.7607, -1.5991,  0.0185])
h: tensor([-0.6471,  0.9578, -0.5932, -0.2097,  0.4347], grad_fn=<TanhBackward0>)
x: tensor([-0.7504,  0.1854,  0.6211])
h: tensor([ 0.2194,  0.3107,  0.5832, -0.7386, -0.3476], grad_fn=<TanhBackward0>)


tensor([[-0.3031,  0.4942, -0.3826, -0.1671, -0.0307],
        [ 0.2949,  0.2907,  0.5566, -0.6004, -0.4537],
        [-0.0504, -0.8319,  0.6891, -0.0811, -0.9549],
        [-0.9035,  0.7153, -0.9110,  0.4101,  0.4610],
        [ 0.5157,  0.1567,  0.7691, -0.8519, -0.4661],
        [-0.6471,  0.9578, -0.5932, -0.2097,  0.4347],
        [ 0.2194,  0.3107,  0.5832, -0.7386, -0.3476]],
       grad_fn=<StackBackward0>)

In [9]:
# Peek at the first ten names.
names_list[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [10]:
# Flatten every name into one long list of characters (duplicates included).
entire_chars = [char for name in names_list for char in name]

len(entire_chars)

196113

In [11]:
# Vocabulary: the distinct characters, in sorted order.
vocab = sorted(set(entire_chars))

# Map each character to its position in the vocabulary.
char2idx = {symbol: position for position, symbol in enumerate(vocab)}
char2idx

{'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25}

# Define Dataset Class

In [43]:
class NameSet:
  """Character-level dataset of names for next-character prediction.

  Attributes:
    data: List of names, one per line of the source file.
    vocab: The special tokens followed by the sorted distinct characters
      found in `data`.
    char2idx: Mapping from each vocabulary entry to its integer index.
  """

  def __init__(self, txt_fn):
    """Load the names from `txt_fn` and build the vocabulary.

    Args:
      txt_fn: Path of a text file with one name per line.
    """
    txt_string = read_txt(txt_fn)
    self.data = [x.replace('\n', '') for x in txt_string]

    # Collect the characters in a local set so the vocabulary depends only on
    # this file. Relying on a notebook-level list here would fail on a fresh
    # kernel and would be mutated again on every instantiation.
    chars = {char for name in self.data for char in name}

    # Special tokens come first so that '<pad>' is index 0.
    special_tokens = ['<pad>', '<start>', '<end>']
    self.vocab = special_tokens + sorted(chars)

    self.char2idx = {char: i for i, char in enumerate(self.vocab)}

  def __len__(self):
    """Return the number of names in the dataset."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the (model_input, target_output) index lists for one name.

    The name is wrapped as '<start>' + characters + '<end>'. The input is that
    sequence without its last element and the target is the same sequence
    without its first element, so target[i] is the token that follows input[i].

    Args:
      idx: Position of the name in `data`.

    Returns:
      A tuple of two equally long lists of vocabulary indices.
    """
    name_string = self.data[idx]
    name_in_idx = [self.char2idx[char] for char in name_string]
    name_in_idx = [self.char2idx['<start>']] + name_in_idx + [self.char2idx['<end>']]

    model_input = name_in_idx[:-1]
    target_output = name_in_idx[1:]
    return model_input, target_output

dataset = NameSet('names.txt')
# Only the last expression of a cell is rendered; the next three lines are
# evaluated but produce no visible output.
dataset.data[0]
len(dataset)
dataset.vocab
# First example as (model_input, target_output) lists of vocabulary indices.
dataset[0]

([1, 7, 15, 15, 3], [7, 15, 15, 3, 2])

In [28]:
# Encode a single name with the notebook-level char2idx mapping.
name = 'emma'

new = [char2idx[letter] for letter in name]
new

[4, 12, 12, 0]

# Define the model

In [45]:
import torch.nn as nn
class LanguageModel(nn.Module):
  """Skeleton language model: registers no layers and defines no forward pass yet."""

  def __init__(self):
    super(LanguageModel, self).__init__()

model = LanguageModel()

# Define Training Loop