# Build LLM from scratch

When building a Large Language Model (LLM), the overall process can be divided into four key stages:

1. `pretraining` - Develop general language understanding by training on large-scale, diverse text corpora.
2. `fine-tuning` - Adapt the pretrained model for specific domains or downstream tasks through supervised or instruction-based tuning.
3. `retrieval integration` - Incorporate external knowledge sources such as databases or the internet to improve the model's factual accuracy and broaden its scope, similar to how people consult references when reasoning about something in depth.
4. `internal reflection (self Q&A)` - Enable internal question-answer loops or self-reflection mechanisms to iteratively refine reasoning and produce more coherent, accurate conclusions.

## 1. Get Training Data

e.g. alice.txt -> cleaned_alice.txt

- kaggle Harry Potter
- kaggle Alice

### 1.1 Download Harry Potter and Alice from Kaggle

In [None]:
import kagglehub

# Download the latest version of the Harry Potter books dataset from Kaggle.
# The returned value is the local path of the downloaded files; it is kept in
# `path_harry_potter` so the copy step further down can read from it.
path_harry_potter = kagglehub.dataset_download("shubhammaindola/harry-potter-books")
print("Path to dataset files:", path_harry_potter)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shubhammaindola/harry-potter-books?dataset_version_number=1...


100%|██████████| 2.28M/2.28M [00:00<00:00, 51.7MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/shubhammaindola/harry-potter-books/versions/1





In [None]:
import kagglehub

# Download the latest version of the Alice in Wonderland dataset from Kaggle.
# The returned value is the local path of the downloaded files; it is kept in
# `path_alice` so the copy step further down can read from it.
path_alice = kagglehub.dataset_download("roblexnana/alice-wonderland-dataset")
print("Path to dataset files:", path_alice)

Downloading from https://www.kaggle.com/api/v1/datasets/download/roblexnana/alice-wonderland-dataset?dataset_version_number=1...


100%|██████████| 53.7k/53.7k [00:00<00:00, 46.0MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/roblexnana/alice-wonderland-dataset/versions/1





In [None]:
from pathlib import Path

# Local working directory that the downloaded book files are copied into.
data_dir = Path("data")
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
data_dir.mkdir(exist_ok=True, parents=True)

In [None]:
import shutil

# Copy the *contents* of each downloaded dataset directory straight into
# data_dir. The previous shell version copied the version directory itself
# (named "1") and then moved its files up one level, which only worked while
# both datasets were at version 1 and the paths contained no spaces.
# dirs_exist_ok=True (Python 3.8+) lets both datasets merge into the same,
# already existing directory.
for dataset_dir in (path_harry_potter, path_alice):
  shutil.copytree(dataset_dir, data_dir, dirs_exist_ok=True)

In [None]:
import os
import re

In [None]:
# Collect only the raw book files. clean_text() writes its output into the same
# directory as "cleaned_<name>", so without this filter a re-run of the
# notebook would pick those files up as input and produce "cleaned_cleaned_..."
# copies.
file_list = [p for p in data_dir.glob("*.txt") if not p.name.startswith("cleaned_")]
# Notebook display: the list, plus parent / name / stem of the first entry.
file_list, file_list[0].parent, file_list[0].name, file_list[0].stem

([PosixPath('data/07 Harry Potter and the Deathly Hallows.txt'),
  PosixPath('data/06 Harry Potter and the Half-Blood Prince.txt'),
  PosixPath('data/04 Harry Potter and the Goblet of Fire.txt'),
  PosixPath('data/03 Harry Potter and the Prisoner of Azkaban.txt'),
  PosixPath('data/01 Harry Potter and the Sorcerers Stone.txt'),
  PosixPath('data/05 Harry Potter and the Order of the Phoenix.txt'),
  PosixPath('data/alice_in_wonderland.txt'),
  PosixPath('data/02 Harry Potter and the Chamber of Secrets.txt')],
 PosixPath('data'),
 '07 Harry Potter and the Deathly Hallows.txt',
 '07 Harry Potter and the Deathly Hallows')

### 1.2 Clean up books

Replace newlines (`\n`) with spaces and collapse runs of whitespace into a single space, so that each book becomes one long line.

In [None]:
# remove "\n" -> make entire book in a single line
def clean_text(filename) -> str:
  """Flatten a book into a single line and save it as ``cleaned_<name>``.

  Every run of whitespace (newlines, tabs, repeated spaces) is replaced by one
  space. The result is written next to the source file, and a one-line summary
  with the resulting character count is printed.

  Args:
    filename: Path of the raw UTF-8 text file, given either as a
      ``pathlib.Path`` or as a plain string.

  Returns:
    The path of the cleaned file as a string, ``<parent>/cleaned_<name>``.
  """
  # Accept plain strings too; .parent and .name are needed below.
  filename = Path(filename)

  with open(filename, 'r', encoding='utf-8') as file:
    book_text = file.read()

  # "\n" is itself matched by \s, so a single pass handles both newlines and
  # repeated spaces; a separate r'\n+' substitution beforehand is redundant.
  cleaned_text = re.sub(r'\s+', ' ', book_text)

  print(f"cleaned_{filename.name} {len(cleaned_text)} characters")

  cleaned_filename = f"{filename.parent}/cleaned_{filename.name}"
  with open(cleaned_filename, 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

  return cleaned_filename

In [None]:
# Clean every raw book and remember where each cleaned copy was written.
cleaned_filelist = [clean_text(raw_book) for raw_book in file_list]

cleaned_07 Harry Potter and the Deathly Hallows.txt 1133063 characters
cleaned_06 Harry Potter and the Half-Blood Prince.txt 982041 characters
cleaned_04 Harry Potter and the Goblet of Fire.txt 1093670 characters
cleaned_03 Harry Potter and the Prisoner of Azkaban.txt 621137 characters
cleaned_01 Harry Potter and the Sorcerers Stone.txt 436000 characters
cleaned_05 Harry Potter and the Order of the Phoenix.txt 1489734 characters
cleaned_alice_in_wonderland.txt 142524 characters
cleaned_02 Harry Potter and the Chamber of Secrets.txt 488771 characters


### 1.3 Tokenize

In [None]:
# !pip install tiktoken

In [None]:
import tiktoken

# Load the same encoding that GPT-2 uses.
tokenizer = tiktoken.get_encoding("gpt2")
text = "Harry Potter was a wizard."
tokens = tokenizer.encode(text)

# Compare the character count of the sentence with its token count.
print(f"number of characters: {len(text)}, number of tokens: {len(tokens)}")

# Decode every id on its own to show which text fragment it stands for.
for token_id in tokens:
  print(f"{token_id}\t -> {tokenizer.decode([token_id])}")

number of characters: 26, number of tokens: 6
18308	 -> Harry
14179	 ->  Potter
373	 ->  was
257	 ->  a
18731	 ->  wizard
13	 -> .


In [None]:
# ! pip install transformers

In [None]:
# # Tokenize Korean
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct")
# # tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2") # KoGPT2

# print(f"Vocab size: {len(tokenizer)}")

# text = "대사께서는 도(道)를 얻은 모양이구려."

# tokens = tokenizer.encode(text)
# print(f"number of characters: {len(text)}, number of tokens: {len(tokens)}")
# for t in tokens:
#   print(f"{t}\t -> {tokenizer.decode([t])}")

In [None]:
# Encode the sentence one character at a time and decode each result again,
# printing the character, its token ids and the decoded text side by side.
for ch in text:
  ids = tokenizer.encode(ch)
  roundtrip = tokenizer.decode(ids)
  print(f"{ch} -> {ids} -> {roundtrip}")

H -> [39] -> H
a -> [64] -> a
r -> [81] -> r
r -> [81] -> r
y -> [88] -> y
  -> [220] ->  
P -> [47] -> P
o -> [78] -> o
t -> [83] -> t
t -> [83] -> t
e -> [68] -> e
r -> [81] -> r
  -> [220] ->  
w -> [86] -> w
a -> [64] -> a
s -> [82] -> s
  -> [220] ->  
a -> [64] -> a
  -> [220] ->  
w -> [86] -> w
i -> [72] -> i
z -> [89] -> z
a -> [64] -> a
r -> [81] -> r
d -> [67] -> d
. -> [13] -> .


## 2. Create Dataloaders

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
class MyDataset(Dataset):
  """Sliding-window next-token-prediction dataset built from a single text.

  The text is tokenized once and then cut into windows of ``max_length`` token
  ids. Each sample is an ``(input, target)`` pair of ``torch.long`` tensors in
  which the target is the input window shifted one token to the right.

  Args:
    txt: Raw text to tokenize.
    max_length: Number of token ids in every input/target window.
    stride: Offset, in tokens, between the starts of consecutive windows.
    encode: Optional callable mapping a string to a list of token ids. When
      omitted, the ``encode`` method of the module-level ``tokenizer`` is used.

  Raises:
    ValueError: If ``max_length`` or ``stride`` is not positive.
  """

  def __init__(self,
               txt: str,
               max_length: int,
               stride: int,
               encode=None):
    if max_length <= 0 or stride <= 0:
      raise ValueError(
          "max_length and stride must be positive, got "
          f"max_length={max_length}, stride={stride}")

    # Fall back to the notebook-wide tokenizer so existing calls keep working.
    if encode is None:
      encode = tokenizer.encode

    self.input_ids = []
    self.target_ids = []

    token_ids = encode(txt)
    # Only the count is reported; printing the ids themselves would dump the
    # whole tokenized text to the output.
    print(f"number of tokens in txt: {len(token_ids)}")

    # Stop max_length tokens before the end so that the target window, which
    # reaches one token further than the input window, stays inside the text.
    for i in range(0, len(token_ids) - max_length, stride):
      # If the input is "Harry" the target is " Potter": the target window is
      # the input window moved forward by exactly one index.
      input_chunk = token_ids[i:i + max_length]
      target_chunk = token_ids[i + 1:i + max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
      self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]

In [None]:
# Use the first cleaned book as the example corpus.
example_file = cleaned_filelist[0]
with open(example_file, encoding='utf-8') as book:
  example_txt = book.read()

# Windows of 32 tokens, with a new window starting every 4 tokens.
dataset = MyDataset(example_txt, max_length=32, stride=4)

# Shuffled batches of 128 samples; drop_last discards a final partial batch.
train_loader = DataLoader(
    dataset,
    batch_size=128,
    shuffle=True,
    drop_last=True,
)

[464, 734, 1450, 4120, 503, 286, 12062, 11, 257, 1178, 5695, 5475, 287, 262, 7135, 11, 8824, 18250, 11193, 13, 1114, 257, 1218, 484, 6204, 2407, 991, 11, 266, 1746, 7924, 379, 1123, 584, 338, 34572, 26, 788, 11, 22650, 1123, 584, 11, 484, 336, 6972, 511, 266, 1746, 11061, 511, 28050, 4730, 290, 2067, 6155, 35984, 306, 287, 262, 976, 4571, 13, 564, 250, 9980, 30, 447, 251, 1965, 262, 25242, 286, 262, 734, 13, 564, 250, 464, 1266, 11, 447, 251, 8712, 41343, 31008, 13, 383, 11193, 373, 275, 24071, 319, 262, 1364, 416, 4295, 11, 1877, 12, 25167, 275, 859, 7689, 11, 319, 262, 826, 416, 257, 1029, 11, 29776, 38705, 1522, 19859, 13, 383, 1450, 338, 890, 28050, 4730, 781, 6320, 1088, 511, 42415, 355, 484, 23558, 13, 564, 250, 817, 2917, 314, 1244, 307, 2739, 11, 447, 251, 531, 575, 897, 1636, 11, 465, 19861, 3033, 22292, 287, 290, 503, 286, 6504, 355, 262, 13737, 286, 625, 71, 4924, 7150, 6265, 262, 8824, 2971, 13, 564, 250, 1026, 373, 257, 1310, 6908, 959, 621, 314, 2938, 13, 887, 314, 2911, 

In [None]:
# Pull a single batch from the loader to inspect it.
dataiter = iter(train_loader)

X, y = next(dataiter)
# Decode the first sample of the batch: the second printed line (target) should
# read as the first line (input) moved one token ahead.
print(tokenizer.decode(X[0].tolist()))
print(tokenizer.decode(y[0].tolist()))

 to eat, and that’s when the passage to Hog’s Head opened up. I went through it and met Aberforth. He’
 eat, and that’s when the passage to Hog’s Head opened up. I went through it and met Aberforth. He’s


## 3. Define Model

## 4. Training

## 5. Check
