# Create Language Model from Scratch

## Step 1 - Preprocess
Preprocess the text by converting it to lowercase, removing punctuation, and filtering out rare or stop words.

In [42]:
import numpy as np
from collections import Counter

# Replace with your dataset path
#text = open("Henry_IV_Part_1_Yale.txt").read().lower()
text = open("quick_brown_fox.txt").read().lower()
print('Total characters: ', len(text))

def preprocess(text):
    t = text.replace("\n", " ")  # Remove newlines
    t = text.replace(",", "")  # Remove punctuation (customizable)
    return text.split()


words = preprocess(text)
print('Total words:', len(words))
print('Words after preprocessing:', words)

Total characters:  44
Total words: 9
Words after preprocessing: ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.']


## Step 2 - Build a Vocabulary

Create a vocabulary of unique words in your dataset. Assign a unique integer ID to each word.

In [36]:
# Counter is a MultiSet data structure
vocab = Counter(words)
vocab_size = len(vocab)
print('Vocab size:', vocab_size)
print('Vocab: ', vocab)

# Create a dictionary mapping words to unique IDs. In NLP tasks where you need to convert words to indices for vector representations or embeddings, using the positions of words in a vocabulary dictionary as a simple form of encoding.
word2idx = {w: i for i, (w, _) in enumerate(vocab.items())}
print('word2idx size:', len(word2idx))
print('word2idx: ', word2idx)

idx2word = {i: w for w, i in word2idx.items()}
print('idx2word size:', len(idx2word))
print('idx2word: ', idx2word)

Vocab size: 8
Vocab:  Counter({'the': 2, 'quick': 1, 'brown': 1, 'fox': 1, 'jumps': 1, 'over': 1, 'lazy': 1, 'dog.': 1})
word2idx size: 8
word2idx:  {'the': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumps': 4, 'over': 5, 'lazy': 6, 'dog.': 7}
idx2word size: 8
idx2word:  {0: 'the', 1: 'quick', 2: 'brown', 3: 'fox', 4: 'jumps', 5: 'over', 6: 'lazy', 7: 'dog.'}


## Step 3 - Implement a Tokenizer

Write a function to convert text sequences into numerical sequences based on the vocabulary.

In [37]:
def tokenize(text, word2idx):
  tokens = [word2idx[word] for word in text.split() if word in word2idx]
  return tokens

print(tokenize(text, word2idx))

[0, 1, 2, 3, 4, 5, 0, 6, 7]


## Step 4 - Train/Build a Bigram Model

A **Bigram** model predicts the next word based on the previous word. Train the model on your tokenized dataset. During training, the model adjusts its internal parameters to minimize the prediction error.

If the input is **wireless speakers for tv**, the output will be the following:

1. N=1 Unigram- Output- “wireless” , “speakers”, “for” , “tv”
2. N=2 Bigram- Output- “wireless speakers”, “speakers for”, “for tv”
3. N=3 Trigram – Output- “wireless speakers for”, “speakers for tv”

In [47]:
def build_bigram_model(sequences):
  print('Sequences; ', sequences)
  # Initialize transition matrix with zeros
  transition_matrix = np.zeros((vocab_size, vocab_size))
  print('Transition matrix size:', transition_matrix.shape)
  
  for sequence in sequences:
    for i in range(len(sequence) - 1):
      transition_matrix[sequence[i], sequence[i + 1]] += 1

  # Normalize each row to get probabilities
  transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)
  return transition_matrix

# Train the model on tokenized sequences
# sequences = [tokenize(seq, word2idx) for seq in words]
sequences = []
for seq in words:
    tokenized_seq = tokenize(seq, word2idx)
    print(tokenized_seq)
    sequences.append(tokenized_seq)  
    
model = build_bigram_model(sequences)

print('Model', model)

[0]
[1]
[2]
[3]
[4]
[5]
[0]
[6]
[7]
Sequences;  [[0], [1], [2], [3], [4], [5], [0], [6], [7]]
Transition matrix size: (8, 8)
Model [[nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan]
 [nan nan nan nan nan nan nan nan]]


  transition_matrix = transition_matrix / transition_matrix.sum(axis=1, keepdims=True)


## Step 5 - Generate Text

Once trained, use the model to generate new text by providing it with a seed sequence.

In [39]:
def generate_text(model, start_word, max_length=100):
  # Get start word index
  start_idx = word2idx.get(start_word, None)
  if start_idx is None:
    print("Start word not in vocabulary")
    return

  sequence = [start_idx]
  for _ in range(max_length):
    # Predict next word probability distribution
    probs = model[sequence[-1]]

    # Sample next word based on probabilities
    next_idx = np.random.choice(range(vocab_size), p=probs)
    sequence.append(next_idx)

  # Convert numerical sequence back to text
  text = " ".join([idx2word[i] for i in sequence])
  return text



# Example usage
start_text = "the quick"
generated_text = generate_text(model, start_text)
print(f"Generated text: {generated_text}")


Start word not in vocabulary
Generated text: None
