In [1]:
!pip uninstall matplotlib

Found existing installation: matplotlib 3.6.2
Uninstalling matplotlib-3.6.2:
  Would remove:
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\matplotlib-3.6.2-py3.10-nspkg.pth
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\matplotlib-3.6.2.dist-info\*
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\matplotlib\*
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\mpl_toolkits\axes_grid1\*
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\mpl_toolkits\axisartist\*
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\mpl_toolkits\mplot3d\*
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\mpl_toolkits\tests\*
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\pylab.py
  Would not remove (might be manually added):
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\matplotlib\_contour.cp310-win_amd64.pyd
    c:\users\sumitp\anaconda3\envs\pytorch\lib\site-packages\matplotlib\cbook\deprecation.p



In [1]:
import warnings
from IPython.display import display

warnings.filterwarnings("ignore")

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import re

### Steps to build a simple Language Model
#### 1. Collect data: save a "*.txt" with some text in current folder
#### 2. Read and Format data: Get the text data in a variable and format as per requirement
#### 3. Convert data into tokens : Tokens can be created at character level or word level or sub-word level
#### 4. Get embeddings for tokens : Model does not understand tokens as it is hence need to convert them to some form of numbers/vectors
#### 5. Build a basic NN : Keep in mind vocab size and embedding size, when designing network input output size
#### 6. Write loop to train this model

In [20]:
# Data collection

with open("./data/wizard_of_oz.txt", 'r', encoding="utf-8-sig") as f:
    text = f.read()

print(f"Sample text from corpus:\n{text[:200]}")

Sample text from corpus:
The Project Gutenberg eBook of Dorothy and the Wizard in Oz
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restr


In [3]:
# Data collection

def format_names(input_file, output_file):
    formatted_names = []
    with open(input_file, 'r') as f:
        for line in f:
            parts = line.split(' - ')
            if len(parts) == 2:
                name = parts[0].strip().lower()
                name = re.sub(r'^[^a-zA-Z]*|[^a-zA-Z]*$', '', name)
                formatted_names.append(name)

    with open(output_file, 'w') as f:
        f.write('\n'.join(formatted_names))

# Example usage:
input_file = './data/marathi_boy_names.txt'
output_file = './data/marathi_names_output.txt'
format_names(input_file, output_file)


In [4]:
# Create a loop that reads all the names in "marathi_names_output.txt" line by line, 
# and adds start <s> and end <e> at the starting and ending of each of the name

with open("./data/marathi_names_output.txt", "r", encoding="utf-8") as f:
    text = f.readlines()
name_list = []
for name in text:
    name = name.strip()
    name_list.append(name)

print(name_list[:5])
    

['aarv', 'ansh', 'aalok', 'aapt', 'aabheer']


In [5]:
# Create mapping of chars and integers
sequence = []
chars = []
for name in name_list:
    sequence += ["<s>"] + list(name) + ["<e>"]
sequence.remove("/")
vocab = sorted(set(sequence))

char_to_idx = {char:idx for idx,char in enumerate(vocab)}
idx_to_char = {idx:char for idx,char in enumerate(vocab)}

In [6]:
# Now let us create the Bigrams out of list of names
bigram = []
for i in range(len(sequence)- 1):
    ch1 = sequence[i]
    ch2 = sequence[i+1]
    bigram.append(tuple([ch1,ch2]))
bigram = list(filter(lambda x: x != ('<e>', '<s>') , bigram))

In [7]:
print(len(bigram))
print(bigram[:10])

3534
[('<s>', 'a'), ('a', 'a'), ('a', 'r'), ('r', 'v'), ('v', '<e>'), ('<s>', 'a'), ('a', 'n'), ('n', 's'), ('s', 'h'), ('h', '<e>')]


In [8]:
bigram_count = {}

#Iterate over unique items in bigram list and count their occurance
for item in set(bigram):
    bigram_count[item] = bigram.count(item)
bigram_count = sorted(bigram_count.items(), key=lambda x: x[1], reverse= True)

print(f"Number of unique occurences of bigrams: {len(bigram_count)}")

Number of unique occurences of bigrams: 290


In [9]:
bigram_matrix = torch.zeros((len(vocab), len(vocab)), dtype = torch.int32)

for item in bigram_count:
    ch1 = item[0][0]
    ch1_idx = char_to_idx[ch1]
    ch2 = item[0][1]
    ch2_idx = char_to_idx[ch2]
    bigram_matrix[ch1_idx][ch2_idx] = item[1]

In [10]:
bigram_matrix.size()

torch.Size([25, 25])

In [11]:
import matplotlib.pyplot as plt
%matplotlib inline

numpy_array = bigram_matrix.to('cpu').numpy()

In [14]:
numpy_array.shape

(25, 25)

In [15]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
     ------------------------------------ 294.9/294.9 kB 109.7 kB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2




In [None]:
import seaborn as sns

# Plot using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(numpy_array, cmap='viridis')
plt.show()

##### In NLP tokens refers to the total number of "words" in your corpus. The vocab is the number of unique "words".
##### It should be the case that vocab <= tokens.

In [65]:
import re

def clean_words(word_list):
    cleaned_words = []
    for word in word_list:
        cleaned_word = re.sub(r'^[^a-zA-Z]*|[^a-zA-Z]*$', '', word)
        if cleaned_word:  # Check if the word is not empty after cleaning
            cleaned_words.append(cleaned_word)
    return cleaned_words

tokens = text.lower().strip().split()
tokens = clean_words(tokens)
print(tokens[:30])
print(f"Total number of tokens: {len(tokens)}")

vocab = sorted(set(tokens))
print(f"vocabulary size: {len(vocab)}")

['the', 'project', 'gutenberg', 'ebook', 'of', 'dorothy', 'and', 'the', 'wizard', 'in', 'oz', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the']
Total number of tokens: 42287
vocabulary size: 4331


In [66]:
# Step 2: Data Preparation

# Decide on the sequence length (e.g., bi-grams, tri-grams)
sequence_length = 2  # For bi-grams
# Create input-output pairs for training
input_sequences = [tokens[i:i+sequence_length] for i in range(len(tokens) - sequence_length)]
target_outputs = [tokens[i+sequence_length] for i in range(len(tokens) - sequence_length)]

print(f"input: {input_sequences[:4]}")
print(f"target: {target_outputs[:4]}")

input: [['the', 'project'], ['project', 'gutenberg'], ['gutenberg', 'ebook'], ['ebook', 'of']]
target: ['gutenberg', 'ebook', 'of', 'dorothy']


In [71]:
# Step 3: Tokenization
# Create a vocabulary mapping each unique token to a numerical ID
vocab = {token: i for i, token in enumerate(set(tokens))}

# Convert tokens to numerical IDs
input_sequences_ids = [[vocab[token] for token in sequence] for sequence in input_sequences]
target_outputs_ids = [vocab[token] for token in target_outputs]

In [93]:
# Step 4: Model Building
# Define a simple n-gram language model
from collections import defaultdict
# from nltk.tokenize import word_tokenize

class NGramLanguageModel:
    def __init__(self, n):
        self.n = n
        self.ngrams = defaultdict(list)

    def train(self, tokens):
        # tokens = word_tokenize(corpus)
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i+self.n])
            self.ngrams[ngram[:-1]].append(ngram[-1])

    def generate_text(self, input_string, num_words=50):
        tokens = input_string.lower().strip().split()
        tokens = clean_words(tokens)
        print(tokens)
        seed = tuple(tokens)[-self.n+1:]
        print(seed)
        result = list(seed)
        for _ in range(num_words):
            last_n_words = tuple(result[-self.n+1:])
            next_word = self._get_next_word(last_n_words)
            if next_word:
                result.append(next_word)
            else:
                break
        return ' '.join(result)

    def _get_next_word(self, ngram_prefix):
        possible_next_words = self.ngrams.get(ngram_prefix, [])
        if possible_next_words:
            return possible_next_words[0]  # Just return the first next word for simplicity
        else:
            return None

    def _get_random_seed(self):
        # Return a random seed from the available n-grams
        return next(iter(self.ngrams.keys()))

# Example usage:
corpus = "The quick brown fox jumps over the lazy dog"
n = 2
model = NGramLanguageModel(n)
model.train(tokens)
generated_text = model.generate_text(input_string="my name is dorothy",num_words= 100)
print(generated_text)


['my', 'name', 'is', 'dorothy']
('dorothy',)
dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the project gutenberg ebook of dorothy and the


In [96]:
torch.zeros(5, dtype = torch.int32)

tensor([0, 0, 0, 0, 0], dtype=torch.int32)

##### Now we have the tokens but for training the model we will need to convert each token to vectors aka Embeddings

In [47]:
embedding = nn.Embedding(num_embeddings=len(tokens), embedding_dim= 8)
token_embeddings = embedding(tokens)


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list

In [8]:
%%writefile trial_file.py

import argparse

def addition(a, b):
    return a+b

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--a", type = int, default= 0)
    parser.add_argument("--b", type = int, default= 0)
    args = parser.parse_args()
    a = args.a
    b = args.b
    c = addition(a, b)
    print(f"sum of given to nos. is {c}")

Writing trial_file.py


In [10]:
%run trial_file.py --a 6 --b 7

sum of given to nos. is 13


In [11]:
%mkdir python_scripts

In [15]:
%%time
%run trial_file.py --a 2890 --b 568999

sum of given to nos. is 571889
CPU times: total: 15.6 ms
Wall time: 3.02 ms
