# 1. Basic LLM - 1

In [1]:
#Importing necessary libraries
import nltk
import random
from nltk.util import ngrams
from collections import defaultdict, Counter

#Sample Data
sample_text = """
Once upon a time, in a land far, far away, there lived a king and queen who had a beautiful daughter. The princess was kind and gentle, and everyone loved her.
"""

In [2]:
#Tokenization
nltk.download('punkt')
tokens = nltk.word_tokenize(sample_text.lower())
print(tokens)

['once', 'upon', 'a', 'time', ',', 'in', 'a', 'land', 'far', ',', 'far', 'away', ',', 'there', 'lived', 'a', 'king', 'and', 'queen', 'who', 'had', 'a', 'beautiful', 'daughter', '.', 'the', 'princess', 'was', 'kind', 'and', 'gentle', ',', 'and', 'everyone', 'loved', 'her', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#Ngrams
bigrams = list(ngrams(tokens, 2))
bigram_freq = defaultdict(Counter)

for w1, w2 in bigrams:
    bigram_freq[w1][w2] += 1

print(bigram_freq)

defaultdict(<class 'collections.Counter'>, {'once': Counter({'upon': 1}), 'upon': Counter({'a': 1}), 'a': Counter({'time': 1, 'land': 1, 'king': 1, 'beautiful': 1}), 'time': Counter({',': 1}), ',': Counter({'in': 1, 'far': 1, 'there': 1, 'and': 1}), 'in': Counter({'a': 1}), 'land': Counter({'far': 1}), 'far': Counter({',': 1, 'away': 1}), 'away': Counter({',': 1}), 'there': Counter({'lived': 1}), 'lived': Counter({'a': 1}), 'king': Counter({'and': 1}), 'and': Counter({'queen': 1, 'gentle': 1, 'everyone': 1}), 'queen': Counter({'who': 1}), 'who': Counter({'had': 1}), 'had': Counter({'a': 1}), 'beautiful': Counter({'daughter': 1}), 'daughter': Counter({'.': 1}), '.': Counter({'the': 1}), 'the': Counter({'princess': 1}), 'princess': Counter({'was': 1}), 'was': Counter({'kind': 1}), 'kind': Counter({'and': 1}), 'gentle': Counter({',': 1}), 'everyone': Counter({'loved': 1}), 'loved': Counter({'her': 1}), 'her': Counter({'.': 1})})


In [4]:
#Generating Text
def generate_text(seed, n_words):
    result = [seed]
    for _ in range(n_words):
        next_word_options = bigram_freq[result[-1]]
        next_word = random.choices(list(next_word_options.keys()), list(next_word_options.values()))[0]
        result.append(next_word)
    return ' '.join(result)

generated_text = generate_text('princess', 5)
print(generated_text)

princess was kind and gentle ,


# 2. Simple LLM Which can answer questions.

In [5]:
#Importing necessary libraries
import nltk
import random
from nltk import word_tokenize, sent_tokenize
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

In [6]:
#The dataset
old_text = "The use of n-grams, particularly bigrams, in natural language processing (NLP) carries great importance by providing valuable insights into the structure and patterns of language, enabling the development of more accurate and efficient NLP systems. N-grams work by analyzing the frequencies of consecutive items, such as words or characters, in a given text. This analysis helps in tasks like language modeling, speech recognition, text prediction, and information retrieval. The benefits of utilizing n-grams include improved language understanding, accurate word prediction, enhanced search accuracy, and faster information retrieval."
new_text = "To further enhance the effectiveness of n-grams, one can consider increasing the value of n to include larger sequences of items, such as trigrams or even higher-order n-grams, as it may capture more contextual information. Additionally, considering the size and diversity of the training data used for building n-gram models is important to ensure their applicability across different domains and languages."
combined_text = old_text + " " + new_text

In [7]:
#Tokenize the text
sent_tokens = sent_tokenize(combined_text)
word_tokens = [word_tokenize(t) for t in sent_tokens]

In [8]:
#Creating a trigram model
n = 3
train_data, padded_sents = padded_everygram_pipeline(n, word_tokens)

In [9]:
#Train the model
model = MLE(n)
model.fit(train_data, padded_sents)

In [10]:
#Generate text with various questions
def generate_text(prompt, num_words, model):
    word_list = model.generate(num_words, text_seed=prompt.split())
    response = ' '.join(word_list)
    return response

# Example questions
questions = [
    "What is the importance",
    "How does it work",
    "What are the benefits",
    "How can I improve",
    "What should I consider to increase the performance?"
]

for question in questions:
    print(f"Question: {question}")
    print(f"Answer: {generate_text(question, 20, model)}")
    print("\n")

Question: What is the importance
Answer: by providing valuable insights into the structure and patterns of language , enabling the development of more accurate and efficient


Question: How does it work
Answer: by analyzing the frequencies of consecutive items , such as trigrams or even higher-order n-grams , one can consider increasing


Question: What are the benefits
Answer: of utilizing n-grams include improved language understanding , accurate word prediction , enhanced search accuracy , and information retrieval .


Question: How can I improve
Answer: applicability across different domains and languages . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>


Question: What should I consider to increase the performance?
Answer: it may capture more contextual information . </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s> </s>




# 3. Simple Bigram LLM model

In [23]:
#Importing necessary libraries
import nltk
from nltk import bigrams, FreqDist
from nltk.util import ngrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from random import choice

In [24]:
# Sample text data
text = "Natural language processing is a subfield of linguistics, computer science, and artificial intelligence \
concerned with the interactions between computers and human language. In particular, it focuses on programming \
computers to process and analyze large amounts of natural language data."

# Tokenize the text
tokens = nltk.word_tokenize(text)

In [25]:
# Generate bigrams and their frequency distribution
bigrams = list(ngrams(tokens, 2))
bigram_freq_dist = FreqDist(bigrams)

# Prepare the dataset for training
train_data, padded_sents = padded_everygram_pipeline(2, tokens)

In [26]:
# Train the bigram model
model = MLE(2)
model.fit(train_data, padded_sents)

In [27]:
def generate_sentence(model, num_words, seed_word):
    sentence = [seed_word]
    for _ in range(num_words - 1):
        next_word = model.generate(1, text_seed=sentence)
        sentence.append(next_word)

    return ' '.join(sentence)

# Example questions to the model
questions = [
    "What is natural language processing?",
    "How does artificial intelligence relate to linguistics?",
    "Can computers understand human language?",
]

# Generate answers for the questions
for question in questions:
    tokens = nltk.word_tokenize(question)
    seed_word = choice(tokens)
    generated_sentence = generate_sentence(model, 10, seed_word)
    print(f"Q: {question}\nA: {generated_sentence}\n")

Q: What is natural language processing?
A: ? </s> p u a t a g e r

Q: How does artificial intelligence relate to linguistics?
A: How n d </s> <s> b e n </s> d

Q: Can computers understand human language?
A: understand p r a m p r s </s> s

