1. Clean data


In [1]:
import re

def clean_wikipedia_text(text):
    # Remove citations like [1], [2][3], [citation needed]
    text = re.sub(r'\[[^\]]+\]', '', text)
    
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    
    # Remove parenthetical information
    text = re.sub(r'\([^)]*\)', '', text)
    
    # Remove special formatting (e.g., ''italic'' or '''bold''')
    text = re.sub(r"''+|\<.*?\>", '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove lines starting with special characters (-, *, digits)
    text = '\n'.join(line for line in text.split('\n') if not line.strip().startswith(('-', '*', '1.')))
    
    return text

# Load text and clean
with open('data/wiki_dataset.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

cleaned_text = clean_wikipedia_text(raw_text)

# Save cleaned text
with open('data/cleaned_wiki_text.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_text)

2. Spit data into 3 subset for training, testing and validation 

In [5]:
import random
import re

# Load and preprocess corpus
with open('data/cleaned_wiki_text.txt', 'r') as file:
    # Convert to lowercase
    corpus = file.read().lower()

# Clean up text: remove extra newlines and leading/trailing spaces
corpus = re.sub(r'\n+', ' ', corpus).strip()  # Replace multiple newlines with a space
corpus = re.sub(r'\s+', ' ', corpus) 
 # Replace multiple spaces with a single space

# Split into sentences
sentences = corpus.split('.')

# Remove empty sentences
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Shuffle sentences
random.shuffle(sentences)

# Split into training, validation, and testing sets
train_split = int(0.7 * len(sentences))
val_split = int(0.8 * len(sentences))

train_set = sentences[:train_split]
val_set = sentences[train_split:val_split]
test_set = sentences[val_split:]

# Verify results
print(f"Total sentences: {len(sentences)}")
print(f"Training set: {len(train_set)} sentences")
print(f"Validation set: {len(val_set)} sentences")
print(f"Testing set: {len(test_set)} sentences")

# Optional: Save the sets to files for further use
with open('data/wiki_article/train_set.txt', 'w') as file:
    file.write('. '.join(train_set) + '.')

with open('data/wiki_article/val_set.txt', 'w') as file:
    file.write('. '.join(val_set) + '.')

with open('data/wiki_article/test_set.txt', 'w') as file:
    file.write('. '.join(test_set) + '.')


Total sentences: 589
Training set: 412 sentences
Validation set: 59 sentences
Testing set: 118 sentences


In [8]:
import re
from nltk.tokenize import word_tokenize
from collections import Counter

# Tokenize
tokens = word_tokenize(' '.join(train_set))
vocab_size = 20000 # Limit vocabulary size
token_counts = Counter(tokens)
vocab = {word for word, _ in token_counts.most_common(vocab_size)}

# Clean up unwanted characters
def clean_tokens(tokens):
    cleaned = []
    for sentence in tokens:
        cleaned_sentence = [re.sub(r'[^\w.]', '', word) for word in sentence]  # Keep only words and periods
        cleaned_sentence = [word for word in cleaned_sentence if word]  # Remove empty tokens
        cleaned.append(cleaned_sentence)
    return cleaned

# Replace words not in vocab with <UNK> and clean
def replace_with_unk(data):
    return [
        [word if word in vocab else '<UNK>' for word in word_tokenize(sentence)]
        for sentence in data
    ]

train_tokens = replace_with_unk(train_set)
val_tokens = replace_with_unk(val_set)
test_tokens = replace_with_unk(test_set)

# Clean the tokenized data
train_tokens = clean_tokens(train_tokens)
val_tokens = clean_tokens(val_tokens)
test_tokens = clean_tokens(test_tokens)

# Check results
print(train_tokens[:10])  # Show first 10 sentences from train_tokens
print(val_tokens[:10])    # Show first 10 sentences from val_tokens
print(test_tokens[:10])   # Show first 10 sentences from test_tokens


[['despite', 'adopting', 'an', 'anticorruption', 'law', 'in', '2010', 'corruption', 'prevails', 'throughout', 'the', 'country'], ['some', 'critics', 'of', 'the', 'government', 'have', 'been', 'arrested', 'for', 'allegedly', 'spreading', 'fake', 'news', 'about', 'the', 'covid19', 'pandemic', 'in', 'cambodia'], ['hinduism', 'has', 'left', 'little', 'trace', 'beyond', 'the', 'magical', 'practices', 'of', 'tantricism', 'and', 'a', 'host', 'of', 'hindu', 'gods', 'now', 'assimilated', 'into', 'the', 'spirit', 'world'], ['there', 'is', 'also', 'a', 'frenchlanguage', 'newspaper', 'and', 'some', 'tv', 'channels', 'are', 'available', 'in', 'french'], ['after', 'the', '2013', 'cambodian', 'general', 'election', 'allegations', 'of', 'voter', 'fraud', 'from', 'opposition', 'party', 'cambodia', 'national', 'rescue', 'party', 'led', 'to', 'widespread', 'antigovernment', 'protests', 'that', 'continued', 'into', 'the', 'following', 'year'], ['the', 'credit', 'bureau', 'further', 'increases', 'the', 'tr

In [9]:
import pandas as pd 
from collections import Counter 
from nltk.tokenize import word_tokenize
from itertools import chain
# for i in train_tokens:
#     train_corpus = train_tokens[i]+1
# print(train_corpus)
flattened_tokens = list(chain.from_iterable(train_tokens))
word_counts = Counter(flattened_tokens)
tokens = word_tokenize(corpus.lower()) 
df = pd.DataFrame(word_counts.items(),columns=['word','Frequency'])
df = df.sort_values(by='Frequency', ascending=False).reset_index(drop=True)

# Display the DataFrame
print(df)
for word, count in word_counts.items():
    print(f'{word}: {count}')

            word  Frequency
0            the        637
1             of        333
2            and        321
3             in        250
4             to        171
...          ...        ...
2656    senators          1
2657        1921          1
2658  condudcted          1
2659        base          1
2660   democracy          1

[2661 rows x 2 columns]
despite: 2
adopting: 1
an: 19
anticorruption: 2
law: 7
in: 250
2010: 5
corruption: 8
prevails: 1
throughout: 4
the: 637
country: 47
some: 13
critics: 2
of: 333
government: 23
have: 24
been: 16
arrested: 2
for: 61
allegedly: 1
spreading: 1
fake: 1
news: 1
about: 13
covid19: 2
pandemic: 2
cambodia: 155
hinduism: 5
has: 36
left: 3
little: 1
trace: 1
beyond: 1
magical: 1
practices: 4
tantricism: 1
and: 321
a: 152
host: 1
hindu: 1
gods: 1
now: 6
assimilated: 1
into: 12
spirit: 1
world: 14
there: 9
is: 98
also: 20
frenchlanguage: 1
newspaper: 1
tv: 1
channels: 1
are: 54
available: 2
french: 8
after: 18
2013: 7
cambodian: 47
general: 9
el

In [10]:
from collections import defaultdict

def build_ngram_model(data, n):
    model = defaultdict(lambda: defaultdict(int))
    for sentence in data:
        for i in range(len(sentence) - n + 1):
            ngram = tuple(sentence[i:i+n])
            prefix, token = ngram[:-1], ngram[-1]
            model[prefix][token] += 1
    return model

def backoff_prob(model, ngram):
    prefix, token = ngram[:-1], ngram[-1]
    if prefix in model and sum(model[prefix].values()) > 0:
        if token in model[prefix]:
            return model[prefix][token] / sum(model[prefix].values())
    elif len(prefix) > 1:
        return backoff_prob(model, prefix[1:] + (token,))
    return 0  # Default probability if no valid prefix or tokens


lm1 = build_ngram_model(train_tokens, 4)


In [11]:
# Build the LM2 interpolated method 
def interpolated_prob(model, ngram, lambdas, k):
    n = len(ngram)
    prob = 0
    for i in range(1, n+1):
        prefix, token = ngram[:i-1], ngram[i-1]
        count = model[prefix][token] + k
        total_count = sum(model[prefix].values()) + k * vocab_size
        prob += lambdas[i-1] * (count / total_count)
    return prob

lambdas = [0.1, 0.2, 0.3, 0.4]  # Example weights
k = 1  # Example smoothing parameter


In [None]:
import math
def perplexity(model, tokens, prob_func):
    log_prob = 0
    word_count = 0
    for sentence in tokens:
        for i in range(3, len(sentence)):
            ngram = tuple(sentence[i-3:i+1])
            prob = prob_func(model, ngram)
            # Ensure probability is a small positive value
            if prob <= 0:
                prob = 1e-10  # Assign a small positive probability
            log_prob += math.log2(prob)
            word_count += 1
    return 2 ** (-log_prob / word_count)


pp_lm1 = perplexity(lm1, test_tokens, backoff_prob)
pp_lm2 = perplexity(lm1, test_tokens, lambda m, n: interpolated_prob(m, n, lambdas, k))

print(f"pp_lm1: {pp_lm1}")
print(f"pp_lm2: {pp_lm2}")

pp_lm1: 6482841614.53346
pp_lm2: 19786.35481142604


In [None]:
def generate_text(model, start, length, prob_func):
    text = list(start)
    for _ in range(length - len(start)):
        prefix = tuple(text[-3:])
        if prefix not in model or not model[prefix] or sum(model[prefix].values()) == 0:
            print(f"Prefix not found or has zero probability: {prefix}. Stopping generation.")
            break  # Stop generation if prefix is invalid
        else:
            next_word = max(model[prefix], key=lambda x: prob_func(model, prefix + (x,)))
            text.append(next_word)
    return ' '.join(text)
# Ensure start tokens match the model's keys
start = list(random.choice(list(lm1.keys())))
print('LM1:', generate_text(lm1, start, 50, backoff_prob))


Prefix not found or has zero probability: ('in', 'the', '1970s'). Stopping generation.
LM1: established diplomatic relations with numerous countries the government reports twenty embassies in the country s terror period in the 1970s
