In [1]:
import os
from pathlib import Path

In [2]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
import pandas as pd
import nltk
from nltk.lm import MLE, Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize, sent_tokenize

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Dataset

I've chosen [BBC news dataset](https://www.kaggle.com/datasets/pariza/bbc-news-summary). I selected the sports, tech, and business categories, which together comprise 1422 news articles.

Description of the dataset: This dataset was created using a dataset used for data categorization that onsists of 2225 documents from the BBC news website corresponding to stories in five topical areas from 2004-2005 used in the paper of D. Greene and P. Cunningham. "Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering", Proc. ICML 2006; whose all rights, including copyright, in the content of the original articles are owned by the BBC.

Creating dataframe based on txt files

In [5]:
def create_df_from_folder(folder, suffix):
    records = []
    for fname in os.listdir(folder):
        if not fname.lower().endswith('.txt'):
            continue
        text = open(os.path.join(folder, fname), 'r', errors='ignore').read()
        name, _ = os.path.splitext(fname)
        records.append({
            'file_name': f"{name}_{suffix}.txt",
            'text': text
        })
    return pd.DataFrame(records)


In [6]:
# sport category

sport_news_folder = r'/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/bbc_news/BBC News Summary/News Articles/sport/'

sports_df = create_df_from_folder(sport_news_folder, 'sport')

In [7]:
# tech category

tech_news_folder = r'/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/bbc_news/BBC News Summary/News Articles/tech/'

tech_df = create_df_from_folder(tech_news_folder, 'tech')

In [8]:
# business category

business_path = r'/content/drive/MyDrive/documents/вшэ/10сем/nlp/homeworks/hw_2/bbc_news/BBC News Summary/News Articles/business/'

business_df = create_df_from_folder(business_path, 'business')

In [9]:
news_df = pd.concat([sports_df, tech_df, business_df], ignore_index=True)

In [10]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [11]:
tokenized_texts = []
for txt in news_df['text']:
    tokens = word_tokenize(txt.lower())
    tokenized_texts.append(tokens)

In [12]:
lengths = [len(t) for t in tokenized_texts]
print(f"Average tokens in document: {sum(lengths)/len(lengths)}")
print(f"Min/max tokens: {min(lengths)}/{max(lengths)}")

Average tokens in document: 431.1223628691983
Min/max tokens: 131/3362


In [13]:
from nltk.lm import Vocabulary
from collections import Counter

In [14]:
word_counts = Counter(w for sent in tokenized_texts for w in sent)

In [49]:
vocab = Vocabulary(word_counts, unk_cutoff=3)

In [121]:
n = 3
model = Laplace(n, vocabulary=vocab)

In [122]:
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_texts)

In [123]:
model.fit(train_data, padded_sents)

In [124]:
print(f"vocab size: {len(model.vocab)}")

vocab size: 11815


In [125]:
correct_phrases = [
    "the central bank raised interest rates",
    "the mayor unveiled the new infrastructure plan",
    "researchers reported a breakthrough in cancer treatment",
    "oil prices surged amid supply concerns",
    "the international summit concluded with a joint statement"
]

In [126]:
incorrect_phrases = [
    "bank central the raised rates interest",
    "mayor the plan unveiled infrastructure new the",
    "researchers a in cancer reported breakthrough treatment",
    "surged oil prices amid concerns supply",
    "joint concluded summit international the with statement a"
]

In [127]:
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import ngrams

In [128]:
def ppl_ngram(model, sentence, n):
    tokens = word_tokenize(sentence.lower())
    padded = list(pad_both_ends(tokens, n))
    test_grams = list(ngrams(padded, n))
    return model.perplexity(test_grams)

In [129]:
tokenized_ex = word_tokenize(correct_phrases[0].lower())

In [134]:
ppl_correct = []
ppl_incorrect = []

In [135]:
for s in correct_phrases:
    ppl = ppl_ngram(model, s, 3)
    ppl_correct.append(ppl)
    print(f"{s!r}: {ppl:.2f}")

'the central bank raised interest rates': 5473.44
'the mayor unveiled the new infrastructure plan': 8328.75
'researchers reported a breakthrough in cancer treatment': 12080.24
'oil prices surged amid supply concerns': 7266.01
'the international summit concluded with a joint statement': 7885.12


In [136]:
for s in incorrect_phrases:
    ppl = ppl_ngram(model, s, 3)
    ppl_incorrect.append(ppl)
    print(f"{s!r}: {ppl:.2f}")

'bank central the raised rates interest': 9682.33
'mayor the plan unveiled infrastructure new the': 7666.71
'researchers a in cancer reported breakthrough treatment': 12078.20
'surged oil prices amid concerns supply': 12119.69
'joint concluded summit international the with statement a': 7720.88


In [145]:
import numpy as np

In [144]:
print(f"average perplexity on correct phrases: {np.mean(ppl_correct)}")
print(f"average perplexity on incorrect phrases: {np.mean(ppl_incorrect)}")

average perplexity on correct phrases: 8206.711760397862
average perplexity on incorrect phrases: 9853.56223647606


In [140]:
prompts = [
    ["i", "think"],
    ["she", "goes", "to"],
    ["in", "the", "future"],
    ["blue", "dog"],
    ["once", "upon", "a", "time"]
]

In [146]:
for prompt in prompts:
    gen = model.generate(num_words=20, text_seed=prompt, random_seed=42)
    print(f"prompt: {' '.join(prompt)}")
    print(" ".join(gen))

prompt: i think
the <UNK> cinema chain and the two biggest eurozone economies , it 's about the new find showed that .
prompt: she goes to
show <UNK> . `` the ppi would argue for greenspan to the <UNK> at speed is 512kbps , though ,
prompt: in the future
of aguas argentinas about what that means for technology , are in <UNK> . on the day just prior to
prompt: blue dog
the korean economy in recent weeks during a conference call on tuesday , agence france presse . `` this <UNK>
prompt: once upon a time
when - all of them profited financially . '' <UNK> <UNK> hopes arsenal 's season . according to the 19th
