In [1]:
import regex as re
from nltk.util import bigrams
from nltk.lm.preprocessing import pad_both_ends 
from nltk.util import everygrams
from nltk.lm.preprocessing import flatten, padded_everygram_pipeline, padded_everygrams
from nltk.util import ngrams
from nltk.lm import NgramCounter, MLE
from nltk.lm import KneserNeyInterpolated

In [2]:
with open("../tweeteval/datasets/irony/train_text.txt", "r") as f:
    irony = f.read()
with open("../tweeteval/datasets/stance/climate/train_text.txt", "r") as f:
    stance_climate = f.read()
with open("../pattern.txt", "r") as f:
    pat = re.compile(f.read())
with open("news-commentary-v16.txt") as f:
    news = f.read()

In [16]:
climate_tokens = [re.findall(pat, line) for line in stance_climate.split("\n")]

irony_tokens = [re.findall(pat, line) for line in irony.split("\n")]

news_tokens = [re.findall(pat, line) for line in news.split("\n")[:5000]]

all_tokens = irony_tokens + news_tokens

In [17]:
news_valid = [re.findall(pat, line) for line in news.split("\n")[-5000:]]

In [18]:
def train_model(dataset, n):
    _, vocab = padded_everygram_pipeline(2, all_tokens)
    grams = list(flatten(ngrams(sentence, n) for sentence in dataset))
    print(f"Length of {n}-gram list: {len(grams)}")
    train, _ = padded_everygram_pipeline(2, dataset)
    lm = MLE(n)
    lm.fit(train, vocab)
    return lm

In [19]:
lm = train_model(news_tokens,1)

Length of 1-gram list: 101784


In [20]:
lm.generate(15, text_seed=["king"], random_seed=231)

['the',
 'world',
 'should',
 'run',
 'the',
 'presidency',
 'with',
 'better',
 'returns',
 'on',
 'exports',
 'this',
 'for',
 'three-quarters',
 'of']

In [21]:
lm.score("I")
lm.logscore("king")

-inf

In [22]:
len(lm.vocab)

20187

In [23]:
lm.generate(4, text_seed=["I","super"])

['field', 'is', 'a', 'primary']

In [11]:
test = list(flatten(everygrams(sent, 2, 2) for sent in news_valid))

In [24]:
test_set = set()
for line in test:
    perp = lm.perplexity(line)
    test_set.add(perp)
print(test_set)

{19459.096693351745, 8837.301149106554, 16663.774853122435, 667.6200421159634, 7452.266666666661, 10658.186207622934, 1828.3527845473116, 9127.12538024249, inf, 7215.626172874162, 19760.806107039258, 817.6642226197633, 79043.22442815703, 24393.268642140174, 6986.5, 211.1199944663984, 14431.252345748324, 39521.612214078516, 8426.036035570185, 8702.372597415157}


In [25]:
def train_model_kne(dataset, n):
    _, vocab = padded_everygram_pipeline(2, all_tokens)
    grams = list(flatten(ngrams(sentence, n) for sentence in dataset))
    print(f"Length of {n}-gram list: {len(grams)}")
    train, _ = padded_everygram_pipeline(2, dataset)
    lm2 = KneserNeyInterpolated(n)
    lm2.fit(train, vocab)
    return lm2

In [26]:
lm3 = train_model_kne(news_tokens, 2)

Length of 2-gram list: 96905


In [27]:
lm3.perplexity(test)

KeyboardInterrupt: 

In [None]:
import math

In [None]:
test_data, _ = padded_everygram_pipeline(2, news_valid)

for i, test in enumerate(test_data):
    if not math.isinf(lm2.perplexity(test)):
        print("PP( line: {0}):{1}".format(i, lm2.perplexity(test)))


NameError: name 'math' is not defined

In [28]:
test_set = set()
for line in test:
    try:
        perp = lm2.perplexity(line)
        test_set.add(perp)
    except ZeroDivisionError:
        pass
print(test_set)

NameError: name 'lm2' is not defined

In [None]:
test_set

set()

In [30]:
ngrams_linewise = list(list(everygrams(sent, 2, 2)) for sent in news_valid)
len(ngrams_linewise)

5000

In [33]:
from tqdm import tqdm
lm = train_model_kne(irony_tokens, 2)
perplexity_list = []
for line in tqdm(ngrams_linewise):
    try:
        perplexity_list.append(lm.perplexity(line))
    except ZeroDivisionError:
        perplexity_list.append("inf")


Length of 2-gram list: 36082
  1%|          | 50/5000 [00:17<28:52,  2.86it/s]


KeyboardInterrupt: 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e4cdc3a5-dd4a-4d72-a71a-972cea883107' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>