In [None]:
import regex as re
from nltk.util import bigrams
from nltk.lm.preprocessing import pad_both_ends 
from nltk.util import everygrams
from nltk.lm.preprocessing import flatten, padded_everygram_pipeline, padded_everygrams
from nltk.lm import MLE
from nltk.util import ngrams
from nltk.lm import NgramCounter, MLE
from nltk.lm import KneserNeyInterpolated

In [None]:
# Load the raw corpora and the tokenizer pattern as text.
# An explicit encoding is passed so decoding does not depend on the platform's
# default locale encoding (assumes the files are UTF-8 -- TODO confirm).
with open("../tweeteval/datasets/irony/train_text.txt", "r", encoding="utf-8") as f:
    irony = f.read()
with open("../tweeteval/datasets/stance/climate/train_text.txt", "r", encoding="utf-8") as f:
    stance_climate = f.read()
with open("../pattern.txt", "r", encoding="utf-8") as f:
    # The entire file content is compiled as a single regular expression.
    pat = re.compile(f.read())
with open("news-commentary-v16.txt", "r", encoding="utf-8") as f:
    news = f.read()

In [None]:
# Tokenize each corpus line by line: every "\n"-separated line becomes the
# list of matches that `pat` finds in it.
climate_tokens = list(map(pat.findall, stance_climate.split("\n")))
irony_tokens = list(map(pat.findall, irony.split("\n")))

# Only the first 50000 lines of the news corpus are tokenized here.
news_tokens = list(map(pat.findall, news.split("\n")[:50000]))

# Combined corpus: climate sentences, then irony, then news.
all_tokens = [*climate_tokens, *irony_tokens, *news_tokens]

In [None]:
# Validation sentences: the tokenized last 5000 lines of the news corpus.
news_valid = [pat.findall(row) for row in news.split("\n")[-5000:]]

In [None]:
def train_model(dataset, n, vocab_sentences=None):
    """Fit a maximum-likelihood (``MLE``) n-gram language model on ``dataset``.

    Parameters
    ----------
    dataset : sequence of token sequences
        Training sentences. They are iterated twice (once to count n-grams for
        the progress message, once to build the training stream), so this must
        be re-iterable (e.g. a list), not a one-shot generator.
    n : int
        Order of the model handed to ``MLE``.
    vocab_sentences : sequence of token sequences, optional
        Sentences the vocabulary stream is built from. When omitted, the
        notebook-level ``all_tokens`` is used, as before.

    Returns
    -------
    MLE
        The fitted model.
    """
    if vocab_sentences is None:
        vocab_sentences = all_tokens

    # The everygram order has to reach the model order, otherwise a model with
    # n >= 3 is fitted on data that contains no n-grams of its own order.
    # For n == 1 the original order-2 padding is kept so existing counts (and
    # therefore scores) are unchanged.
    order = max(n, 2)

    _, vocab = padded_everygram_pipeline(order, vocab_sentences)

    # Count the unpadded n-grams lazily instead of materializing them all.
    gram_count = sum(1 for _ in flatten(ngrams(sentence, n) for sentence in dataset))
    print(f"Length of {n}-gram list: {gram_count}")

    train, _ = padded_everygram_pipeline(order, dataset)
    lm = MLE(n)
    lm.fit(train, vocab)
    return lm

In [None]:
# Fit the order-1 MLE model on the tokenized news sentences.
lm = train_model(dataset=news_tokens, n=1)

Length of 1-gram list: 1036815


1.7593012055611511e-06

In [None]:
# Sample 15 words from `lm`, starting from the seed text "king" and using a
# fixed random seed for the sampling.
lm.generate(
    num_words=15,
    text_seed=["king"],
    random_seed=4237648273,
)

['hacking',
 'and',
 'Get Started',
 'Now',
 'I',
 'guess',
 'not',
 'forget',
 'this',
 'practical',
 'class',
 'I',
 'hope',
 'ev1',
 'vaccinations']

In [None]:
# Only the last expression of a cell is displayed, so a score computed on its
# own earlier line is silently discarded. Return both values as one tuple:
# (score of "I", log-score of "king").
lm.score("I"), lm.logscore("king")

-13.843234077004007

In [None]:
# Size of the fitted model's vocabulary object.
len(lm.vocab)

51253

In [None]:
# Sample 4 words after the seed text. A fixed random seed (the same value used
# for the other generation example in this notebook) keeps the displayed
# output reproducible across runs.
lm.generate(4, text_seed=["I", "super"], random_seed=4237648273)

['missing', '#kennychesney', '</s>', 'are']

In [None]:
# Flatten all length-2 everygrams (i.e. bigrams; no padding arguments are
# passed) of the validation sentences into one list, then score the whole list
# at once with `lm`.
test = list(
    flatten(everygrams(sent, min_len=2, max_len=2) for sent in news_valid)
)
lm.perplexity(test)

inf

In [None]:
# Distinct perplexity values obtained by scoring each validation bigram alone.
# `perplexity` iterates over its argument and treats every element as one
# n-gram tuple, so a single bigram must be wrapped in a list. Passing the bare
# tuple would make each token string be indexed as if it were an n-gram.
test_set = {lm.perplexity([bigram]) for bigram in test}
print(test_set)

{2188.1995889056157, 11022.75, 1039.812670569869, 4242.658453139963, 279.14643203788313, 7452.739249047012, 801.7870831995953, 1057.000821289603, 1447.3579828045083, 9000.037687421098, 297.7762471214816, inf, 1075.0705203205548, 4024.9392138267167, 4409.100000000002, 2363.5256886766615, 1470.517181047296, 3661.5582522312534, 1745.5794059229884, 1494.8248969072567, 2012.4696069133558, 7918.977371662529, 6646.96834688909, 760.1896551724144, 2046.8732888911006}


In [None]:
def train_model_kne(dataset, n, vocab_sentences=None):
    """Fit an interpolated Kneser-Ney n-gram language model on ``dataset``.

    Parameters
    ----------
    dataset : sequence of token sequences
        Training sentences. They are iterated twice (once to count n-grams for
        the progress message, once to build the training stream), so this must
        be re-iterable (e.g. a list), not a one-shot generator.
    n : int
        Order of the model handed to ``KneserNeyInterpolated``.
    vocab_sentences : sequence of token sequences, optional
        Sentences the vocabulary stream is built from. When omitted, the
        notebook-level ``all_tokens`` is used, as before.

    Returns
    -------
    KneserNeyInterpolated
        The fitted model.
    """
    if vocab_sentences is None:
        vocab_sentences = all_tokens

    # The everygram order has to reach the model order, otherwise a model with
    # n >= 3 is fitted on data that contains no n-grams of its own order.
    # Orders 1 and 2 keep the original order-2 pipeline, so existing results
    # are unchanged.
    order = max(n, 2)

    _, vocab = padded_everygram_pipeline(order, vocab_sentences)

    # Count the unpadded n-grams lazily instead of materializing them all.
    gram_count = sum(1 for _ in flatten(ngrams(sentence, n) for sentence in dataset))
    print(f"Length of {n}-gram list: {gram_count}")

    train, _ = padded_everygram_pipeline(order, dataset)
    lm = KneserNeyInterpolated(n)
    lm.fit(train, vocab)
    return lm

In [None]:
# Fit the order-2 Kneser-Ney model on the tokenized news sentences.
lm2 = train_model_kne(dataset=news_tokens, n=2)

Length of 2-gram list: 988025


In [None]:
import math

In [None]:
# Per-sentence perplexity of `lm2` over the padded validation sentences;
# sentences whose perplexity is infinite are not printed.
test_data, _ = padded_everygram_pipeline(2, news_valid)

# The loop variable is deliberately not called `test`, so the earlier `test`
# list of validation bigrams is not overwritten.
for i, sentence_grams in enumerate(test_data):
    # The pipeline yields lazy per-sentence iterators: materialize each one and
    # score it exactly once. Scoring it a second time would both double the
    # cost and run on an already-consumed iterator.
    sentence_grams = list(sentence_grams)
    perp = lm2.perplexity(sentence_grams)
    if not math.isinf(perp):
        print("PP( line: {0}):{1}".format(i, perp))


KeyboardInterrupt: 

In [None]:

# Distinct perplexity values obtained by scoring each n-gram in `test` alone
# with the Kneser-Ney model.
test_set = set()
for line in test:
    try:
        # `perplexity` expects an iterable of n-gram tuples, so the single
        # n-gram is wrapped in a list; a bare tuple would be iterated token by
        # token and each token string indexed as if it were an n-gram.
        perp = lm2.perplexity([line])
    except ZeroDivisionError:
        # Best-effort handling kept from the original cell: report the failure
        # with a 0 marker and continue with the next n-gram.
        print(0)
    else:
        test_set.add(perp)
print(test_set)

0
0
0
0


TypeError: 'int' object is not subscriptable

In [None]:
# Display the perplexity values currently collected in `test_set`.
test_set

{8592.422257243994, 135948.6666666665, 235469.99788791226, 407846.0, inf}

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e4cdc3a5-dd4a-4d72-a71a-972cea883107' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>