In [None]:
import regex as re
from nltk.util import bigrams
from nltk.lm.preprocessing import pad_both_ends 
from nltk.util import everygrams
from nltk.lm.preprocessing import flatten, padded_everygram_pipeline, padded_everygrams
from nltk.util import ngrams
from nltk.lm import NgramCounter, MLE
from nltk.lm import KneserNeyInterpolated

In [None]:
with open("../tweeteval/datasets/irony/train_text.txt", "r") as f:
    irony = f.read()
with open("../tweeteval/datasets/stance/climate/train_text.txt", "r") as f:
    stance_climate = f.read()
with open("../pattern.txt", "r") as f:
    pat = re.compile(f.read())
with open("news-commentary-v16.txt") as f:
    news = f.read()

In [None]:
climate_tokens = [re.findall(pat, line) for line in stance_climate.split("\n")]

irony_tokens = [re.findall(pat, line) for line in irony.split("\n")]

news_tokens = [re.findall(pat, line) for line in news.split("\n")[:50000]]

all_tokens = climate_tokens + irony_tokens + news_tokens

In [None]:
news_valid = [re.findall(pat, line) for line in news.split("\n")[-5000:]]

In [None]:
def train_model(dataset, n):
    _, vocab = padded_everygram_pipeline(2, all_tokens)
    grams = list(flatten(ngrams(sentence, n) for sentence in dataset))
    print(f"Length of {n}-gram list: {len(grams)}")
    train, _ = padded_everygram_pipeline(2, dataset)
    lm = MLE(n)
    lm.fit(train, vocab)
    return lm

In [None]:
lm = train_model(news_tokens,1)

Length of 1-gram list: 1036815


In [None]:
lm.generate(15, text_seed=["king"], random_seed=4237648273)

['</s>',
 'sons',
 'among',
 'the',
 'EFSF',
 'be',
 'lost',
 'generation',
 'of',
 'the',
 'United States',
 'and',
 'inundate',
 'it',
 'could']

In [None]:
lm.score("I")
lm.logscore("king")

-18.11656606515653

In [None]:
len(lm.vocab)

51253

In [None]:
lm.generate(4, text_seed=["I","super"])

['maximum', 'risk', 'factor', 'in']

In [None]:
test = list(flatten(everygrams(sent, 2, 2) for sent in news_valid))

In [None]:
test_set = set()
for line in test:
    perp = lm.perplexity(line)
    test_set.add(perp)
print(test_set)

{5248.205036036329, 27013.4787050391, 17797.529186749107, 11404.703631308228, 8468.6152460498, 119830.8226081438, 87447.30769230759, 6687.147058823536, 84266.3341920088, 6445.250330543317, inf, 7476.457699533605, 21943.12815911232, 8762.897613380563, 5058.359707917539, 9797.217371694791, 204.4693138844952, 37581.94462763385, 10573.307877189174, 11351.136046838596, 24666.78447277755, 6877.295052773287, 5728.790862957998, 487.66706354440413, 5867.671610216974, 28270.584350753816, 15342.76464653909, 15223.98729428257, 169466.37452276147, 894.6868201412238}


In [None]:
def train_model_kne(dataset, n):
    _, vocab = padded_everygram_pipeline(2, all_tokens)
    grams = list(flatten(ngrams(sentence, n) for sentence in dataset))
    print(f"Length of {n}-gram list: {len(grams)}")
    train, _ = padded_everygram_pipeline(2, dataset)
    lm2 = KneserNeyInterpolated(n)
    lm2.fit(train, vocab)
    return lm2

In [None]:
lm3 = train_model_kne(news_tokens, 2)

Length of 2-gram list: 988025


In [None]:
lm3.perplexity(test)

KeyboardInterrupt: 

In [None]:
import math

In [None]:
test_data, _ = padded_everygram_pipeline(2, news_valid)

for i, test in enumerate(test_data):
    if not math.isinf(lm2.perplexity(test)):
        print("PP( line: {0}):{1}".format(i, lm2.perplexity(test)))


NameError: name 'lm2' is not defined

In [None]:
test_set = set()
for line in test:
    try:
        perp = lm2.perplexity(line)
        test_set.add(perp)
    except ZeroDivisionError:
        pass
print(test_set)

KeyboardInterrupt: 

In [None]:
test_set

{8592.422257243994, 135948.6666666665, 235469.99788791226, 407846.0, inf}

In [None]:
ngrams_linewise = list(list(everygrams(sent, 2, 2)) for sent in news_valid)
len(ngrams_linewise)

5000

In [None]:
lm = train_model_kne(news_tokens, 2)
perplexity_list = []
for line in ngrams_linewise:
    try:
        perplexity_list.append(lm.perplexity(line))
    except ZeroDivisionError:
        perplexity_list.append("inf")

Length of 2-gram list: 988025


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e4cdc3a5-dd4a-4d72-a71a-972cea883107' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>