In [19]:
import csv
import itertools
import operator
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import matplotlib.pyplot as plt
from rnn import RNNNaive


In [20]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /home/user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [21]:
vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

In [22]:
with open('./reddit-comments-2015-08.csv', 'r', encoding='utf8') as f:
    reader = csv.reader(f, skipinitialspace=True)
    next(reader)
    sentences = itertools.chain(*[sent_tokenize(x[0].lower()) for x in reader])
    # Додаємо SENTENCE_START та SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print("Parsed %d sentences." % (len(sentences)))

Parsed 79184 sentences.


In [23]:
for i, sentence in enumerate(sentences[0:10]):
    print("%d) %s" % (i + 1, sentence))

1) SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END
2) SENTENCE_START it's a slight ppr league- .2 ppr. SENTENCE_END
3) SENTENCE_START standard besides 1 points for 15 yards receiving, .2 points per completion, 6 points per td thrown, and some bonuses for rec/rush/pass yardage. SENTENCE_END
4) SENTENCE_START my question is, is it wildly clear that qb has the highest potential for points? SENTENCE_END
5) SENTENCE_START i put in the rules at a ranking site and noticed that top qbs had 300 points more than the top rb/wr. SENTENCE_END
6) SENTENCE_START would it be dumb not to grab a qb in the first round? SENTENCE_END
7) SENTENCE_START in your scenario, a person could just not run the mandatory background check on the buyer and still sell the gun to the felon. SENTENCE_END
8) SENTENCE_START there's no way to enforce it. SENTENCE_END
9) SENTENCE_START an honest seller is going to not sell the gun to them when they see they'r

In [24]:
tokenized_sentences = [word_tokenize(sent) for sent in sentences]

In [25]:
for i, tokenized_sentence in enumerate(tokenized_sentences[0:10]):
    print("%d) %s" % (i + 1, tokenized_sentence))

1) ['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']
2) ['SENTENCE_START', 'it', "'s", 'a', 'slight', 'ppr', 'league-', '.2', 'ppr', '.', 'SENTENCE_END']
3) ['SENTENCE_START', 'standard', 'besides', '1', 'points', 'for', '15', 'yards', 'receiving', ',', '.2', 'points', 'per', 'completion', ',', '6', 'points', 'per', 'td', 'thrown', ',', 'and', 'some', 'bonuses', 'for', 'rec/rush/pass', 'yardage', '.', 'SENTENCE_END']
4) ['SENTENCE_START', 'my', 'question', 'is', ',', 'is', 'it', 'wildly', 'clear', 'that', 'qb', 'has', 'the', 'highest', 'potential', 'for', 'points', '?', 'SENTENCE_END']
5) ['SENTENCE_START', 'i', 'put', 'in', 'the', 'rules', 'at', 'a', 'ranking', 'site', 'and', 'noticed', 'that', 'top', 'qbs', 'had', '300', 'points', 'more', 'than', 'the', 'top', 'rb/wr', '.', 'SENTENCE_END']
6) ['SENTENCE_START', 'would', 'it', 'be', 'dumb', 'not', 'to', '

In [26]:
words_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Знайдено %d унікальних токенів." % len(words_freq.items()))

Знайдено 63011 унікальних токенів.


In [27]:
for key, value in itertools.islice(words_freq.items(), 10):
  print("%s f %d" % (key, value))

SENTENCE_START f 79184
i f 32614
joined f 28
a f 31777
new f 1250
league f 163
this f 9032
year f 751
and f 30055
they f 7856


In [28]:
vocab = words_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])
print("Розмір словника %d." % vocabulary_size)

Розмір словника 8000.


In [29]:
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sentence]

In [30]:
print("\nРечення: '%s'" % sentences[1])
print("\Речення після обробки: '%s'" % tokenized_sentences[1])


Речення: 'SENTENCE_START it's a slight ppr league- .2 ppr. SENTENCE_END'
\Речення після обробки: '['UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', '_', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'a', 'UNKNOWN_TOKEN', 'd', 'i', 's', 'h', 'o', 'n', 'e', 's', 't', 'UNKNOWN_TOKEN', 's', 'e', 'l', 'l', 'e', 'r', 'UNKNOWN_TOKEN', 'i', 's', 'n', "'", 't', 'UNKNOWN_TOKEN', 'g', 'o', 'i', 'n', 'g', 'UNKNOWN_TOKEN', 't', 'o', 'UNKNOWN_TOKEN', 'r', 'u', 'n', 'UNKNOWN_TOKEN', 't', 'h', 'e', 'UNKNOWN_TOKEN', 'c', 'h', 'e', 'c', 'k', 'UNKNOWN_TOKEN', 'i', 'n', 'UNKNOWN_TOKEN', 't', 'h', 'e', 'UNKNOWN_TOKEN', 'f', 'i', 'r', 's', 't', 'UNKNOWN_TOKEN', 'p', 'l', 'a', 'c', 'e', '.', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', 'UNKNOWN_TOKEN', '_', 'UNKNOWN_TOKEN', 'UN

In [31]:
# Стврення тестових даних
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

In [32]:
# Тестові дані
x_example, y_example = X_train[17], y_train[17]
print("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

x:
UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN _ UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN a UNKNOWN_TOKEN d i s h o n e s t UNKNOWN_TOKEN s e l l e r UNKNOWN_TOKEN i s n ' t UNKNOWN_TOKEN g o i n g UNKNOWN_TOKEN t o UNKNOWN_TOKEN r u n UNKNOWN_TOKEN t h e UNKNOWN_TOKEN c h e c k UNKNOWN_TOKEN i n UNKNOWN_TOKEN t h e UNKNOWN_TOKEN f i r s t UNKNOWN_TOKEN p l a c e . UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN UNKNOWN_TOKEN _ UNKNOWN_TOKEN UNKNOWN_TOKEN
[7999 7999 7999 7999 7999 7999 7999 7999 5274 7999 7999 7999 7999 7999
 7999    7 7999  656    6  537 4144 1740 2154 1491  537  755 7999  537
 1491 4496 4496 1491 2475 7999    6  537 2154  166  755 7999 2719 1740
    6 2154 2719 7999  755 1740 7999 2475 1027 2154 7999  755 4144 1491
 7999 1726 4144 1491 1726 3135 7999    6 2154 7999  755 4144 1491 7999
 2515    6

In [33]:
np.random.seed(10)
model = RNNNaive(vocabulary_size)

In [34]:
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

393 ms ± 46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
np.random.seed(10)
model = RNNNaive(vocabulary_size)
model.train_with_sgd(X_train[:1000], y_train[:1000], nepoch=5, evaluate_loss_after=1)

2025-03-27 11:58:59: loss after num_examples_seen=0 epoch=0: 8.986149
2025-03-27 12:05:50: loss after num_examples_seen=1000 epoch=1: 0.002392
2025-03-27 12:12:42: loss after num_examples_seen=2000 epoch=2: 0.001014
2025-03-27 12:19:32: loss after num_examples_seen=3000 epoch=3: 0.000635
2025-03-27 12:26:24: loss after num_examples_seen=4000 epoch=4: 0.000459


In [36]:
def generate_sentence(model, word_to_index, index_to_word, sentence_start_token, sentence_end_token, unknown_token, max_len=50):
    # Речення починанється зі SENTENCE_START
    new_sentence = [word_to_index[sentence_start_token]]
    # Повторюємо допоки не SENTENCE_END
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # Не додаємо в речення UNKNOWN_TOKEN
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, pvals=next_word_probs[0][0])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)

        # або допоки речення надто довге
        if len(new_sentence) > max_len:
            break
        
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    
    return sentence_str

In [38]:
num_sentences = 20
senten_min_length = 10

for i in range(num_sentences):
    sent = []
    while len(sent) < senten_min_length:
        sent = generate_sentence(
            model,
            word_to_index,
            index_to_word,
            sentence_start_token,
            sentence_end_token,
            unknown_token,
            max_len=50
        )
    print(" ".join(sent))
    print()

meal heals cousins survival wallet arrest performances modified tags thing pussy director debates knows 25 beast thru template mac acts courts guard wager libido charge executed kia physical tradition icon wheel adblock draft scaling gorgeous nicer reveal bro aa placed shooting ult tourist gravity doctor adoption email hands fly

arrives ron handy forgotten kit they admittedly waiting wages game skilled jam analogy abide syndrome asylum girl git field guides bedroom bei gon clean pa trek russian contest bathroom stores amazed northern studying ign filed cons underrated survived convincing 7200rpm f1 jim feels buried atomic ceo frustrating message= researchers

years suggest sturdy decent fully annie boom beautiful q=title regards protocol south loans selftext=true sheet studied has injury titles felt depending july statistically numerous browsing launched pack feeding $ banter prospects yourself nerve sufficient older destruction maze discount millions pause temporary moses bread selec