In [1]:
import pandas as pd
from itertools import chain
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import csv

import numpy as np

In [2]:
df = pd.read_csv('shortjokes.csv')
df

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.
...,...,...
231652,231653,The Spicy Sausage by Delia Katessen
231653,231654,"TIL That I Shouldn't have gone to law school, ..."
231654,231655,What did the RAM stick say to the politician? ...
231655,231656,what do you call a play about victorian era me...


In [3]:
from transformers import AutoTokenizer
import nltk
import string
import re
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.tokenize.casual import casual_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
tokennized_jokes = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [5]:
def my_preprocessing(text):
    text = text.replace('\n', '')
    text = text.replace('\r', '')
    text = text.lower()
    tokens = nltk.tokenize.word_tokenize(text)
    punctuationSet = [c for c in string.punctuation] + ['’','“','”','``']
    tokenRemovedNuisance = [token for token in tokens if token not in punctuationSet]

    # Join tokens back into a single string
    processed_text = ' '.join(tokenRemovedNuisance)

    # Fix common contractions
    contractions = {
        r'\b(ca) n\'t\b': 'cannot',
        r'\b(do) n\'t\b': 'do not',
        r'\b(are) n\'t\b': 'are not',
        r'\b(is) n\'t\b': 'is not',
        r'\b(I) \'m\b': 'I am',
        r'\b(they) \'re\b': 'they are',
        # Add more contractions as needed
    }

    for pattern, replacement in contractions.items():
        processed_text = re.sub(pattern, replacement, processed_text)

    return processed_text

In [6]:
df['Joke4'] = df['Joke'].apply(my_preprocessing)

In [7]:
df["Joke4"][0]

"me narrating a documentary about narrators i cannot hear what they are saying cuz i 'm talking ''"

In [8]:
df['Joke'][0]

'[me narrating a documentary about narrators] "I can\'t hear what they\'re saying cuz I\'m talking"'

In [9]:
df = df.drop(columns=['Joke'])
df = df.rename(columns={'Joke4':'Joke'})
df

Unnamed: 0,ID,Joke
0,1,me narrating a documentary about narrators i c...
1,2,telling my daughter garlic is good for you goo...
2,3,i 've been going through a really rough period...
3,4,if i could have dinner with anyone dead or ali...
4,5,two guys walk into a bar the third guy ducks
...,...,...
231652,231653,the spicy sausage by delia katessen
231653,231654,til that i should n't have gone to law school ...
231654,231655,what did the ram stick say to the politician i...
231655,231656,what do you call a play about victorian era me...


In [10]:
def wordtokenizer(text, casual=False, preserve_case=False, reduce_len=False, strip_handles=False):
    """
    An advanced word tokenizer function.

    Args:
    text (str): The input text to tokenize.
    casual (bool): If True, use casual tokenizer for social media text. Default is False.
    preserve_case (bool): If False, lowercase all tokens. Default is False.
    reduce_len (bool): If True, replace repeated character sequences of length 3 or greater with sequences of length 3. Default is False.
    strip_handles (bool): If True, remove Twitter-style @handles. Default is False.

    Returns:
    list: A list of tokenized words.
    """

    # Pre-processing
    if not preserve_case:
        text = text.lower()

    if reduce_len:
        text = re.sub(r'(.)\1{2,}', r'\1\1\1', text)

    if strip_handles:
        text = re.sub(r'@\w+', '', text)

    # Tokenization
    if casual:
        tokens = casual_tokenize(text, preserve_case=preserve_case, reduce_len=False, strip_handles=False)
    else:
        tokens = word_tokenize(text)

    # Post-processing
    # Handle contractions
    contraction_mapping = {
        "n't": "not",
        "'m": "am",
        "'re": "are",
        "'s": "is",
        "'ll": "will",
        "'ve": "have",
        "'d": "would"
    }

    processed_tokens = []
    for token in tokens:
        if token in contraction_mapping:
            if processed_tokens and processed_tokens[-1].lower() in ["ca", "wo", "sha"]:
                processed_tokens[-1] += token
            else:
                processed_tokens.append(contraction_mapping[token])
        else:
            processed_tokens.append(token)

    return processed_tokens

In [11]:
jokes_list = df['Joke'].tolist()

# Apply wordtokenizer to each joke
tokenized_jokes = [wordtokenizer(joke, casual=True, preserve_case=False, reduce_len=True) for joke in jokes_list]

# Convert the list of tokenized jokes to a numpy array
jokes_array = np.array(tokenized_jokes, dtype=object)

In [12]:
jokes_array

array([list(['me', 'narrating', 'a', 'documentary', 'about', 'narrators', 'i', 'cannot', 'hear', 'what', 'they', 'are', 'saying', 'cuz', 'i', "'", 'm', 'talking', "'", "'"]),
       list(['telling', 'my', 'daughter', 'garlic', 'is', 'good', 'for', 'you', 'good', 'immune', 'system', 'and', 'keeps', 'pests', 'away.ticks', 'mosquitos', 'vampires', '...', 'men']),
       list(['i', "'", 've', 'been', 'going', 'through', 'a', 'really', 'rough', 'period', 'at', 'work', 'this', 'week', 'it', "'", 's', 'my', 'own', 'fault', 'for', 'swapping', 'my', 'tampax', 'for', 'sand', 'paper']),
       ...,
       list(['what', 'did', 'the', 'ram', 'stick', 'say', 'to', 'the', 'politician', 'i', "'", 'm', 'pc2']),
       list(['what', 'do', 'you', 'call', 'a', 'play', 'about', 'victorian', 'era', 'menstruation', 'a', 'period', 'piece']),
       list(['calculus', 'should', 'be', 'taught', 'in', 'every', 'high', 'school', 'around', 'the', 'world', 'it', 'is', 'such', 'an', 'integral', 'field', 'of', 'math']

In [13]:
jokes_array[0]

['me',
 'narrating',
 'a',
 'documentary',
 'about',
 'narrators',
 'i',
 'cannot',
 'hear',
 'what',
 'they',
 'are',
 'saying',
 'cuz',
 'i',
 "'",
 'm',
 'talking',
 "'",
 "'"]

In [14]:
def flatten(ls):
    """
    Flatten list of list
    """
    return list(chain.from_iterable(ls))

def create_lookup_dict(tokenized, n_min=None):
    """
    Create lookup dictionary from list of words (lyrics)
    """
    word_counts = Counter(tokenized)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    if n_min is not None:
        sorted_vocab = {k: v for k, v in word_counts.items() if v >= n_min}
    vocab_to_int = {word: i for i, word in enumerate(sorted_vocab, 0)}
    int_to_vocab = {i: word for word, i in vocab_to_int.items()}
    return (vocab_to_int, int_to_vocab)

In [15]:
jokes_array = flatten(jokes_array)
jokes_array = [token if token != '\n' else ' ' for token in jokes_array]
word_counts = Counter(jokes_array)
vocab_to_int, int_to_vocab = create_lookup_dict(jokes_array, n_min=None)

In [16]:
vocab_to_int["they"]

27

In [17]:
len(vocab_to_int)

71826

In [18]:
int_to_vocab[12]

'of'

In [19]:
sequence_length = 4

tokenized_indices = [vocab_to_int.get(token, 0) for token in jokes_array]

X, target = [], []
for n in range(0, len(tokenized_indices) - sequence_length, 1):
  x = tokenized_indices[n: n + sequence_length]
  y = tokenized_indices[n + sequence_length]
  X.append(np.array(x))
  target.append(y)
X = np.array(X)
target = np.array(target)

In [20]:
X[0]

array([  16, 9370,    1, 3061])

In [21]:
target[0]

45

In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

class MyDataSet(torch.utils.data.Dataset):
  def __init__(self, X, y):
    super(MyDataSet, self).__init__()
    self._X = X
    self._y = y

  def __len__(self):
    return self._X.shape[0]

  def __getitem__(self, index):
    X = self._X[index]
    y = self._y[index]
    return X, y

In [23]:
# Hyperparameters
LEARNING_RATE = 0.001
BATCH_SIZE = 512
NUM_EPOCHS = 5

# Classification
NUM_CLASSES = len(vocab_to_int)

dataset = MyDataSet(X, target)

trainloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [24]:
from typing_extensions import Self
class Simple_LSTM(nn.Module):
    def __init__(self):
        super(Simple_LSTM, self).__init__()
        self.embeddings = nn.Embedding(num_embeddings=len(vocab_to_int), embedding_dim=200)
        self.lstm = nn.LSTM(input_size=200, hidden_size=256, num_layers=2, dropout=0.2, batch_first=True)
        self.fc = nn.Linear(256, len(vocab_to_int))
        self.attention = nn.Linear(256, 1)

    def forward(self, x):
        x = self.embeddings(x)
        lstm_out, _ = self.lstm(x)
        attn_weights = torch.softmax(self.attention(lstm_out), dim=1)
        context_vector = torch.sum(attn_weights * lstm_out, dim=1)
        out = self.fc(context_vector)
        return out

In [25]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        X = X.to('cuda')
        y = y.to('cuda')
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 1000 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def generate(model, start_word, pad_value=0, predict_len=40):

    words = word_tokenize(start_word)
    start_word_ids = []

    predicted = words

    word_ids = [vocab_to_int.get(word, pad_value) for word in words]

    current_seq = [np.pad(word_ids, (4 - len(word_ids), pad_value), 'constant')]

    for _ in range(predict_len):
        current_seq = torch.LongTensor(current_seq).to('cuda')

        p = model(current_seq)
        p = nn.Softmax(dim=1)(p).cpu().detach().numpy()

        p = p[0]

        word_i = np.random.choice(np.arange(0, p.shape[0]), p=p)

        word = int_to_vocab[word_i]

        predicted.append(word)

        current_seq = current_seq.detach().cpu().detach().numpy()
        current_seq = np.roll(current_seq, -1, 1)
        current_seq[-1][-1] = word_i
    gen_sentences = ' '.join(predicted)
    return gen_sentences

In [26]:
model = Simple_LSTM().to('cuda')
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)

In [27]:
for t in range(NUM_EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(trainloader, model, loss_fn, optimizer)
    with torch.no_grad():
      print(generate(model, 'You are', predict_len=40))
print("Done!")

Epoch 1
-------------------------------
loss: 11.181231  [    0/4442944]
loss: 5.953588  [512000/4442944]
loss: 5.719442  [1024000/4442944]
loss: 5.598287  [1536000/4442944]
loss: 5.309292  [2048000/4442944]
loss: 5.532105  [2560000/4442944]
loss: 5.137928  [3072000/4442944]
loss: 5.283448  [3584000/4442944]
loss: 5.437125  [4096000/4442944]


  current_seq = torch.LongTensor(current_seq).to('cuda')


You are scary movement and named drove to facebook learn how to make your order feel wtf these monsters are cool enough stance later without the record buzzz get thise itself today rich girl are you love ... ' ' this is
Epoch 2
-------------------------------
loss: 4.983220  [    0/4442944]
loss: 4.837670  [512000/4442944]
loss: 5.141973  [1024000/4442944]
loss: 5.100200  [1536000/4442944]
loss: 5.108617  [2048000/4442944]
loss: 4.883153  [2560000/4442944]
loss: 5.008471  [3072000/4442944]
loss: 4.956040  [3584000/4442944]
loss: 4.883723  [4096000/4442944]
You are longer further or long-pending what do you call a used pig a mexican judges a rapist the pharmacy comes down in paris they now want a lz i ' m sorry ... what is the most of the health has
Epoch 3
-------------------------------
loss: 4.866211  [    0/4442944]
loss: 4.540312  [512000/4442944]
loss: 4.850302  [1024000/4442944]
loss: 5.047641  [1536000/4442944]
loss: 4.858783  [2048000/4442944]
loss: 5.372711  [2560000/4442944]


In [28]:
print(generate(model, 'I love', predict_len=20))

I love like a ' word for trump ' s similar professor nodding ' ' i n go give someone to brush
