Dataset link: https://www.kaggle.com/datasets/abhinavmoudgil95/short-jokes/data

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [5]:
# Fetch the short-jokes dataset archive (short-jokes.zip) into the working directory.
!kaggle datasets download -d abhinavmoudgil95/short-jokes

Downloading short-jokes.zip to /content
 71% 7.00M/9.82M [00:00<00:00, 13.5MB/s]
100% 9.82M/9.82M [00:00<00:00, 12.3MB/s]


In [6]:
import zipfile
# Unpack the downloaded archive. The context manager closes the zip file even if
# extraction fails part-way through, so the file handle is never leaked.
with zipfile.ZipFile('/content/short-jokes.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [7]:
import pandas as pd
# Load the extracted CSV of jokes into a DataFrame.
jokes_df = pd.read_csv('/content/shortjokes.csv')

In [8]:
# Preview the first rows to check that the file parsed into the expected columns.
jokes_df.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [9]:
# (rows, columns) of the full dataset.
jokes_df.shape

(231657, 2)

In [10]:
# Keep only the text of the first 5000 jokes, as a NumPy array
jokes = jokes_df['Joke'].iloc[:5000].to_numpy()

In [11]:
# Eyeball the selected texts (NumPy abbreviates the middle of a long array with "...").
print(jokes)

['[me narrating a documentary about narrators] "I can\'t hear what they\'re saying cuz I\'m talking"'
 'Telling my daughter garlic is good for you. Good immune system and keeps pests away.Ticks, mosquitos, vampires... men.'
 "I've been going through a really rough period at work this week It's my own fault for swapping my tampax for sand paper."
 ... 'Sucks that these Crest strips only come in white'
 "I don't like Jewish jokes. Anne Frankly I won't stand them."
 'I like my cigarettes like my Instagram. \\#nofilter Edit: learned formatting']


In [12]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize the text: fit a tokenizer on the selected jokes so that it learns the
# word -> integer index vocabulary (tokenizer.word_index) used by the cells below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(jokes)


In [13]:
# Number of classes for the model's output layer: one per vocabulary word, plus one.
# NOTE(review): the +1 presumably accounts for index 0, which is used as the padding
# value and is not expected to map to any word — confirm against the Keras Tokenizer docs.
total_words = len(tokenizer.word_index)+1
total_words

10501

The `texts_to_sequences` method of the Keras `Tokenizer` class converts a list of texts (sentences or phrases) into sequences of integers. Each word is replaced by the unique integer index it was assigned when the tokenizer was fitted with `fit_on_texts`; words that were not seen during fitting are dropped unless an `oov_token` was configured.

In [14]:
# Create input sequences and labels: every leading slice (two or more tokens) of every
# tokenized joke. The last token of a slice is the word to predict; the tokens before
# it are the context.
input_sequence = [
    encoded_joke[:prefix_end]
    for encoded_joke in (tokenizer.texts_to_sequences([text])[0] for text in jokes)
    for prefix_end in range(2, len(encoded_joke) + 1)
]

In [15]:
# Length of the longest training sequence (context tokens plus the label token)
max_sequence_length = max(map(len, input_sequence))
max_sequence_length

43

In [16]:
#padding of input sequence to max length
from tensorflow.keras.preprocessing.sequence import pad_sequences

# padding='pre' puts the filler at the front, so the real tokens stay right-aligned and
# the word to predict is always in the last column (which the X / y split relies on).
padded_input_sequence = pad_sequences(input_sequence, maxlen = max_sequence_length, padding='pre')

In [17]:
# Features: every column except the last. Label: the last column (the next word's index).
X, y = padded_input_sequence[:, :-1], padded_input_sequence[:, -1]

In [18]:
# (number of training examples, context length)
X.shape

(82904, 42)

In [19]:
# One integer label per training example at this point (before one-hot encoding).
y.shape

(82904,)

In [20]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the next-word labels, as required by categorical cross-entropy.
# The labels are taken from the last column of padded_input_sequence rather than from
# `y` itself, so re-running this cell cannot one-hot encode an already one-hot matrix.
y = to_categorical(padded_input_sequence[:, -1], num_classes=total_words)

In [21]:
#Build LSTM model

from keras import Sequential
from keras.layers import Dense, Embedding, LSTM

# Next-word predictor: embed each token index into 100 dimensions, summarise the context
# with a 150-unit LSTM, then emit a softmax distribution over all vocabulary indices.
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_length-1),
    LSTM(150),
    Dense(total_words, activation='softmax'),
])

In [22]:
# Compile the model: categorical cross-entropy pairs with the one-hot labels and the
# softmax output layer; accuracy is tracked as an additional metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
  # Train the model
# 30 epochs over all (context, next-word) pairs. The returned History object is not
# stored, so it is displayed as the cell's output.
model.fit(X, y, epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x78e1904d7b50>

In [24]:
# Bare expression: displays only the model object's repr.
model

<keras.src.engine.sequential.Sequential at 0x78e275d13c10>

In [32]:
# Generate a new joke based on a seed text
import numpy as np
def generate_joke(seed_text, next_words, model, max_sequence_length):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    Uses the notebook-level ``tokenizer`` (which must already be fitted) and
    ``pad_sequences``.

    Args:
        seed_text: Text to continue. It is re-tokenized on every step so that
            each newly generated word becomes part of the context.
        next_words: Maximum number of words to append.
        model: Trained model whose ``predict`` returns one score per vocabulary
            index for each padded context in the batch.
        max_sequence_length: Length the training sequences were padded to. The
            context fed to the model is one token shorter, because the last
            position held the label during training.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Generation stops early if the model predicts an index that maps
        to no word (e.g. the padding index 0), because the context would not
        change and every remaining step would repeat the same prediction.
    """
    # Reverse vocabulary (index -> word): O(1) lookup per generated word instead of
    # scanning the whole word_index dictionary each time.
    index_to_word = tokenizer.index_word

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        padded_context = pad_sequences(
            [token_list], maxlen=max_sequence_length - 1, padding='pre'
        )
        # verbose=0 keeps predict() from printing a progress line for every word.
        probabilities = model.predict(padded_context, verbose=0)
        # The batch holds a single context, so take the scalar class index of row 0.
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        output_word = index_to_word.get(predicted_index, "")
        if not output_word:
            # No word for this index: stop rather than append blank "words".
            break
        seed_text += " " + output_word
    return seed_text

# Prompt the model continues from, and how many words it should add
seed_text = "Why did the chicken cross the road"
next_words = 10

# Produce the continuation and show it
generated_joke = generate_joke(seed_text, next_words, model, max_sequence_length)
print(f"Generated Joke: {generated_joke}")

Generated Joke: Why did the chicken cross the road i don't fucking know chickens don't even know what roads
