In [213]:
import numpy as np
import string
import re
import os

from tensorflow.keras.datasets import imdb
from tensorflow import keras
import pandas as pd
from tensorflow.keras.layers import LSTM, Activation, Dropout, Dense, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences

from matplotlib import pyplot as plt

In [214]:
PAD_FLAG = 2
START_FLAG = 2
OOV_FLAG = 2
OFFSET = 2

VOCAB_SIZE = 10000 + 3
EMBEDDING_DIM = 50
MAX_SEQUENCE_LENGTH = 100

(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=VOCAB_SIZE,
    start_char=START_FLAG, 
    oov_char=OOV_FLAG,
    index_from=OFFSET)

In [215]:
print(
    'x_train shape:', x_train.shape,
    'y_train shape:', y_train.shape,
)

x_train shape: (25000,) y_train shape: (25000,)


In [216]:
IDX_2_WORD = {
    idx+OFFSET: word
    for word, idx in imdb.get_word_index().items() 
}

IDX_2_WORD[OOV_FLAG] = '[OOV]'
IDX_2_WORD[START_FLAG] = '[START]'
IDX_2_WORD[PAD_FLAG] = '[PAD]'

WORD_2_IDX = { word: idx for idx, word in IDX_2_WORD.items() }
WORD_2_IDX

{'fawn': 34703,
 'tsukino': 52008,
 'nunnery': 52009,
 'sonja': 16818,
 'vani': 63953,
 'woods': 1410,
 'spiders': 16117,
 'hanging': 2347,
 'woody': 2291,
 'trawling': 52010,
 "hold's": 52011,
 'comically': 11309,
 'localized': 40832,
 'disobeying': 30570,
 "'royale": 52012,
 "harpo's": 40833,
 'canet': 52013,
 'aileen': 19315,
 'acurately': 52014,
 "diplomat's": 52015,
 'rickman': 25244,
 'arranged': 6748,
 'rumbustious': 52016,
 'familiarness': 52017,
 "spider'": 52018,
 'hahahah': 68806,
 "wood'": 52019,
 'transvestism': 40835,
 "hangin'": 34704,
 'bringing': 2340,
 'seamier': 40836,
 'wooded': 34705,
 'bravora': 52020,
 'grueling': 16819,
 'wooden': 1638,
 'wednesday': 16820,
 "'prix": 52021,
 'altagracia': 34706,
 'circuitry': 52022,
 'crotch': 11587,
 'busybody': 57768,
 "tart'n'tangy": 52023,
 'burgade': 14131,
 'thrace': 52025,
 "tom's": 11040,
 'snuggles': 52027,
 'francesco': 29116,
 'complainers': 52029,
 'templarios': 52127,
 '272': 40837,
 '273': 52030,
 'zaniacs': 52132,

In [217]:
data_url = "http://nlp.stanford.edu/data/wordvecs/glove.6B.zip"
data_path = keras.utils.get_file("glove.6B.zip", data_url, extract=True)
data_path = os.path.dirname(data_path)+'/glove.6B.50d.txt'

In [218]:
f = open(data_path)

EMBEDDING_DICT = {}

for line in f:
    values = line.split()
    word = values[0]
    vectors = np.asarray(values[1:], 'float32')
    EMBEDDING_DICT[word] = vectors

EMBEDDING_DICT

{'the': array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
        -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
         2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
         1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
        -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
        -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
         4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
         7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
        -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
         1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
       dtype=float32),
 ',': array([ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
        -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
        -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
        -0.4

In [219]:
EMBEDDING_MATRIX = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

for idx in range(VOCAB_SIZE):
    word = IDX_2_WORD.get(idx)
    embed_vector = EMBEDDING_DICT.get(word)
    if embed_vector is not None:
        EMBEDDING_MATRIX[idx] = embed_vector

In [220]:
embedding_layer = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    weights=[EMBEDDING_MATRIX],
    trainable=False
)



In [221]:
def LSTMSenClf(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = LSTM(128, return_sequences=True)(embeddings)

  X = Dropout(0.6)(X)

  X = LSTM(128, return_sequences=True)(X)

  X = Dropout(0.6)(X)

  X = LSTM(128)(X)

  X = Dense(1, activation='sigmoid')(X)

  model = Model(
    inputs=X_indices,
    outputs=X,
    name='lstm_sen_clf'
  )

  return model

In [222]:
x_train_padded = pad_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH, padding='pre')
x_train_padded

array([[1414,   32,    5, ...,   18,  177,   31],
       [ 162,   10, 3214, ...,   15,  144,   94],
       [1300,    3, 1872, ...,    6,  128,  112],
       ...,
       [  10,    5, 4064, ...,    3, 3585,    2],
       [  99, 2197,    7, ...,   11,    8,   22],
       [  77, 1098,   16, ...,  203,  130,    8]], dtype=int32)

In [223]:
model = LSTMSenClf((MAX_SEQUENCE_LENGTH,))

In [224]:
model.summary()

In [225]:
adam = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [226]:
# model.fit(x_train_padded, y_train, epochs=10, batch_size=32, shuffle=True)

In [227]:
x_test = pad_sequences(x_test, maxlen=MAX_SEQUENCE_LENGTH, padding='pre')

In [228]:
model.evaluate(x_test, y_test)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 39ms/step - accuracy: 0.5076 - loss: 0.6953


[0.6965039968490601, 0.5001199841499329]

# Language Model by LSTM

In [229]:
for i in range(3):
    print(i)

0
1
2


In [230]:
test = [1,2,3,4,5]
sequence = ''
input_seq = []
for i in range(1,len(test)):
    n_gram_seq = test[:i+1]
    input_seq.append(n_gram_seq)
input_seq

[[1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5]]

In [231]:
input_seq_size = 15

input_seq = []
for sequence in x_train:
    for i in range(len(sequence)):
        n_gram_seq = sequence[:i+1]
        input_seq.append(n_gram_seq)

In [232]:

input_seq = pad_sequences(input_seq, maxlen=input_seq_size, padding='pre')

input_to_model, label = input_seq[:,:-1], np.reshape(input_seq[:,1], (-1, 1))

In [233]:
input_to_model.shape, label.shape

((5967841, 14), (5967841, 1))

In [234]:
maxlen = 15
LM_data = pad_sequences(x_train, maxlen, padding='pre')

In [235]:
LM_data

array([[  11,   15,  282, ...,   18,  177,   31],
       [   3, 1354,    4, ...,   15,  144,   94],
       [  88,    2,   13, ...,    6,  128,  112],
       ...,
       [5407,  746, 1114, ...,    3, 3585,    2],
       [  13,  250,    7, ...,   11,    8,   22],
       [  11,   15,    4, ...,  203,  130,    8]], dtype=int32)

In [236]:
X, y = LM_data[:,:-1], LM_data[:,-1]

In [237]:
y.shape

(25000,)

In [238]:
# embedding_layer = Embedding(
#     input_dim=VOCAB_SIZE,
#     output_dim=EMBEDDING_DIM,
#     input_length=maxlen-1,
#     weights=[EMBEDDING_MATRIX],
#     trainable=False
# )
# def LSTMLanguageModel(input_shape, vocab_size):

#   model = keras.Sequential()
  
#   model.add(embedding_layer)

#   model.add(LSTM(128, return_sequences=True))
#   model.add(LSTM(128))
  
#   model.add(Dense(128, activation='relu'))
#   model.add(Dense(VOCAB_SIZE, activation='softmax'))
  
#   model.compile(
#     loss='sparse_categorical_crossentropy',
#     optimizer='adam',
#     metrics=['accuracy']
#   )
  
#   model.summary()
#   return model


def LSTMLanguageModel(vocab_size, embedding_dim, maxlen, embedding_matrix):

    model = keras.Sequential()
    
    # Embedding layer
    embedding_layer = Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=maxlen - 1,
        weights=[embedding_matrix],
        trainable=False
    )
    model.add(embedding_layer)
    
    # LSTM layers
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(128))
    
    # Dense layers
    model.add(Dense(128, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    
    # Compile the model
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return model

In [239]:
model.summary()

In [240]:
LSTMmodel = LSTMLanguageModel(VOCAB_SIZE, EMBEDDING_DIM, 14, EMBEDDING_MATRIX)

In [242]:
history = LSTMmodel.fit(input_to_model, label, epochs=1
                        , batch_size=32, shuffle=True)

[1m    13/186496[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:02:30[0m 20ms/step - accuracy: 0.9687 - loss: 0.1432

KeyboardInterrupt: 

In [None]:
history.history['accuracy']

[0.6926000118255615,
 0.7009999752044678,
 0.7095999717712402,
 0.7192000150680542,
 0.734279990196228,
 0.7416399717330933,
 0.7497599720954895,
 0.7612000107765198,
 0.766319990158081,
 0.7771199941635132,
 0.7811200022697449,
 0.7921199798583984,
 0.7988399863243103,
 0.8073199987411499,
 0.8146399855613708,
 0.8200799822807312,
 0.8310400247573853,
 0.8396400213241577,
 0.8434799909591675,
 0.8502799868583679,
 0.8592000007629395,
 0.8645600080490112,
 0.8704000115394592,
 0.8726800084114075,
 0.8728799819946289,
 0.8859599828720093,
 0.8972399830818176,
 0.8963199853897095,
 0.903439998626709,
 0.8993600010871887,
 0.9018800258636475,
 0.915440022945404,
 0.9173200130462646,
 0.9211999773979187,
 0.916920006275177,
 0.923520028591156,
 0.9273599982261658,
 0.9343600273132324,
 0.9334800243377686,
 0.9261999726295471,
 0.9495599865913391,
 0.9452400207519531,
 0.937279999256134,
 0.9416000247001648,
 0.9426800012588501,
 0.9484800100326538,
 0.9530400037765503,
 0.9429600238800049,

In [None]:
history = model.fit(X, y, epochs=50, batch_size=32, shuffle=True)

Epoch 1/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 20ms/step - accuracy: 0.7087 - loss: 1.1504
Epoch 2/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 20ms/step - accuracy: 0.7151 - loss: 1.1314
Epoch 3/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 20ms/step - accuracy: 0.7211 - loss: 1.0772
Epoch 4/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.7344 - loss: 1.0316
Epoch 5/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step - accuracy: 0.7492 - loss: 0.9878
Epoch 6/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 22ms/step - accuracy: 0.7562 - loss: 0.9601
Epoch 7/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 21ms/step - accuracy: 0.7612 - loss: 0.9222
Epoch 8/50
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 20ms/step - accuracy: 0.7730 - loss: 0.8778
Epoch 9/50
[1m782/782[

In [None]:
Tokenizer = Tokenizer.fit_on_sequences(x_train, maxlen=MAX_SEQUENCE_LENGTH, padding='pre', sequences=)

TypeError: Tokenizer.fit_on_sequences() missing 1 required positional argument: 'sequences'

In [None]:
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_sequences(x_train)

In [None]:
seed_text = 'the movie'

def get_sequence(text):
    return pad_sequences(
                [[ int(WORD_2_IDX.get(word)) + 2 for word in seed_text.split()]],
                maxlen=MAX_SEQUENCE_LENGTH-1,
                padding='pre')


LSTMmodel.predict(get_sequence(seed_text)).argmax(axis=1)[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step


2

In [None]:
input_seq

array([[  0,   0,   0, ...,   0,   0,   2],
       [  0,   0,   0, ...,   0,   2,  13],
       [  0,   0,   0, ...,   2,  13,  21],
       ...,
       [247,  19,  11, ...,  21,   3, 203],
       [ 19,  11,  15, ...,   3, 203, 130],
       [ 11,  15,   4, ..., 203, 130,   8]], dtype=int32)

In [None]:
model.predict(get_sequence(seed_text))

In [None]:
sorted_dict = {k: IDX_2_WORD[k] for k in sorted(IDX_2_WORD)}
print(sorted_dict)



In [None]:
def generate_seq(model, seed_text, n_words):
  in_txt = seed_text
  result = []
  for _ in range(n_words):
    encoded = get_sequence(in_txt)
    yhat = model.predict(encoded).argmax(axis=1)[0]
    out_word = ''
    for word, index in tokenizer.word_index.items():
      if index == yhat:
        out_word = word
        break
    in_txt += ' ' + out_word
    result.append(out_word)
  return ' '.join(result), in_txt

In [None]:
tokenizer

<keras.src.legacy.preprocessing.text.Tokenizer at 0x2f43ae510>

In [None]:
model.predict(get_sequence('movie')).argmax(axis=1)[0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step


21

In [None]:
sen = x_train[673]
' '.join([IDX_2_WORD.get(idx) for idx in sen])


"[PAD] i hated the first episode of this show [PAD] [PAD] so much in 1999 that i [PAD] the rest however when it came on 'the paramount comedy [PAD] i watched it in full and to my surprise found it absolutely hilarious [PAD] never judge a comedy series in its first week br br set in 1969 [PAD] stars simon pegg as [PAD] [PAD] editor of an [PAD] like underground magazine called [PAD] his friends are the feminist jill laid back alex and the half wit hugo back in the late [PAD] there was a feeling of incredible optimism amongst the young that they could change the world through the [PAD] of magazines nobody read rather than [PAD] at the [PAD] [PAD] [PAD] is [PAD] towards it arthur [PAD] scripts [PAD] parody a number of that [PAD] icons [PAD] [PAD] 'the [PAD] even the infamous [PAD] [PAD] trial of the early [PAD] excellent performances from the cast julian [PAD] [PAD] [PAD] strangely put me in mind of the richard [PAD] character from [PAD] about the [PAD] its a shame that there was never a s

In [None]:
seed_text += 'film'

In [None]:
seed_text.replace('film', '')

'the movie is terrible but'

In [None]:
seed_text = 'and'

for i in range(10):
    seed_text = seed_text + ' ' + IDX_2_WORD.get(history.predict(get_sequence(seed_text)).argmax(axis=1)[0]+2)

print(f'Ori: i\'t so boring but')    
print(f'GPT: ' + seed_text)

AttributeError: 'History' object has no attribute 'predict'

In [None]:
def chatbot_response(user_input):
    # Find the appropriate response
    for i in range(50):
        user_input =  user_input + ' ' + IDX_2_WORD.get(model.predict(get_sequence(user_input)).argmax(axis=1)[0]+2)
    
    
    return user_input
chatbot_response('it is so boring')
# print("Welcome to the chatbot! Type 'exit' to end the conversation.")

# while True:
#     # Capture user input
#     user_input = input("You: ")
    
#     # Exit condition
#     if user_input.lower() == "bye":
#         print("Chatbot: Goodbye!")
#         break
    
#     # Get the chatbot's response
#     response = chatbot_response(user_input)
    
#     # Display the response
#     print(f"Chatbot: {response}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16

'it is so boring social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social social'

In [None]:
seed_text = 'the movie is terrible but'
generate_seq(model,seed_text, 50)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17

('                                                 ',
 'the movie is terrible but                                                  ')