# Homemade chat GPT (from IMDb_dataset)

In [1]:
# Load the modules

import json
import numpy as np
from nltk import word_tokenize, download
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint

from utils import clean_utterance

download('punkt')


2024-04-15 19:23:51.253018: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-15 19:23:51.258177: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-15 19:23:51.315094: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /home/valentin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Preparing data

In [4]:



# Load the dataset and limit the number of samples
with open('./IMDb_dataset.json', 'rt') as f:
    imdb_data = json.load(f)


# We create a list of tokenized positive and negative texts
print("Starting tokenization for positive texts")
pos_texts = [clean_utterance(word_tokenize(x[1])) for x in imdb_data if x[0] == 'pos']

print("Starting tokenization for negative texts")
neg_texts = [clean_utterance(word_tokenize(x[1])) for x in imdb_data if x[0] == 'neg']



First sample: ['pos', 'For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.']
Last sample: ['neg', 'Not that I dislike childrens movies, but this was a tearjerker with few redeeming qualities. M.J. Fox was the perfect voice for Stuart and the rest of the talent was wasted. Hugh Laurie can be amazingly funny, but is not given the chance in this movie. It´s sugar-coated sugar and would hardly appeal to anyone over 7 years of age. See Toy Story, Monsters Inc. or Shrek instead. 3/10']
tokenized text[i] = ['for', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'a', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem', '.', 'imagine', 'a', 'movie', 'where', 'joe', 'piscopo', 'is', 'actually', 'funny', '!', 'maureen', 'stapleton

### Prepare the vocabulary on which we will operate

This is the standard stuff that you should already be acquainted with. 

We provide a function to get token counts, i.e., list all tokens present in list of texts along with their number of occurrences. The function returns a dictionary where the keys are the tokens and the values the number of occurrences in the dataset.

Up to you to select your vocabulary and create the corresponding token_to_id and id_to_token mappings. Because, after selection of the vocabulary, some of the tokens will not be part of the vocabulary, we need to insert the special token '<unk>'. 
    
1. Create a vocabulary with tokens that appear at least 30 or 20 times in the data, plus the <unk> token. The vocabulary is typically represented as a dictionary that maps tokens (strings) to integers so you can call vocab['real'] to get the integer id of the token 'real'. Also create the inverse mapping from id to token as a 
list (not necessary but very useful for debugging and pretty printing).
    
2. Encode pos_texts and neg_texts as list of integers with your vocabulary
    
    
We provide in the next cell two useful functions to encode/decode a sequence: list of strings to list of integers for the former, and conversely for the latter.

In [70]:
#
# Scan the list of texts and return a list of the tokens therein along with their
# number of occurrences, sorted in descending order.
#
def get_token_counts(data: list) -> dict:
    '''
    Create vocabulary from a bunch of (tokenized) texts. 
    
    Returns:
        - token count (dict)
    '''

    tokcnt = {}    
    
    for utterance in data:
        for token in utterance:
            tokcnt[token] = 1 if token not in tokcnt else tokcnt[token] + 1

    return dict(sorted(tokcnt.items(), key=lambda x: x[1], reverse = True))


count = get_token_counts(pos_texts)

#
# Pretty print a number of things
#
print('total number of tokens in dataset =', len(count))
print('most frequent tokens:')
for x in list(count.keys())[:20]:
    print(f"   {x:20}  {count[x]}")
print('\nleast frequent tokens:')
for x in list(count.keys())[-20:]:
    print(f"   {x:20}  {count[x]}")

total number of tokens in dataset = 77041
most frequent tokens:
   the                   172318
   ,                     144077
   .                     117678
   and                   89398
   a                     83300
   of                    76630
   to                    66455
   is                    58467
   in                    49797
   it                    47350
   i                     40267
   that                  35526
   this                  34881
   's                    32132
   as                    26253
   with                  23197
   was                   22685
   for                   22303
   but                   20731
   film                  20284

least frequent tokens:
   vulgarities           1
   rêves                 1
   objectifier           1
   disaster.one          1
   ketty                 1
   konstadinou           1
   kavogianni            1
   'guilty               1
   laughing.every        1
   heart.my              1
   vassilis        

In [71]:
#
# Create a vocabulary with tokens that appear at least MINOCC times in the data, plus the <unk> token 
# with id 0.
#
# The vocabulary is typically represented as a dictionary that maps tokens (strings) to integers so you 
# can call vocab['real'] to get the integer id of the token 'real'. Also create the inverse mapping 
# from id to token as a string list (not necessary but very useful for debugging and pretty printing).
#

MINOCC = 30 # you can also play with smaller number and see what happens to your generator

int_to_word = []
vocab = {}

int_id = 1
vocab['<unk>'] = 0
int_to_word.append('<unk>')
for token in count:
    if count[token] >= MINOCC:
        vocab[token] = int_id
        int_to_word.append(token)
        int_id += 1

In [72]:
#
# 2. Encode pos_texts and neg_texts as list of integers with your vocabulary
#
# We strongly suggest to write an encode_sequence function that takes as input a list of 
# tokens (strings) and outputs the corresponding list of integers given the vocabulary. 
# Having a decode_sequence function that does the inverse operation is also very practical.
#
# Encoding the first positive comment should give something like (showing comment as a list 
# of tokens, then encoded as a list of ids):
#
# ['for', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'a', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem', '.', 'imagine', 'a', 'movie', 'where', 'joe', 'piscopo', 'is', 'actually', 'funny', '!', 'maureen', 'stapleton', 'is', 'a', 'scene', 'stealer', '.', 'the', 'moroni', 'character', 'is', 'an', 'absolute', 'scream', '.', 'watch', 'for', 'alan', 'the', 'skipper', 'hale', 'jr.', 'as', 'a', 'police', 'sgt', '.']
# [18, 5, 21, 12, 231, 90, 1144, 46, 268, 26, 5, 166, 6, 648, 4172, 4427, 18, 13, 1031, 3, 918, 5, 21, 125, 772, 0, 8, 190, 182, 33, 5351, 0, 8, 5, 136, 0, 3, 1, 0, 114, 8, 36, 1744, 2434, 3, 117, 18, 1283, 1, 0, 0, 1837, 15, 5, 531, 6210, 3]
#
# In the end, pos_texts should be converted to a list of list of integers.
#

def encode_sequence(tokens: list[str], vocab: dict) -> list[int]:
    '''
    Encode a list of tokens into a list of integers using a vocabulary.
    '''
    
    return [vocab[token] if token in vocab else 0 for token in tokens]


encoded_pos_texts = [encode_sequence(x, vocab) for x in pos_texts]
encoded_neg_texts = [encode_sequence(x, vocab) for x in neg_texts]

## Train a basic RNN-LM model

We will guide you through the following steps:

1. Prepare the training data for LM with a large number of pairs (history, next_token)
2. Define the model as a tensorflow.keras model
3. Train the model and play with it

#### Prepare training data for LM

We first need to prepare training data for our simplified LM model. Here, training data consists of fixed-length sequences with the corresponding label, i.e., the token that follows. For a 6-gram model, we thus have (all of these should be encoded as integers of course)

```
['for', 'a', 'movie', 'that', 'gets']  >>  no
['that', 'gets', 'no', 'respect', 'there']  >>  sure
['respect', 'there', 'sure', 'are', 'a']  >>  lot
['are', 'a', 'lot', 'of', 'memorable']  >>  quotes
``` 

We will design a closed vocabulary LM (i.e., no possibility of assigning probability to the <unk> token or having the <unk> token in the history), we discard sequences (history and label) where <unk> appears. As training data are documents rather than sentences, we also avoid sequences with end of sentence punctuation marks in the history (here, only the period is considered).

**TODO**

Create two arrays X and Y containing the training data, where
- X[i]: list[int] ==> history represented as as sequence of token ids for the i'th training sample 
- Y[i]: int       ==> token id to predict following X[i]
    
To generate the training samples, scan the input sequences and retain all couples (history, prediction) that you can make and where the <unk> token does not appear (neither in the history, nor in the prediction). The end-of-sentence punctuation (.) should also not appear in the history but has to appear in the prediction: it will be used in generation to stop the generator upon reaching the end of the sentence. 
    
Taking all possible (history, prediction) pairs might yield too many training data so you might want to downsize if necessary to somewhere around 500,000 samples. This can be done with a step parameter when you scan an input sequence, e.g., something like

```python
for i, utterance in enumerate(pos_texts_encoded):    
    for j in range(0, len(utterance) - input_length, step):
        ### check if utterance[j:j+input_length] is a valid history
        ### check of utterance[j+input_length] is a valid prediction
```

In [105]:

input_length = 5     # define length of history


def create_x_y(encoded_texts, nb_sample_max=None, step=1):
    X = []
    Y = []
    point_indice = vocab['.']

    if nb_sample_max is None:
        for text in encoded_texts:
            for i in range(0, len(text) - input_length, step):
                if text[i+input_length] != 0 and text[i+input_length] != len(vocab):
                    X.append(text[i:i+input_length])
                    Y.append(text[i+input_length])
            X.append(text[-input_length:])
            Y.append(point_indice)

    else:
        for text in encoded_texts:
            if nb_sample_max <= 0:
                break
            for i in range(0, len(text) - input_length, step):
                if text[i+input_length] != 0 and text[i+input_length] != len(vocab):
                    nb_sample_max -= 1
                    X.append(text[i:i+input_length])
                    Y.append(text[i+input_length])
            X.append(text[-input_length:])
            Y.append(point_indice)
    
    return X, Y

X_good, Y_good = create_x_y(encoded_pos_texts, 500000, 4)


print('number of sequences for training =', len(X))

False
number of sequences for training = 508437


### Define and train a RNN LM

The following two cells define the model and run training. You should only adapt things here, in particular to your vocabulary size in the first cell. The rest can be left untouched.

**TODO**

Draw the architecture of the model that is defined and take a look at
https://www.tensorflow.org/api_docs/python/tf/keras/losses/SparseCategoricalCrossentropy
for the definition of the sparse categorical cross-entropy loss function. How adapted is it to our problem? 

In [74]:
#
# Define the model and the hyperparameters such as embedding dimension and LSTM state dimension.
#


vocab_size = len(vocab) ### set to your vocabulary size, i.e., number of tokens in the vocabulary, including <unk>
embedding_size = 100 # dimension of the input embeddings
lstm_size = 100 # dimension of the RNN state

model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_shape = (input_length,)))
model.add(LSTM(lstm_size))
model.add(Dropout(0.1))
model.add(Dense(vocab_size, activation = 'softmax'))

model.compile(loss = 'sparse_categorical_crossentropy', optimizer = 'adam')

print(model.summary())



  super().__init__(**kwargs)


None


In [106]:
#
# And finally run training with an early stopping criterion
#

epochs = 1
batch_size = 128
val_split = 0.2

stop = EarlyStopping(monitor = 'val_loss', min_delta = 0, patience = 5, verbose = 1, mode = 'auto')
save = ModelCheckpoint('data.NOSAVE/lstm-10-5.keras', monitor = 'val_loss', verbose = 0, save_best_only = True)

X = np.array(X)
Y = np.array(Y)

history = model.fit(X, Y, batch_size = batch_size, epochs = epochs, verbose = 1, validation_split = val_split, callbacks = [stop, save])

model.save('bad_model')

[1m3178/3178[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 19ms/step - loss: 5.1394 - val_loss: 5.1154


### Play with the model

Now we have a properly trained model able to take in a few tokens and output a probability distribution function over the vocabulary for the next position (i.e., p[.|h]). Before going for text generation, let's first see on test data how prediction for a given history differs from the reality. The idea is to generate data as for training on new (test) data, take the prediction for a given history along with its probability and see how it differs from the truth in the data (here again, looking at the probabilities). We typically want to have an output like this

```
['the', 'dead', 'of', 'winter', 'i']                 best = can (0.048)   true = 've (0.014)
['winter', 'i', "'ve", 'been', 'there']              best = in (0.148)   true = , (0.074)
['been', 'there', ',', 'and', 'this']                best = is (0.417)   true = is (0.417)
```

where best is the predicted token that follows the history with the corresponding probability in the LM, and true is the actual token with the corresponding LM probability.

We will first prepare some test data from positive and negative comments. We will then define a predict function that takes as input an history and predicts the next token.


In [76]:
"""
#
# Get a few unseen texts from the database and process them
#
[clean_utterance(word_tokenize(x[1])) for x in imdb_data if x[0] == 'pos']
pos_tests = [clean_utterance(word_tokenize(x[1])) for x in imdb_data[2000:2050]]
neg_tests = [clean_utterance(word_tokenize(x[1])) for x in imdb_data[-50:]]

X1, Y1 = create_x_y([encode_sequence(x, vocab) for x in pos_tests])
        
print('number of positive sequences for testing =', len(X1))

X2, Y2 = create_x_y([encode_sequence(x, vocab) for x in neg_tests])
print('number of negative sequences for testing =', len(X2))
"""

"\n#\n# Get a few unseen texts from the database and process them\n#\n[clean_utterance(word_tokenize(x[1])) for x in imdb_data if x[0] == 'pos']\npos_tests = [clean_utterance(word_tokenize(x[1])) for x in imdb_data[2000:2050]]\nneg_tests = [clean_utterance(word_tokenize(x[1])) for x in imdb_data[-50:]]\n\nX1, Y1 = create_x_y([encode_sequence(x, vocab) for x in pos_tests])\n        \nprint('number of positive sequences for testing =', len(X1))\n\nX2, Y2 = create_x_y([encode_sequence(x, vocab) for x in neg_tests])\nprint('number of negative sequences for testing =', len(X2))\n"

In [85]:
#
# Complete the predict function below that takes as input a model and a history, runs the history through 
# the model to get the probability distribution over the vocabulary and returns the predicted token. 
#
# The function returns the three following elements:
#   - id of the token predicted as following the history
#   - probability for this token
#   - probability for the true_i token (or 0 if none is specified -- see below)
#
# Anticipating text generation, the predict function has two modes:
# - best: returns the token with highest probability (to follow the history)
# - random: returns a random token chosen according to the probability distribution over the vocabulary
#
# The function can also take a true_i argument which typically corresponds to the id of the token that 
# actually follows the history in the data. If given, predict returns the probability for this token to 
# follow the history. Else returns a probability of 0.

def predict(model, h: list[int], mode = 'best', true_i = None):
    '''
    Return a predicted token given the history and the model. Said more simply, predict p[.|h]
    with the model and take the best guess or a random guess (depending on mode).
    
    Returns predicted token with the corresponding probability, optionnally returning the activation 
    prob of the true token if true_i is provided
    '''
    h_array = np.array(h).reshape(1, -1)
    
    probs = model.predict(h_array, verbose=0)[0]
    
    if mode == 'best':
        pred_token_id = np.argmax(probs)
        pred_prob = probs[pred_token_id]

    else:
        pred_token_id = np.random.choice(len(probs), p=probs)
        pred_prob = probs[pred_token_id]
    
    true_prob = probs[true_i] if true_i is not None else 0
    
    return pred_token_id, pred_prob, true_prob

In [78]:
#
# Use the predict function above on a few sentence from pos_tests and neg_tests to mimic the following output
#
# ['the', 'dead', 'of', 'winter', 'i']                 best = can (0.048)   true = 've (0.014)
# ['winter', 'i', "'ve", 'been', 'there']              best = in (0.148)   true = , (0.074)
# ['been', 'there', ',', 'and', 'this']                best = is (0.417)   true = is (0.417)
#
# Are there significant differences between predictions on the positive samples and that of 
# the negative samples (assuming a model trained on either positive or negative comments)

prediction_best = predict(model, [vocab['the'], vocab['dead'], vocab['of'], vocab['winter'], vocab['i']], 'best')
prediction_random = predict(model, [vocab['the'], vocab['dead'], vocab['of'], vocab['winter'], vocab['i']], 'random')
print(f"['the', 'dead', 'of', 'winter', 'i']                 best = {int_to_word[prediction_best[0]]} ({prediction_best[1]:.3f})   random = '{int_to_word[prediction_random[0]]} ({prediction_random[1]:.3f})")

prediction_best = predict(model, [vocab['winter'], vocab['i'], vocab["'ve"], vocab['been'], vocab['there']], 'best')
prediction_random = predict(model, [vocab['winter'], vocab['i'], vocab["'ve"], vocab['been'], vocab['there']], 'random')
print(f"['winter', 'i', ''ve', 'been', 'there']              best = {int_to_word[prediction_best[0]]} ({prediction_best[1]:.3f})   random = '{int_to_word[prediction_random[0]]} ({prediction_random[1]:.3f})")

prediction_best = predict(model, [vocab['been'], vocab['there'], vocab[','], vocab['and'], vocab['this']], 'best')
prediction_random = predict(model, [vocab['been'], vocab['there'], vocab[','], vocab['and'], vocab['this']], 'random')
print(f"['been', 'there', ',', 'and', 'this']                best = {int_to_word[prediction_best[0]]} ({prediction_best[1]:.3f})   random = '{int_to_word[prediction_random[0]]} ({prediction_random[1]:.3f})")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
['the', 'dead', 'of', 'winter', 'i']                 best = have (0.060)   random = 'do (0.030)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
['winter', 'i', ''ve', 'been', 'there']              best = to (0.092)   random = 'with (0.016)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
['been', 'there', ',', 'and', 'this']                best = is (0.158)   random = 'quite (0.000)


## Natural language generation

Finally, we can move on to text generation, which is sort of straightforward given the predict function above. The idea is to give a prompt and let the generator complete, e.g., 

```python
prompt = encode_sequence(['it', "'s", 'hard', 'to', 'believe'])
sample = generate(prompt, model)
```

outputs

```
it 's hard to believe that the film is not a good movie .
```

In [137]:
#
# Write a function generate that takes as input a prompt (a list of token ids correponding to the
# begining of the text with a minimum length of the history size you defined) and iteratively 
# generate the following tokens. The generation loop stops when a punctuation is generated (.!?) 
# or when MAX_SENTENCE_SIZE is reached. 
#
# You can do the generation loop selecting either the best following token at each iteration or 
# simply taking a random one according to the probability distribution function defined by the 
# current history. You're invited to compare and comment the two.
#

MAX_SENTENCE_SIZE = 100

def generate(prompt, model, mode):
    '''
    Generate text starting from the prompt.
    '''
    if mode == 'mix':
        mode_mix = True
        mode = 'best'
    else:
        mode_mix = False

    prompt_list = prompt.split()
    encoded_text = []
    text = [clean_utterance(word_tokenize(word)) for word in prompt_list[-5:]]

    for word in text:
        if word[0] not in vocab:
            encoded_text.append(0)
        else:
            encoded_text.append(vocab[word[0]])


    iteration = 0
    while iteration < MAX_SENTENCE_SIZE:
        prediction = predict(model, encoded_text[-5:], mode)
        encoded_text.append(prediction[0])
        if prediction[0] == vocab['.']:
            break
        iteration += 1
        if mode_mix:
            mode = 'random' if mode == 'best' else 'best'
    
    # Decode the text
    for number in encoded_text:
        print(int_to_word[number], end=' ')



In [141]:
model = tf.keras.models.load_model('good__model_1.keras')

# Different modes : 'best', 'random', 'mix'
mode = 'random'

prompt = "Hello"

generate(prompt, model, mode)


hello elm cable and the ship feature . 