## Language Model

In [1]:
from google.colab import drive

# Uploading the dataset
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd "gdrive/My Drive/DL_Coursework/Task3/"

/content/gdrive/My Drive/DL_Coursework/Task3


In [3]:
# Import modules
import os
import nltk
import tensorflow as tf
import numpy as np
import pandas
from tensorflow.keras.layers.experimental import preprocessing

## Loading

In [4]:
# Load the data and create a dataset
with open('internet_archive_scifi_v3.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Print the length of text
print('The archive has {} characters'.format(len(text)))

The archive has 149326361 characters


## Preprocessing

In [5]:
# Select text between indices
text_content = text[580:149322961]
print(text_content[-100:])

In two minutes, he slumped forward on the desk. His death precipitated the great stock market crash 


In [6]:
sub_text = text_content.replace("  ", " ")

# Print the length of text
print('The archive has {} characters'.format(len(sub_text)))

The archive has 149294026 characters


In [7]:
# Print the unique characters
vocab = sorted(set(sub_text))
print('The archive are {} unique characters'.format(len(vocab)))

The archive are 75 unique characters


In [8]:
print(sub_text[-100:])

In two minutes, he slumped forward on the desk. His death precipitated the great stock market crash 


In [9]:
# Character to index
chartoindex = {v:i for i,v in enumerate(vocab)}
int_text = np.array([chartoindex[i] for i in sub_text])

# Index to character 
indextochar = np.array(vocab)

In [10]:
print("Character to index: \n")
for char,_ in zip(chartoindex, range(105)):
    print('  {:4s}: {:3d}'.format(repr(char), chartoindex[char]))

print("\n Text to integer: \n")
print('{} to {}'.format(repr(sub_text[:20]),int_text[:20]))

Character to index: 

  ' ' :   0
  '!' :   1
  '"' :   2
  '#' :   3
  "'" :   4
  '(' :   5
  ')' :   6
  ',' :   7
  '-' :   8
  '.' :   9
  '0' :  10
  '1' :  11
  '2' :  12
  '3' :  13
  '4' :  14
  '5' :  15
  '6' :  16
  '7' :  17
  '8' :  18
  '9' :  19
  ':' :  20
  ';' :  21
  '?' :  22
  'A' :  23
  'B' :  24
  'C' :  25
  'D' :  26
  'E' :  27
  'F' :  28
  'G' :  29
  'H' :  30
  'I' :  31
  'J' :  32
  'K' :  33
  'L' :  34
  'M' :  35
  'N' :  36
  'O' :  37
  'P' :  38
  'Q' :  39
  'R' :  40
  'S' :  41
  'T' :  42
  'U' :  43
  'V' :  44
  'W' :  45
  'X' :  46
  'Y' :  47
  'Z' :  48
  'a' :  49
  'b' :  50
  'c' :  51
  'd' :  52
  'e' :  53
  'f' :  54
  'g' :  55
  'h' :  56
  'i' :  57
  'j' :  58
  'k' :  59
  'l' :  60
  'm' :  61
  'n' :  62
  'o' :  63
  'p' :  64
  'q' :  65
  'r' :  66
  's' :  67
  't' :  68
  'u' :  69
  'v' :  70
  'w' :  71
  'x' :  72
  'y' :  73
  'z' :  74

 Text to integer: 

'science fiction maga' to [67 51 57 53 62 51 53  0 54 57 

In [11]:
# Maximum characters as an input
length= 80   # sequence length
examples_per_epoch = len(sub_text)

# Text to character index stream
char_dt = tf.data.Dataset.from_tensor_slices(int_text)

In [12]:

# Sequences from individual characters
sequences = char_dt.batch(length+1, drop_remainder=True)

In [13]:
# Create input-target pairs
def input_target_pairs(k):
    input_text = k[:-1]
    target_text = k[1:]          # next step
    return input_text, target_text

data_text = sequences.map(input_target_pairs)

In [14]:
# Batch size, buffer size for shuffling
batch_size = 128
buffer_size = 10000

dataset = data_text.shuffle(buffer_size).batch(batch_size , drop_remainder=True)

In [15]:

print("Character_Stream: \n")
for i in char_dt.take(13):
  print(indextochar[i.numpy()])  

print("\nSequence: \n")
for i in sequences.take(13):
  print(repr(''.join(indextochar[i.numpy()])))

Character_Stream: 

s
c
i
e
n
c
e
 
f
i
c
t
i

Sequence: 

'science fiction magazine called IF. The title was selected after much thought bec'
'ause of its brevity and on the theory it is indicative of the field and will be e'
"asy to remember. The tentative title that just morning and couldn't remember it u"
"ntil we'd had a cup of coffee, it was summarily discarded. A great deal of though"
't and effort lias gone into the formation of this magazine. We have had the aid o'
'f several very talented and generous people, for which we are most grateful. Much'
' is due them for their warmhearted assistance. And now that the bulk of the forma'
'tive work is done, we will try to maintain IF as one of the finest books on the m'
'arket. t a great public demand for our magazine. In short, why will you buy IF? W'
'e cannot, in honesty, say we will publish at all times the best science fiction i'
'n the field. That would not be true. But we will have access to the best stories,'
' and we will get

## Model

In [16]:
import random

# Set the seed
random.seed(100)
# Create the lstm model
def model_lstm(vocabul_size, embedding_dim, rnn_units, batch_size):
    txt_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabul_size, embedding_dim,           # embedding
                              batch_input_shape=[batch_size, None]), 
    tf.keras.layers.LSTM(rnn_units,                                  # LSTM
                        return_sequences=True,
                        stateful=True,),
    tf.keras.layers.Dense(vocabul_size)                              # Dense
  ])
    return txt_model

In [17]:
batch_size = 128
vocab_size = len(vocab)
embedding_dim = 256
rnn_units= 1024

lstm_txt_model = model_lstm(vocabul_size = vocab_size,embedding_dim=embedding_dim, rnn_units=rnn_units,batch_size=batch_size)
     


In [18]:
# Test the shape
for input_example_batch, target_example_batch in dataset.take(1):
    prediction = lstm_txt_model(input_example_batch)
    assert (prediction.shape == (batch_size, length, vocab_size)) 
    print(prediction.shape)
     

(128, 80, 75)


In [19]:
# Loss function
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

# Test the loss
example_loss  = loss(target_example_batch, prediction)
print("Prediction shape: ", prediction.shape)
print("Loss:      ", example_loss.numpy().mean())
     

Prediction shape:  (128, 80, 75)
Loss:       4.3179383


In [20]:
adam_Optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001) # Initializing Learning Rate

# Compile the model
lstm_txt_model.compile(optimizer=adam_Optimizer, loss=loss, metrics = ["accuracy"])

In [21]:
import os

from keras.callbacks import EarlyStopping, ModelCheckpoint

# Model checkpoints
lstm_dir_checkpoints = 'lstm_checkpoints'
checkpoint_prefix = os.path.join(lstm_dir_checkpoints, "checkpt_{epoch}") 
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True, monitor= 'loss', save_best_only=True)

# create the EarlyStopping callback
early_stopping = EarlyStopping(monitor='loss', patience=2)

## Training

In [None]:
history = lstm_txt_model.fit(dataset, epochs=15, callbacks=[checkpoint_callback, early_stopping]) 

## Loading Weight

In [42]:
lstm_mod = model_lstm(vocab_size, embedding_dim, rnn_units, batch_size=1)
lstm_mod.load_weights('lstm_checkpoints/checkpt_10').expect_partial()
lstm_mod.build(tf.TensorShape([1, None]))

## Text Generatation

In [54]:
import textwrap

def generate_text(model, input_string, num=200, temperature=0.5, wrap_width=80):
    input_indices = [chartoindex[s] for s in input_string]  # text to indexes
    input_indices = tf.expand_dims(input_indices, 0)

    # result with predicted characters
    text_result = []

    model.reset_states()
    for i in range(num):
        predictions = model(input_indices)
        # Remove the dimension from batch
        predictions = tf.squeeze(predictions, 0)

        # Categorical distribution for prediction
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the predicted character
        input_indices = tf.expand_dims([predicted_id], 0)
        char = indextochar[predicted_id]
        text_result.append(char)
        
        # Add line break after each complete word
        if char == ' ':
            word = ''.join(text_result).rsplit(' ', 1)[1]
            if len(word) > 1 and word[-1] not in ['.', '!', '?']:
                text_result.append('\n')

    # Join the characters and wrap the text
    generated_text = ''.join(text_result)
    wrapped_text = textwrap.fill(generated_text, wrap_width)

    return input_string + wrapped_text


### Small Sentence

In [138]:
# User Input 
lstm_pred = input("Enter your text: ")

print('\nPrediction:')
# Prediction
print(generate_text(model = lstm_mod, input_string=lstm_pred, num=100, temperature=0.4))

Enter your text: The moon revolves around

Prediction:
The moon revolves around the world of the Earth which is all the continuing ones of the same as a
possibility of the enginee


### Paragraph

In [58]:
# User Input 
lstm_pred = input("Enter your text: ")

print('\nPrediction:')
# Prediction
print(generate_text(model = lstm_mod, input_string=lstm_pred, num=1000, temperature=0.4))

Enter your text: I want a trip to space 

Prediction:
I want a trip to space is a considerable man of some other person. The word of the man who had been a
mechanical account of the problem of the Earth in the present time of the man
who had already been a statement with the real reason of the world of the
construction of the pyramid of the Earth. The stories are probably a planet of
the position of the orbit of Mars. And then they can be the first of the
centuries of the pyramid that has no doubt that it will be a primitive one. The
problem was that the interest in the world was surprised to see the medical
survival of the problem. It was a different way. The whole thing was a science
fiction story on the Skyroval Confederation of the Moon. The first time the body
was the only planet of the complete reason. He was a bigger than the other woman
who was a shadow of a second on the other side of the compartment. "It was a
long time ago, I guess I can't see the dead songs of the sight of 