Data Visualization

In [2]:
import pandas as pd

book_list = pd.read_csv('archive/db_books.csv')
stories = pd.read_csv('archive/stories.csv')

In [3]:
book_list

Unnamed: 0,bookno,Title,Author,Language
0,51082.txt,Coming Attraction,Fritz Leiber,English
1,32243.txt,Confidence Game,James McKimmey,English
2,306-0.txt,"The Early Short Fiction of Edith Wharton, Par...",Edith Wharton,English
3,31038.txt,The Real Hard Sell,William W Stuart,English
4,28636-8.txt,The Grey Woman and other Tales,Mrs. (Elizabeth) Gaskell,English
...,...,...,...,...
997,29487.txt,Forever,Robert Sheckley,English
998,56527-0.txt,In a Quiet Village,Sabine Baring-Gould,English
999,31218-8.txt,Flten und Dolche,Heinrich Mann,German
1000,33839.txt,Problem on Balak,Roger D. Aycock,English


Data Cleaning : Creating a list of stories

In [4]:
story_arrays = stories['content'].to_numpy()

In [5]:
import re

# function to remove urls embeded in the texts
def remove_urls(text):
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub(r'', text)

# function to remove special characters
def remove_special_characters(input_string):
    return re.sub(r'[^A-Za-z0-9 ]+', '', input_string)

In [6]:
# cleaning the dataset of special characters particularly newline characters

for i in range(len(story_arrays)):
    story_arrays[i] = remove_urls(story_arrays[i])
    story_arrays[i] = remove_special_characters(story_arrays[i])
    story_arrays[i] = story_arrays[i].split(' ')

    # removing empty strings and converting to lowercase
    story_arrays[i] = [x.lower() for x in story_arrays[i] if x!='']
    story_arrays[i] = ' '.join(story_arrays[i])

In [7]:
story_arrays[0]

'start of this project gutenberg ebook coming attraction produced by greg weeks mary meehan and the onlinedistributed proofreading team at coming attraction by fritz leiber illustrated by paul calle transcribers note this etext was produced from galaxy science fiction november 1950 extensive research did not uncover any evidence that the us copyright on this publication was renewed women will always go on trying to attract men even when the future seems to have no futurethe coupe with the fishhooks welded to the fender shouldered up overthe curb like the nose of a nightmare the girl in its path stoodfrozen her face probably stiff with fright under her mask for once myreflexes werent shy i took a fast step toward her grabbed her elbowyanked her back her black skirt swirled outthe big coupe shot by its turbine humming i glimpsed three facessomething ripped i felt the hot exhaust on my ankles as the bigcoupe swerved back into the street a thick cloud like a black flowerblossomed from its 

Writing the processed data into a csv file

In [8]:
processed_data = open('archive/processed_data.csv','w')

In [9]:
for story in story_arrays:
    processed_data.write(story+'\n')

Building a simple rnn to predict next word from set of 5 previous words

In [10]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


file = open('archive/processed_data.csv')
stories = []
num_of_stories = 4
for story in file:
    stories.append(story)
    num_of_stories -= 1
    if num_of_stories == 0:
        break


# initialize a tokenizer to convert words into integer tokens
tokenizer = Tokenizer()
tokenizer.fit_on_texts(stories)

# get the word-to_index dictionary
word_index = tokenizer.word_index

2025-02-24 09:35:54.363591: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1740369954.372824   72634 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740369954.375817   72634 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-24 09:35:54.387027: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
print(len(word_index))

12322


In [12]:
# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(word_index)+1, output_dim=50),
    tf.keras.layers.SimpleRNN(units=100, return_sequences=False),
    tf.keras.layers.Dense(len(word_index)+1, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the summary of the model
model.summary()

I0000 00:00:1740369962.543628   72634 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 2278 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6


In [13]:
for story in stories:
  # convert the stories to sequences of word indices
  sequences = tokenizer.texts_to_sequences([story])

  # create input output pairs 5 words as input next word as output
  x=[]
  y=[]
  for seq in sequences:
    for i in range(5,len(seq)):
      x.append(seq[i-5:i])
      y.append(seq[i])
  # convert to numpy arrays
  x = np.array(x)
  y = np.array(y)

  # pad sequences if necessary
  x = pad_sequences(x, maxlen=5, padding='pre')

  # one hot encode the target labels(y)
  y = tf.keras.utils.to_categorical(y, num_classes=len(word_index) + 1)

  # training the model
  model.fit(x,y,epochs=20, batch_size=32)

  del x
  del y

Epoch 1/20


: 

In [None]:
# Assuming 'model' and 'tokenizer' are defined from the previous code.
# Also assuming 'word_index' is available.

def predict_next_word(seed_text):
    """Predicts the next word based on the input seed text."""

    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=5, padding='pre')
    predicted = np.argmax(model.predict(token_list), axis=-1)

    for word, index in word_index.items():
        if index == predicted:
            return word
    return None  # Return None if no matching word is found


# Example usage
seed_text = "plumb by an old blast"
predicted_word = predict_next_word(seed_text)
print(f"Seed text: {seed_text}")
print(f"Predicted next word: {predicted_word}")

1923167

In [None]:
#prdicting the next 50 words
seed_text = "plumb by an old blast"
text = "plumb by an old blast"
next_word = ''
for i in range(50):
  next_word = predict_next_word(seed_text)
  seed_text = ' '.join(seed_text.split(' ')[1:])
  seed_text += ' ' + next_word
  text += ' ' + next_word

print(text)

1923167