In [11]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random

In [12]:
# Load CSV (update the path if needed)
df = pd.read_csv(r'C:\Users\Admin\Desktop\Github Projects\Text Generation\data\dataset.csv', on_bad_lines='skip')

# Extract text column, drop missing values, join into one big string
text = " ".join(df['text'].dropna().astype(str)).lower()

print(f'Total characters in text: {len(text)}')


Total characters in text: 35695884


In [13]:
# Create vocabulary of unique characters
vocab = sorted(set(text))
print(f'Vocabulary size: {len(vocab)}')

# Character to index mapping and reverse
char2idx = {c: i for i, c in enumerate(vocab)}
idx2char = np.array(vocab)

# Convert entire text to integers
text_as_int = np.array([char2idx[c] for c in text])


Vocabulary size: 104


In [14]:
seq_length = 100  # input length per example

# Create dataset from integer encoded text
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Batch into sequences (input + target)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

# Split sequences into input and target
def split_input_target(chunk):
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

# Shuffle and batch dataset
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


In [15]:
vocab_size = len(vocab)
embedding_dim = 64
rnn_units = 128

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_shape=(None,)),
    tf.keras.layers.LSTM(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
])


def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

model.summary()


  super().__init__(**kwargs)


In [16]:
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS)

Epoch 1/20


[1m 164/5522[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6:10[0m 69ms/step - loss: 3.3006

KeyboardInterrupt: 

In [None]:
def generate_text(model, start_string, num_generate=100, temperature=1.0):
    input_eval = [char2idx.get(s, 0) for s in start_string.lower()]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    model.layers[1].reset_states()  # Reset LSTM layer states

    for _ in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)


In [None]:
print(generate_text(model, start_string="The ", num_generate=200, temperature=0.8))

The uster mp fo haveun fre anth to wasicore s  mprmalitrsall win a f r pens. tmilerin inthentin s vistha as. in fout wr pres bla an ivo “ithe bed o, ase tay s fed ste “trinsppove a atond d s. hexpled he l
