In [1]:
!pip install tensorflow keras nltk



In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU, Flatten, LeakyReLU, Input
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

import nltk
nltk.download('reuters')
nltk.download('punkt')


[nltk_data] Downloading package reuters to
[nltk_data]     /Users/vineethsai/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vineethsai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load a text dataset (e.g., Reuters Corpus from NLTK)
texts = [' '.join(reuters.words(fileid)) for fileid in reuters.fileids()]
texts = texts[:2000]  # Limit to 2000 texts for simplicity

# Tokenize and preprocess
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences for consistent input shape
max_sequence_length = 20
data = pad_sequences(sequences, maxlen=max_sequence_length)


In [4]:
!pip install --upgrade tensorflow keras



In [5]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam

# Define Generator
def build_generator(latent_dim, output_dim):
    model = Sequential([
        Dense(128, input_dim=latent_dim),
        LeakyReLU(0.2),
        Dense(256),
        LeakyReLU(0.2),
        Dense(output_dim, activation=lambda x: tf.keras.activations.softmax(x / 0.8))  # Add temperature scaling
    ])
    return model


# Define Discriminator
def build_discriminator(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary classification (real/fake)
    ])
    return model

# Initialize models and parameters
latent_dim = 100  # Noise vector size
vocab_size = 20
sequence_length = vocab_size
batch_size = 64
epochs = 5000
data = np.random.rand(1000, sequence_length).astype(np.float32)

# Create models
generator = build_generator(latent_dim, vocab_size)
discriminator = build_discriminator(sequence_length)

# Optimizers
generator_optimizer = Adam(learning_rate=0.0002)
discriminator_optimizer = Adam(learning_rate=0.0002)

@tf.function
def train_step(real_data):
    noise = tf.random.normal(shape=(batch_size, latent_dim))

    # Train discriminator
    with tf.GradientTape() as disc_tape:
        generated_data = generator(noise, training=True)
        real_output = discriminator(real_data, training=True)
        fake_output = discriminator(generated_data, training=True)

        # Debugging shapes
        tf.debugging.assert_shapes([
            (real_output, (batch_size, 1)),
            (fake_output, (batch_size, 1))
        ])

        d_loss_real = tf.keras.losses.binary_crossentropy(tf.ones_like(real_output), real_output)
        d_loss_fake = tf.keras.losses.binary_crossentropy(tf.zeros_like(fake_output), fake_output)
        d_loss = 0.5 * (tf.reduce_mean(d_loss_real) + tf.reduce_mean(d_loss_fake))

    gradients_of_discriminator = disc_tape.gradient(d_loss, discriminator.trainable_variables)

    # Check gradients and variables
    if not gradients_of_discriminator or not discriminator.trainable_variables:
        print(f"Discriminator variables: {discriminator.trainable_variables}")
        print(f"Gradients: {gradients_of_discriminator}")
        raise ValueError("Gradients or trainable variables are empty for discriminator.")

    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

    # Train generator
    with tf.GradientTape() as gen_tape:
        generated_data = generator(noise, training=True)
        fake_output = discriminator(generated_data, training=False)
        g_loss = tf.keras.losses.binary_crossentropy(tf.ones_like(fake_output), fake_output)
        g_loss = tf.reduce_mean(g_loss)

    gradients_of_generator = gen_tape.gradient(g_loss, generator.trainable_variables)

    # Check gradients and variables for generator
    if not gradients_of_generator or not generator.trainable_variables:
        print(f"Generator variables: {generator.trainable_variables}")
        print(f"Gradients: {gradients_of_generator}")
        raise ValueError("Gradients or trainable variables are empty for generator.")

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))

    return d_loss, g_loss

# Training loop
for epoch in range(epochs):
    idx = np.random.randint(0, data.shape[0], batch_size)
    real_data = data[idx]

    try:
        d_loss, g_loss = train_step(real_data)

        if epoch % 500 == 0:
            print(f"Epoch {epoch} | D Loss: {d_loss:.4f} | G Loss: {g_loss:.4f}")

    except Exception as e:
        print(f"Error in epoch {epoch}: {e}")
        break


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 0 | D Loss: 0.6255 | G Loss: 0.6753
Epoch 500 | D Loss: 0.0067 | G Loss: 4.4005
Epoch 1000 | D Loss: 0.0006 | G Loss: 6.7507
Epoch 1500 | D Loss: 0.0003 | G Loss: 7.6563
Epoch 2000 | D Loss: 0.0003 | G Loss: 8.2316
Epoch 2500 | D Loss: 0.0001 | G Loss: 8.9827
Epoch 3000 | D Loss: 0.0000 | G Loss: 9.5103
Epoch 3500 | D Loss: 0.0000 | G Loss: 9.4029
Epoch 4000 | D Loss: 0.0000 | G Loss: 10.0383
Epoch 4500 | D Loss: 0.0000 | G Loss: 10.6754


In [6]:
import numpy as np

# Adjusted temperature function
def apply_temperature(logits, temperature=1.3):
    scaled_logits = logits / temperature
    exp_logits = np.exp(scaled_logits - np.max(scaled_logits))  # Stability adjustment
    return exp_logits / np.sum(exp_logits)

# Nucleus Sampling with Fine-Tuned Threshold
def nucleus_sampling_with_temperature(prob_dist, p=0.85):
    sorted_indices = np.argsort(prob_dist)[::-1]
    sorted_probs = prob_dist[sorted_indices]
    cumulative_probs = np.cumsum(sorted_probs)
    cutoff_index = np.searchsorted(cumulative_probs, p)
    top_indices = sorted_indices[:cutoff_index + 1]
    top_probs = prob_dist[top_indices]
    top_probs /= np.sum(top_probs)  # Renormalize
    return np.random.choice(top_indices, p=top_probs)

# Improved fallback replacement with context-aware alternatives
def replace_fallback(word, fallback_list=['and', 'the', 'on', 'for', 'mln', 'pct', 'said', 'to']):
    return word if word != '[fallback_word]' else np.random.choice(fallback_list, p=[0.2, 0.2, 0.15, 0.15, 0.1, 0.1, 0.05, 0.05])

# Generate synthetic text embeddings
noise = np.random.normal(0, 1, (10, latent_dim))  # Generate noise for 10 samples
synthetic_embeddings = generator.predict(noise)

# Debugging: Print synthetic embeddings
print("Synthetic Embeddings (first 2 samples):")
print(synthetic_embeddings[:2])

# Convert embeddings to text
synthetic_texts = []
top_p = 0.85  # Adjusted nucleus threshold
temperature = 1.3  # Fine-tuned temperature

for embedding in synthetic_embeddings:
    # Apply temperature scaling
    temperature_scaled_probs = apply_temperature(embedding, temperature)

    # Normalize probabilities
    normalized_probs = temperature_scaled_probs / np.sum(temperature_scaled_probs)

    # Apply nucleus sampling
    word_indices = [
        nucleus_sampling_with_temperature(normalized_probs, p=top_p)
    ]

    # Map indices to words using the tokenizer
    words = [
        replace_fallback(tokenizer.index_word.get(idx, '[fallback_word]'))
        for idx in word_indices
    ]
    
    synthetic_texts.append(' '.join(words))

# Print synthetic texts
print("Synthetic Texts:")
for i, text in enumerate(synthetic_texts, start=1):
    print(f"{i}: {text}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Synthetic Embeddings (first 2 samples):
[[0.00548434 0.00489045 0.1837741  0.01364604 0.00875299 0.00323025
  0.01995614 0.02632596 0.00656502 0.05309298 0.01848776 0.0191366
  0.2800972  0.00809647 0.03588558 0.00599954 0.06702787 0.00902957
  0.22530027 0.00522087]
 [0.00547455 0.00775824 0.16432801 0.01655325 0.0097418  0.00370401
  0.02257742 0.04176881 0.00940367 0.05481337 0.0301092  0.02453142
  0.27243352 0.00794764 0.02963134 0.00728313 0.06722493 0.00775021
  0.20921476 0.00775067]]
Synthetic Texts:
1: on
2: dlrs
3: s
4: for
5: in
6: '
7: to
8: of
9: on
10: 000
