In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from sklearn.utils import shuffle
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.losses import kullback_leibler_divergence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention
from transformers import AutoTokenizer, BartTokenizer, TFBartForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


## <center>Tensorflow (Stop)</center>

In [3]:
dataset = load_dataset("cnn_dailymail")

In [3]:
train_article = dataset['train']['article']
test_article = dataset['test']['article']

train_highlight = dataset['train']['highlights']
test_highlight = dataset['test']['highlights']

In [4]:
train_article[1]

'(CNN) -- Ralph Mata was an internal affairs lieutenant for the Miami-Dade Police Department, working in the division that investigates allegations of wrongdoing by cops. Outside the office, authorities allege that the 45-year-old longtime officer worked with a drug trafficking organization to help plan a murder plot and get guns. A criminal complaint unsealed in U.S. District Court in New Jersey Tuesday accuses Mata, also known as "The Milk Man," of using his role as a police officer to help the drug trafficking organization in exchange for money and gifts, including a Rolex watch. In one instance, the complaint alleges, Mata arranged to pay two assassins to kill rival drug dealers. The killers would pose as cops, pulling over their targets before shooting them, according to the complaint. "Ultimately, the (organization) decided not to move forward with the murder plot, but Mata still received a payment for setting up the meetings," federal prosecutors said in a statement. The complai

In [5]:
train_highlight[1]

'Criminal complaint: Cop used his role to help cocaine traffickers .\nRalph Mata, an internal affairs lieutenant, allegedly helped group get guns .\nHe also arranged to pay two assassins in a murder plot, a complaint alleges .'

In [6]:
train_article = np.array(train_article)
test_article = np.array(test_article)

train_highlight = np.array(train_highlight)
test_highlight = np.array(test_highlight)

In [7]:
# Define Parameters
vocab_size = 10000  
oov_tok = "<OOV>"
max_length = 256  
padding_type = "post"  
trunc_type = "post"  
num_epochs = 3  
batch_size = 64

In [8]:
# Define tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_article)

In [9]:
# Define model architecture
def create_model(vocab_size, max_length):
    # Encoder
    encoder_inputs = Input(shape=(max_length,))
    encoder_embedding = Embedding(vocab_size, 256, input_length=max_length)(encoder_inputs)
    encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(vocab_size, 256)(decoder_inputs)
    decoder_lstm = LSTM(256, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

    # Attention mechanism
    attention_layer = Attention()
    attention_output = attention_layer([decoder_outputs, encoder_outputs])

    # Concatenate attention output and decoder output
    decoder_concat = Dense(256, activation='tanh')
    decoder_dense = decoder_concat(attention_output)

    # Output layer
    decoder_outputs = Dense(vocab_size, activation='softmax')(decoder_dense)

    # Define model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

In [10]:
# Create the model
model = create_model(vocab_size, max_length)

# Compile the model
model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

# Print model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 256, 256)     2560000     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    2560000     ['input_2[0][0]']                
                                                                                              

In [11]:
# Define data generators
def data_generator(texts, highlights, tokenizer, batch_size, max_length, padding_type, truncating_type):
    num_samples = len(texts)
    indices = np.arange(num_samples)
    while True:
        indices = shuffle(indices)
        for start in range(0, num_samples, batch_size):
            end = min(start + batch_size, num_samples)
            batch_indices = indices[start:end]
            batch_texts = texts[batch_indices]
            batch_highlights = highlights[batch_indices]
            
            # Tokenize and pad sequences for encoder input (article)
            batch_sequences = tokenizer.texts_to_sequences(batch_texts)
            encoder_input_data = pad_sequences(batch_sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)
            
            # Tokenize and pad sequences for decoder input (highlight)
            batch_sequences = tokenizer.texts_to_sequences(batch_highlights)
            decoder_input_data = pad_sequences(batch_sequences, maxlen=max_length, padding=padding_type, truncating=truncating_type)
            
            # Generate decoder target data (shifted by one timestep)
            decoder_target_data = np.zeros_like(decoder_input_data)
            decoder_target_data[:, :-1] = decoder_input_data[:, 1:]
            decoder_target_data[:, -1] = 0  # Padding token
            
            yield [encoder_input_data, decoder_input_data], decoder_target_data

In [12]:
# Create data generators
train_generator = data_generator(train_article, train_highlight, tokenizer, batch_size, max_length, padding_type, trunc_type)
test_generator = data_generator(test_article, test_highlight, tokenizer, batch_size, max_length, padding_type, trunc_type)

In [13]:
# Train model using fit_generator
history = model.fit_generator(train_generator, 
                              steps_per_epoch=len(train_article)//batch_size,
                              epochs=num_epochs,
                              validation_data=test_generator,
                              validation_steps=len(test_article)//batch_size)

  history = model.fit_generator(train_generator,


Epoch 1/3

KeyboardInterrupt: 

**Note**:
- I stopped because it took a long time to train only 1 epoch (1854/4486); the above result was shown after training for approximately 9 hours.

## <center>Using BART</center>

### BART Only

In [2]:
# Get dataset and remove column "id"
dataset = load_dataset("cnn_dailymail")
dataset = dataset.remove_columns(["id"])

In [3]:
dataset["train"], dataset["test"], dataset["validation"]

(Dataset({
     features: ['article', 'highlights'],
     num_rows: 287113
 }),
 Dataset({
     features: ['article', 'highlights'],
     num_rows: 11490
 }),
 Dataset({
     features: ['article', 'highlights'],
     num_rows: 13368
 }))

In [4]:
# Define Model and Tokenizer
bart_model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [5]:
# Tokenize each of the datasets
def tokenize_data(data):
    tokenized_data = []

    for example in data:
        article_tokens = tokenizer.encode(example['article'], truncation=True, max_length=512, padding='max_length', return_tensors="tf")
        highlight_tokens = tokenizer.encode(example['highlights'], truncation=True, max_length=150, padding='max_length', return_tensors="tf")
        tokenized_data.append((article_tokens, highlight_tokens))

    return tokenized_data

In [6]:
# Convert tokens to token IDs, pad or truncate sequences
train_data_tokenized = tokenize_data(dataset["train"])
test_data_tokenized = tokenize_data(dataset["test"])
validation_data_tokenized = tokenize_data(dataset["validation"])

In [7]:
# Define Loss Function
loss = SparseCategoricalCrossentropy(from_logits=True)

In [8]:
# Compile the Model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
bart_model.compile(optimizer=optimizer, loss=loss)

#### 1. Data Generator
Use a data generator to load batches of data incrementally during training rather than loading the entire dataset into memory at once.

In [11]:
# Define data generator 
def data_generator(tokenized_data, batch_size):
    num_batches = len(tokenized_data) // batch_size
    for i in range(num_batches):
        batch = tokenized_data[i * batch_size: (i + 1) * batch_size]
        inputs = [example[0] for example in batch]
        targets = [example[1] for example in batch]
        yield inputs, targets

# Define batch size
batch_size = 32

In [12]:
# Train the Model using the data generator
history = bart_model.fit(
    data_generator(train_data_tokenized, batch_size),  
    steps_per_epoch=len(train_data_tokenized) // batch_size,  
    epochs=5, 
    validation_data=data_generator(validation_data_tokenized, batch_size),
    validation_steps=len(validation_data_tokenized) // batch_size 
)

Epoch 1/5


AttributeError: in user code:

    File "D:\anaconda3\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "D:\anaconda3\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\anaconda3\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "D:\anaconda3\lib\site-packages\transformers\modeling_tf_utils.py", line 1571, in train_step
        x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data)

    AttributeError: module 'keras.utils' has no attribute 'unpack_x_y_sample_weight'


**Note**:
- I stopped because I can't solve the error which is the `AttributeError`

#### 2. Youtube

In [None]:
# Train the Model
history = bart_model.fit(
    train_data_tokenized,   
    epochs=3,  
    validation_data=validation_data_tokenized  
)

**Note**:
- I stopped because I can't solve the error which is the `ValueError`

**Note**:
- `train_data_tokenized` should contain both the input (article tokens) and the target (highlight tokens).
- The` validation_data` parameter should also be provided in a similar format as ` train_data_tokenized`.

#### 3. Other Source

In [9]:
# Unpack tokenized data into separate lists for input and target data
train_input_data = [example[0] for example in train_data_tokenized]
train_target_data = [example[1] for example in train_data_tokenized]

In [10]:
# Prepare validation data
validation_input_data = [example[0] for example in validation_data_tokenized]
validation_target_data = [example[1] for example in validation_data_tokenized]
validation_data = (validation_input_data, validation_target_data)

In [11]:
# Train the model
history = bart_model.fit(
    x=train_input_data,
    y=train_target_data,
    epochs=3,
    batch_size=4,
    validation_data=validation_data
)

KeyboardInterrupt: 

**Note**:
- I stopped because it took a long time to start training; it was still running and didn't show any epochs after 1 hour.

**Note**:
- `batch_size` determines the number of samples used in each iteration of training.
- `num_epochs` specifies the number of times the entire dataset will be passed forward and backward through the model during training.
- `train_input_data` should contain the input data for the training set, containing tokenized articles.
- `train_target_data` should contain the target data for the training set, containing tokenized highlights or summaries.
- `validation_data` should contain the tokenized validation data in the same format as the training data. Like the training data, this should be preprocessed and tokenized appropriately for the model.

In [None]:
# Evaluate the Model
test_loss = bart_model.evaluate(test_data_tokenized)
print("Test Loss:", test_loss)

### Personal Network Layer fine tuned with BART

#### 1. Other Source

In [2]:
# Get dataset and remove column "id"
dataset = load_dataset("cnn_dailymail")
dataset = dataset.remove_columns(["id"])

In [3]:
# Define Model and Tokenizer
bart_model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [4]:
# Tokenize each of the datasets
def tokenize_data(data):
    tokenized_data = []

    for example in data:
        article_tokens = tokenizer.encode(example['article'], truncation=True, max_length=512, padding='max_length', return_tensors="tf")
        highlight_tokens = tokenizer.encode(example['highlights'], truncation=True, max_length=150, padding='max_length', return_tensors="tf")
        tokenized_data.append((article_tokens, highlight_tokens))

    return tokenized_data

In [5]:
# Tokenize train, test, and validation datasets
train_data_tokenized = tokenize_data(dataset["train"])
test_data_tokenized = tokenize_data(dataset["test"])
validation_data_tokenized = tokenize_data(dataset["validation"])

In [6]:
# Define Loss Function
loss_fn = SparseCategoricalCrossentropy(from_logits=True)

In [7]:
# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

In [8]:
# Define Model
input_layer = tf.keras.layers.Input(shape=(512,), dtype=tf.int32)
bart_output = bart_model(input_layer)[0]  # Retrieve only the first output of the BART model
dense_layer = tf.keras.layers.Dense(150, activation='linear')(bart_output) 

In [9]:
# Create the model
model = tf.keras.Model(inputs=input_layer, outputs=dense_layer)

In [10]:
# Compile Model
model.compile(optimizer=optimizer, loss=loss_fn)

In [11]:
# Train the Model
history = model.fit(
    x=[example[0] for example in train_data_tokenized],  # Input (articles)
    y=[example[1] for example in train_data_tokenized],  # Target (highlights)
    epochs=3,  
    validation_data=(
        [example[0] for example in validation_data_tokenized],  # Input validation data
        [example[1] for example in validation_data_tokenized]   # Target validation data
    )
)

KeyboardInterrupt: 

**Note**:
- I stopped because it took a long time to start training; it was still running and didn't show any epochs after 2 hour.

#### 2. TF with BART

**Note**:
- Load the pre-trained BART model and tokenizer from the Hugging Face `transformers` library.
- Prepare the input data by tokenizing the articles and highlights using the BART tokenizer.
- Define a TensorFlow model that takes the tokenized articles and highlights as input and produces logits as output.
- Compile the model with appropriate loss and optimizer.
- Train the model using the tokenized articles as input and tokenized highlights as target output.put.

In [2]:
dataset = load_dataset("cnn_dailymail")

train_article = dataset['train']['article'][:5000]
test_article = dataset['test']['article'][:5000]

train_highlight = dataset['train']['highlights'][:5000]
test_highlight = dataset['test']['highlights'][:5000]

In [3]:
train_article = np.array(train_article)
test_article = np.array(test_article)

train_highlight = np.array(train_highlight)
test_highlight = np.array(test_highlight)

In [4]:
# Load BART model and tokenizer
bart_model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [5]:
# Define Parameters
max_article_length = 512
max_highlight_length = 150
batch_size = 4
num_epochs = 3

In [6]:
# Prepare data
train_article_tokens = tokenizer(train_article.tolist(), padding='max_length', truncation=True, max_length=max_article_length, return_tensors='tf')
train_highlight_tokens = tokenizer(train_highlight.tolist(), padding='max_length', truncation=True, max_length=max_highlight_length, return_tensors='tf')

In [7]:
# Define TensorFlow model
input_article_ids = tf.keras.Input(shape=(max_article_length,), dtype=tf.int32)
input_highlight_ids = tf.keras.Input(shape=(max_highlight_length,), dtype=tf.int32)

bart_outputs = bart_model(input_article_ids, decoder_input_ids=input_highlight_ids, return_dict=True)
output_logits = bart_outputs.logits

model = tf.keras.Model(inputs=[input_article_ids, input_highlight_ids], outputs=output_logits)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [8]:
# Train the model
history = model.fit(
    x=[train_article_tokens.input_ids, train_highlight_tokens.input_ids],
    y=train_highlight_tokens.input_ids,
    batch_size=batch_size,
    epochs=num_epochs
)

Epoch 1/3
  1/625 [..............................] - ETA: 32:34:25 - loss: 14.1944 - accuracy: 0.0075

KeyboardInterrupt: 

**Note**:
- I stopped because it took a long time to train only 1 epoch (1/625); the above result was shown after training for approximately 32 hours.