In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, SimpleRNN
from tensorflow.keras.callbacks import EarlyStopping

2025-05-27 03:12:25.582039: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-27 03:12:25.595214: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748315545.611827    4091 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748315545.617191    4091 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748315545.629402    4091 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
from collections import Counter
import re

from transformers import AutoTokenizer

### Read Data

In [3]:
train_set = pd.read_csv('valid.csv')
test_set = pd.read_csv('test-curated.csv')

SEED = 123
train_set, validation_set = train_test_split(train_set, test_size=0.2, random_state=SEED)

### Tokenization & Preprocessing

In [4]:
DESC_COL = 'desc'
SLOGAN_COL = 'output'
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s.,?!']", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [5]:
for df in [train_set, validation_set, test_set]:
    df[DESC_COL + '_cleaned'] = df[DESC_COL].apply(clean_text)
    df[SLOGAN_COL + '_cleaned'] = df[SLOGAN_COL].apply(clean_text)

In [6]:
tokenizer_name = 'facebook/bart-base'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def tokenize_text_with_transformers(text_series, tokenizer_model, max_len=128):
    encoded_inputs = tokenizer_model(
        text_series.tolist(),
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors=None,
        add_special_tokens=True
    )
    return encoded_inputs['input_ids']

for df in [train_set, validation_set, test_set]:
    df[DESC_COL + '_tokenized'] = tokenize_text_with_transformers(df[DESC_COL + '_cleaned'], tokenizer)
    df[SLOGAN_COL + '_tokenized'] = tokenize_text_with_transformers(df[SLOGAN_COL + '_cleaned'], tokenizer)

### Build Tensorflow datasets

In [7]:
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 1000
required_desc_col = DESC_COL + '_tokenized'
required_slogan_col = SLOGAN_COL + '_tokenized'

def create_tf_dataset(df, desc_col_name, slogan_col_name, batch_size, shuffle=False, shuffle_buffer_size=None):
    encoder_inputs = np.array(list(df[desc_col_name]), dtype=np.int32)
    raw_slogans = np.array(list(df[slogan_col_name]), dtype=np.int32)

    decoder_inputs = raw_slogans[:, :-1]
    decoder_targets = raw_slogans[:, 1:]

    dataset = tf.data.Dataset.from_tensor_slices(((encoder_inputs, decoder_inputs), decoder_targets))
    if shuffle:
        if shuffle_buffer_size is None:
            shuffle_buffer_size = len(df)
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

In [8]:
train_tf_dataset = create_tf_dataset(
        train_set,
        required_desc_col,
        required_slogan_col,
        BATCH_SIZE,
        shuffle=True,
        shuffle_buffer_size=1000
    )
    
val_tf_dataset = create_tf_dataset(
        validation_set,
        required_desc_col,
        required_slogan_col,
        BATCH_SIZE
    )
test_tf_dataset = create_tf_dataset(
        test_set,
        required_desc_col,
        required_slogan_col,
        BATCH_SIZE
    )

I0000 00:00:1748315551.754966    4091 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22335 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:8d:00.0, compute capability: 8.6


### Define RNN Architecture

In [9]:
embedding_dim = 128
rnn_units = 256

input_vocab_size = tokenizer.vocab_size
max_input_len = 128 
target_vocab_size = tokenizer.vocab_size
max_target_len_for_decoder_input = max_input_len - 1

In [10]:
# Encoder
encoder_inputs = tf.keras.layers.Input(shape=(max_input_len,), name='encoder_input')
encoder_embedding_layer = tf.keras.layers.Embedding(input_vocab_size, embedding_dim, name='encoder_embedding')
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_rnn = tf.keras.layers.SimpleRNN(rnn_units, return_state=True, name='encoder_rnn')
encoder_outputs, encoder_state_h = encoder_rnn(encoder_embedding)
encoder_states = [encoder_state_h]

In [11]:
# Decoder
decoder_inputs = tf.keras.layers.Input(shape=(max_target_len_for_decoder_input,), name='decoder_input') # For teacher forcing
decoder_embedding_layer = tf.keras.layers.Embedding(target_vocab_size, embedding_dim, name='decoder_embedding')
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True, return_state=True, name='decoder_rnn')
# Initial state for the decoder RNN is the encoder's final hidden state
decoder_rnn_outputs, _ = decoder_rnn(decoder_embedding, initial_state=encoder_states)

In [12]:
# Output layer
decoder_dense = tf.keras.layers.TimeDistributed(
    tf.keras.layers.Dense(target_vocab_size, activation='softmax'), name='decoder_output'
)
decoder_outputs = decoder_dense(decoder_rnn_outputs)

In [13]:
def masked_sparse_categorical_crossentropy(y_true, y_pred):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')
    raw_loss = loss_fn(y_true, y_pred)

    mask = tf.cast(tf.not_equal(y_true, 1), tf.float32)
    masked_loss = raw_loss * mask

    return tf.reduce_sum(masked_loss) / tf.reduce_sum(mask)

In [14]:
training_model = tf.keras.models.Model([encoder_inputs, decoder_inputs], decoder_outputs)

def token_level_f1(y_true, y_pred):
    y_pred = tf.argmax(y_pred, axis=-1)
    y_true = tf.cast(y_true, tf.int64)
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)  

    tp = tf.reduce_sum(tf.cast(tf.logical_and(tf.equal(y_true, y_pred), tf.not_equal(y_true, 0)), tf.float32))
    fp = tf.reduce_sum(tf.cast(tf.logical_and(tf.not_equal(y_true, y_pred), tf.not_equal(y_pred, 0)), tf.float32))
    fn = tf.reduce_sum(tf.cast(tf.logical_and(tf.not_equal(y_true, y_pred), tf.not_equal(y_true, 0)), tf.float32))

    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    return f1

training_model.compile(
    optimizer='adam',
    loss= masked_sparse_categorical_crossentropy,
    metrics=[
        'accuracy',
        token_level_f1
    ]
)

training_model.summary()

In [15]:
EPOCHS = 30

early_stop = EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True, verbose=1)

history = training_model.fit(
    train_tf_dataset,
    epochs=EPOCHS,
    validation_data=val_tf_dataset,
    callbacks=[early_stop]
)

Epoch 1/30


I0000 00:00:1748315557.764610    4184 service.cc:152] XLA service 0x722e78002330 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748315557.764650    4184 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2025-05-27 03:12:37.971011: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-05-27 03:12:38.725109: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator compile_loss/masked_sparse_categorical_crossentropy/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
I0000 00:00:1748315559.481750    4184 cuda_dnn.cc:529] Loaded cuDNN version 90701





I0000 00:00:1748315585.771466    4184 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m66/67[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 137ms/step - accuracy: 0.0068 - loss: 9.1445 - token_level_f1: 0.0068

2025-05-27 03:13:15.740106: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator compile_loss/masked_sparse_categorical_crossentropy/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert








[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 573ms/step - accuracy: 0.0069 - loss: 9.1301 - token_level_f1: 0.0069

2025-05-27 03:13:45.682915: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator compile_loss/masked_sparse_categorical_crossentropy/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
2025-05-27 03:13:51.294458: W tensorflow/compiler/tf2xla/kernels/assert_op.cc:39] Ignoring Assert operator compile_loss/masked_sparse_categorical_crossentropy/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 746ms/step - accuracy: 0.0069 - loss: 9.1162 - token_level_f1: 0.0069 - val_accuracy: 0.0081 - val_loss: 7.3663 - val_token_level_f1: 0.0081
Epoch 2/30
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 150ms/step - accuracy: 0.0082 - loss: 6.8445 - token_level_f1: 0.0082 - val_accuracy: 0.0086 - val_loss: 7.1358 - val_token_level_f1: 0.0085
Epoch 3/30
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 150ms/step - accuracy: 0.0086 - loss: 6.5903 - token_level_f1: 0.0086 - val_accuracy: 0.0087 - val_loss: 7.0291 - val_token_level_f1: 0.0087
Epoch 4/30
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 150ms/step - accuracy: 0.0088 - loss: 6.3861 - token_level_f1: 0.0088 - val_accuracy: 0.0086 - val_loss: 7.0036 - val_token_level_f1: 0.0086
Epoch 5/30
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 150ms/step - accuracy: 0.0090 - loss: 6.2065 - token_level_f1: 0.00

In [16]:
pip install nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [17]:
pip install rouge-score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [18]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
def decode_tokens(tokenizer, token_ids):
    tokens = [id for id in token_ids if id != tokenizer.pad_token_id and id != tokenizer.eos_token_id]
    return tokenizer.decode(tokens, skip_special_tokens=True)

In [20]:
pred_texts = []
true_texts = []
i = 0
for (encoder_input, decoder_input), decoder_target in test_tf_dataset:
    if i == 4:
        break
    preds = training_model.predict([encoder_input, decoder_input])
    pred_ids = preds.argmax(-1)
    for j in range(pred_ids.shape[0]):
        pred_texts.append(decode_tokens(tokenizer, pred_ids[i]))
        true_texts.append(decode_tokens(tokenizer, decoder_target[i].numpy()))
    i += 1
print(pred_texts[0])
print(true_texts[0])

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 445ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
theprise your,k
eliquid testing uk


In [21]:
# pred_texts = []
# true_texts = []

# for (encoder_input, decoder_input), decoder_target in test_tf_dataset:
#     preds = training_model.predict([encoder_input, decoder_input])
#     pred_ids = preds.argmax(-1)
#     for i in range(pred_ids.shape[0]):
#         pred_texts.append(decode_tokens(tokenizer, pred_ids[i]))
#         true_texts.append(decode_tokens(tokenizer, decoder_target[i].numpy()))

In [22]:
print(pred_texts)

['theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise your,k', 'theprise y

In [35]:
examples = ['Easily deliver personalized activities that enrich the lives of residents in older adult communities. Save time and increase satisfaction.',
'Powerful lead generation software that converts abandoning visitors into subscribers with our dynamic marketing tools and Exit Intent® technology.',
"Twine matches companies to the best digital and creative freelancers from a network of over 260,000. It's free to post a job and you only pay when you hire.",
"Looking for fresh web design & development? Need new marketing materials or a smart campaign to drive business? How about a video or updated photos? Let's talk and tell the world your story.",
# --- test-curated.csv
'Our expert team of Analytical Chemists provide eLiquid analysis & manufacturing services, ensuring full regulatory compliance for the e-cigarette market.',
'From placing entire software engineering teams to integrating easily into your current team, we offer bespoke placements of the very best engineers.',
'Turning ideas into visual content since 1999. Content Creation Studio in Ghent. Branded content - corporate video - visuals for events - 360 video',
'World market leader for robotic vision systems, inline measurement technology & inspection technology. We are your partner at over 25 locations worldwide.',
# --- other examples
'People and projects for sustainable change. Experts in sustainability recruitment, we recruit exceptional people into roles working on sustainability projects or in ethical and responsible organisations.']

In [33]:
txt = "Twine matches companies to the best digital and creative freelancers from a network of over 260,000. It's free to post a job and you only pay when you hire."
greedy_generate(txt)

[[[1.8734576e-07 1.8342716e-07 1.7764331e-04 ... 1.7861251e-07
   1.7817699e-07 1.6502331e-07]
  [6.2433728e-08 6.4332156e-08 2.0050879e-04 ... 5.6496720e-08
   6.8199931e-08 5.3223683e-08]
  [3.7897703e-08 4.0170516e-08 4.4989216e-04 ... 3.8002803e-08
   4.0246746e-08 3.4963413e-08]
  ...
  [2.7318838e-09 2.7340989e-09 6.5169775e-01 ... 2.6485698e-09
   2.6914488e-09 2.2888871e-09]
  [2.7318838e-09 2.7340989e-09 6.5169775e-01 ... 2.6485698e-09
   2.6914488e-09 2.2888871e-09]
  [2.7318801e-09 2.7340952e-09 6.5169811e-01 ... 2.6485714e-09
   2.6914504e-09 2.2888884e-09]]]
[[[1.8734576e-07 1.8342716e-07 1.7764331e-04 ... 1.7861251e-07
   1.7817699e-07 1.6502331e-07]
  [6.4514332e-08 7.1649204e-08 4.4800981e-06 ... 6.3175648e-08
   7.1098405e-08 5.7638221e-08]
  [4.3086434e-08 4.2886953e-08 1.1097053e-03 ... 4.2116444e-08
   4.3483006e-08 3.8520941e-08]
  ...
  [2.7318838e-09 2.7340989e-09 6.5169775e-01 ... 2.6485698e-09
   2.6914488e-09 2.2888871e-09]
  [2.7318838e-09 2.7340989e-09 6.516

'the best your business'

In [36]:
PAD_ID = tokenizer.pad_token_id
BOS_ID = tokenizer.bos_token_id
EOS_ID = tokenizer.eos_token_id

max_input_len   = training_model.input[0].shape[1]
max_target_len  = training_model.input[1].shape[1]

def encode_description(text):
    """Tokenise + pad / truncate to max_input_len."""
    ids = tokenizer.encode(text, add_special_tokens=False)
    ids = ids[:max_input_len]                      # truncate
    pad_len = max_input_len - len(ids)
    return np.asarray(ids + [PAD_ID] * pad_len, dtype=np.int32)

def greedy_generate(description, max_dec_len=max_target_len):
    """Generate one slogan with greedy decoding."""
    enc_inp = encode_description(description)[None, :]

    dec_ids = [BOS_ID]

    for _ in range(max_dec_len - 1):
        dec_inp = np.asarray(dec_ids + [PAD_ID] * (max_dec_len - len(dec_ids)),
                             dtype=np.int32)[None, :]

        logits = training_model.predict([enc_inp, dec_inp], verbose=0)
        next_id = int(np.argmax(logits[0, len(dec_ids)-1]))
        
        if next_id == EOS_ID:
            break
        dec_ids.append(next_id)

    return decode_tokens(tokenizer, dec_ids[1:])

for txt in examples:
    print("DESC :", txt[:90] + ("…" if len(txt) > 90 else ""))
    print("SLOG :", greedy_generate(txt))
    print("-" * 80)

DESC : Easily deliver personalized activities that enrich the lives of residents in older adult c…
SLOG : the best your business
--------------------------------------------------------------------------------
DESC : Powerful lead generation software that converts abandoning visitors into subscribers with …
SLOG : the best your business
--------------------------------------------------------------------------------
DESC : Twine matches companies to the best digital and creative freelancers from a network of ove…
SLOG : the best your business
--------------------------------------------------------------------------------
DESC : Looking for fresh web design & development? Need new marketing materials or a smart campai…
SLOG : the best your business
--------------------------------------------------------------------------------
DESC : Our expert team of Analytical Chemists provide eLiquid analysis & manufacturing services, …
SLOG : the best your business
-------------------------------

In [23]:
print(tokenizer.pad_token_id, tokenizer.pad_token)

1 <pad>


In [25]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [26]:
references = [[nltk.word_tokenize(ref)] for ref in true_texts] 
candidates = [nltk.word_tokenize(pred) for pred in pred_texts]

bleu = corpus_bleu(references, candidates)
print('BLEU:', bleu)

BLEU: 9.659641134380428e-232


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [27]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1_list, rouge2_list, rougel_list = [], [], []

for ref, pred in zip(true_texts, pred_texts):
    scores = scorer.score(ref, pred)
    rouge1_list.append(scores['rouge1'].fmeasure)
    rouge2_list.append(scores['rouge2'].fmeasure)
    rougel_list.append(scores['rougeL'].fmeasure)

print('ROUGE-1:', sum(rouge1_list)/len(rouge1_list))
print('ROUGE-2:', sum(rouge2_list)/len(rouge2_list))
print('ROUGE-L:', sum(rougel_list)/len(rougel_list))

ROUGE-1: 0.10795454545454544
ROUGE-2: 0.0
ROUGE-L: 0.10795454545454544


In [None]:
import matplotlib.pyplot as plt

# Data from the previous example
epochs = list(range(1, 28)) 
training_loss = [
    4.4428, 0.4468, 0.4195, 0.4061, 0.3957, 0.3839, 0.3748, 0.3609, 0.3503, 0.3350,
    0.3229, 0.3077, 0.2971, 0.2832, 0.2689, 0.2564, 0.2437, 0.2315, 0.2191, 0.2077,
    0.1965, 0.1867, 0.1756, 0.1675, 0.1587, 0.1514, 0.1435
]
validation_loss = [
    0.4670, 0.4281, 0.4187, 0.4112, 0.4043, 0.3970, 0.3904, 0.3828, 0.3746, 0.3680,
    0.3612, 0.3555, 0.3497, 0.3435, 0.3377, 0.3322, 0.3270, 0.3215, 0.3166, 0.3117,
    0.3071, 0.3018, 0.2975, 0.2928, 0.2891, 0.2854
]

# Create the plot
plt.figure(figsize=(10, 6))

# Plot training loss
plt.plot(epochs[:len(training_loss)], training_loss, label='Training Loss', color='orange', marker='o', linestyle='-')

# Plot validation loss
plt.plot(epochs[:len(validation_loss)], validation_loss, label='Validation Loss', color='orangered', marker='o', linestyle='-')

# Set title and labels
plt.title('Training and Validation Loss by Epoch', fontsize=16)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('Loss', fontsize=14)

# Add legend
plt.legend(fontsize=12)

# Add grid
plt.grid(True, linestyle='--', alpha=0.7, color='lightgrey')

# Adjust y-axis if your data is in a similar range to the image
# For the provided data, the loss is much lower. If your data matches the image's scale (e.g. 5-10):
# plt.ylim(5, max(max(training_loss, default=5), max(validation_loss, default=5)) * 1.05) 

# Remove top and right spines
ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Customize tick parameters for a cleaner look
ax.tick_params(axis='both', which='major', labelsize=12)

plt.show()