In [None]:
# Instalăm pachetele necesare
!pip install numpy pandas keras tensorflow datasets nltk beautifulsoup4

import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
import nltk
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Bidirectional, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
import warnings

In [None]:
# Initial settings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")
nltk.download('stopwords')
# Import the corpus reader under a private alias: binding the resulting set to
# the name `stopwords` would otherwise overwrite the reader itself, and running
# this cell a second time would fail with "'set' object has no attribute 'words'".
from nltk.corpus import stopwords as _stopwords_corpus
# `stopwords` is the set of Romanian stop words consulted by clean_text.
stopwords = set(_stopwords_corpus.words('romanian'))

In [None]:
# Load the dataset.
# Later cells read the 'train' and 'test' splits of this object and, within
# each split, the 'Content' (article) and 'Summary' columns.
from datasets import load_dataset
dataset = load_dataset("readerbench/ro-text-summarization")

In [None]:
# Functions for cleaning Romanian texts
def clean_text(text, stop_words=None):
    """Normalize an article body before tokenization.

    Steps: lowercase, map legacy cedilla letters to their comma-below forms,
    drop parenthesised fragments and double quotes, replace every character
    that is not a lowercase Latin or Romanian letter with a space, remove stop
    words and discard words shorter than three characters.

    Args:
        text: Raw article text.
        stop_words: Optional collection of words to remove. Defaults to the
            module-level ``stopwords`` set.

    Returns:
        The cleaned words joined by single spaces (possibly an empty string).
    """
    if stop_words is None:
        stop_words = stopwords
    text = text.lower()
    # Romanian text is frequently encoded with the cedilla letters U+015F and
    # U+0163 instead of the comma-below letters U+0219 and U+021B. Without this
    # mapping the letter filter below would turn them into spaces and split
    # words in the middle.
    text = text.replace("\u015f", "\u0219").replace("\u0163", "\u021b")
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub('"', '', text)
    text = re.sub(r"\'s\b", "", text)
    # Keep only a-z plus a-breve, i-circumflex, a-circumflex, s-comma, t-comma.
    text = re.sub("[^a-z\u0103\u00ee\u00e2\u0219\u021b]", " ", text)
    kept = [w for w in text.split() if w not in stop_words and len(w) >= 3]
    return " ".join(kept).strip()

In [None]:
def clean_summary(text):
    """Normalize a reference summary and wrap it in decoder boundary tokens.

    Steps: lowercase, map legacy cedilla letters to their comma-below forms,
    drop double quotes, replace every character that is not a lowercase Latin
    or Romanian letter with a space, and discard one-character words. Stop
    words are kept.

    Args:
        text: Raw summary text.

    Returns:
        The cleaned summary prefixed with "start " and suffixed with " end".
        An empty summary yields "start  end".
    """
    text = text.lower()
    # Map cedilla U+015F / U+0163 to comma-below U+0219 / U+021B so the letter
    # filter below does not turn them into spaces and split words.
    text = text.replace("\u015f", "\u0219").replace("\u0163", "\u021b")
    text = re.sub('"', '', text)
    text = re.sub(r"\'s\b", "", text)
    # Keep only a-z plus a-breve, i-circumflex, a-circumflex, s-comma, t-comma.
    text = re.sub("[^a-z\u0103\u00ee\u00e2\u0219\u021b]", " ", text)
    tokens = [w for w in text.split() if len(w) > 1]
    # NOTE(review): "start"/"end" are ordinary words and could also occur inside
    # a summary; the decoding cells look these exact tokens up, so they are kept.
    return "start " + " ".join(tokens) + " end"

In [None]:
# Clean the articles and the summaries of both splits
train_split = dataset['train']
train_texts = list(map(clean_text, train_split['Content']))
train_summaries = list(map(clean_summary, train_split['Summary']))

test_split = dataset['test']
test_texts = list(map(clean_text, test_split['Content']))
test_summaries = list(map(clean_summary, test_split['Summary']))

In [None]:
# Fixed sequence lengths used when padding:
# articles are padded to 100 token ids, summaries to 20 token ids
# (the summary budget includes the "start" and "end" tokens added by clean_summary).
max_text_len, max_summary_len = 100, 20

In [None]:
# Tokenize the source texts
x_tokenizer = Tokenizer(num_words=30000)
x_tokenizer.fit_on_texts(train_texts)
train_texts_seq = x_tokenizer.texts_to_sequences(train_texts)
test_texts_seq = x_tokenizer.texts_to_sequences(test_texts)

# Pad with zeros at the end. Over-long articles are truncated with the
# pad_sequences default (truncating='pre'), i.e. their last max_text_len
# tokens are kept.
train_texts_pad = pad_sequences(train_texts_seq, maxlen=max_text_len, padding='post')
test_texts_pad = pad_sequences(test_texts_seq, maxlen=max_text_len, padding='post')

# word_index lists every word seen during fitting, but texts_to_sequences only
# emits ids below num_words. Sizing the embedding by the full word_index would
# allocate rows that can never be addressed, so cap the size at num_words.
x_voc_size = min(x_tokenizer.num_words, len(x_tokenizer.word_index) + 1)

In [None]:
# Tokenize the summaries
y_tokenizer = Tokenizer(num_words=8000)
y_tokenizer.fit_on_texts(train_summaries)
train_summaries_seq = y_tokenizer.texts_to_sequences(train_summaries)
test_summaries_seq = y_tokenizer.texts_to_sequences(test_summaries)

# pad_sequences truncates from the front by default, which would cut the
# leading "start" token off every summary longer than max_summary_len.
# Clip at the end instead and re-append the "end" id so that every padded
# summary still opens with "start" and closes with "end".
end_token_id = y_tokenizer.word_index['end']


def _clip_summary(seq):
    """Return seq limited to max_summary_len ids, always terminated by 'end'."""
    if len(seq) <= max_summary_len:
        return seq
    return seq[:max_summary_len - 1] + [end_token_id]


train_summaries_pad = pad_sequences([_clip_summary(s) for s in train_summaries_seq],
                                    maxlen=max_summary_len, padding='post')
test_summaries_pad = pad_sequences([_clip_summary(s) for s in test_summaries_seq],
                                   maxlen=max_summary_len, padding='post')

# texts_to_sequences only emits ids below num_words, so the output layer needs
# at most num_words classes rather than one per entry of the full word_index.
y_voc_size = min(y_tokenizer.num_words, len(y_tokenizer.word_index) + 1)

In [None]:
# Clear the Keras session.
# The backend is taken from tensorflow.keras, the same package every layer and
# model in this notebook is imported from, instead of the standalone keras package.
from tensorflow.keras import backend as K
K.clear_session()

In [None]:
# Latent dimension and model definition
# latent_dim is the embedding size of the encoder and the unit count of each
# direction of the encoder LSTM; the decoder uses latent_dim * 2.
latent_dim = 256

# Encoder
# One padded article per sample: max_text_len token ids.
encoder_inputs = Input(shape=(max_text_len,))
# Trainable embedding that maps each source token id to a latent_dim vector.
enc_emb = Embedding(x_voc_size, latent_dim, trainable=True)(encoder_inputs)

In [None]:
# Bidirectional LSTM in the encoder
# return_state=True makes the wrapper return, after the output sequence, the
# hidden and cell state of the forward layer followed by those of the backward layer.
encoder_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_emb)

# Join the forward and backward states; the concatenated states are passed to
# the decoder LSTM as its initial state.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

In [None]:
# Decoder
# Variable-length input of target token ids (shape=(None,)).
decoder_inputs = Input(shape=(None,))
# The embedding layer is kept in its own variable because the inference
# decoder built later calls the same layer again.
dec_emb_layer = Embedding(y_voc_size, latent_dim * 2, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# latent_dim * 2 units so the concatenated encoder states can be used as the
# initial state. The returned states are ignored during training.
decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])

# Softmax over the target vocabulary, applied at every time step.
decoder_dense = TimeDistributed(Dense(y_voc_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Full training model: (article ids, decoder input ids) -> per-step word probabilities
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [None]:
# Compile the model.
# The sparse loss takes integer token ids as targets, so no one-hot encoding is needed.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
)
model.summary()

In [None]:
# Train the model
# Stop once val_loss has not improved for 3 epochs. restore_best_weights=True
# rolls the model back to the best epoch; without it the model keeps the
# weights of the last (worse) epoch that triggered the stop.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3,
                   restore_best_weights=True)

# Teacher forcing: the decoder is fed every token except the last one and is
# trained to predict the same sequence shifted left by one position. The
# trailing axis of size 1 keeps the target shape used originally.
train_decoder_inputs = train_summaries_pad[:, :-1]
train_targets = train_summaries_pad[:, 1:, np.newaxis]
val_decoder_inputs = test_summaries_pad[:, :-1]
val_targets = test_summaries_pad[:, 1:, np.newaxis]

# NOTE(review): the test split doubles as the validation set here, so early
# stopping is tuned on the same data that is later reported as "test".
history = model.fit([train_texts_pad, train_decoder_inputs], train_targets,
                    epochs=50, callbacks=[es], batch_size=64,
                    validation_data=([test_texts_pad, val_decoder_inputs], val_targets))

In [None]:
# Plot the training and validation loss per epoch
from matplotlib import pyplot as plt

fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
# The validation data passed to fit is the test split, hence the label.
ax.plot(history.history['val_loss'], label='test')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('Loss per epoch')
ax.legend()
plt.show()

In [None]:
# Preparation for decoding
# Lookup tables: id -> word for both vocabularies, and word -> id for the target side.
reverse_target_word_index = y_tokenizer.index_word
reverse_source_word_index = x_tokenizer.index_word
target_word_index = y_tokenizer.word_index

# Inference encoder: article ids -> (per-step outputs, concatenated h, concatenated c).
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, state_h, state_c])

# Placeholders for the decoder state carried from one decoding step to the next.
decoder_state_input_h = Input(shape=(latent_dim * 2,))
decoder_state_input_c = Input(shape=(latent_dim * 2,))
# NOTE(review): this input is listed among the decoder model inputs below but no
# layer in this cell consumes it. It cannot simply be dropped because
# decode_sequence passes the encoder outputs in this position.
decoder_hidden_state_input = Input(shape=(max_text_len, latent_dim * 2))

# Reuse the trained embedding layer on the decoder input.
dec_emb2 = dec_emb_layer(decoder_inputs)

# Reuse the trained LSTM and output layer, but start from externally supplied
# states and expose the updated states so they can be fed back in.
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=[decoder_state_input_h, decoder_state_input_c])
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Inference decoder: (token ids, encoder outputs, h, c) -> (word probabilities, new h, new c).
decoder_model = Model(
    [decoder_inputs] + [decoder_hidden_state_input, decoder_state_input_h, decoder_state_input_c],
    [decoder_outputs2] + [state_h2, state_c2])

In [None]:
# Function for decoding a sequence
def decode_sequence(input_seq):
    """Greedily generate a summary for one padded article.

    Args:
        input_seq: Array of source token ids for a single article, shaped as
            expected by ``encoder_model`` (the caller passes
            ``(1, max_text_len)``).

    Returns:
        The predicted words joined by single spaces. The "end" token is not
        included and at most ``max_summary_len - 1`` words are produced.
    """
    e_out, e_h, e_c = encoder_model.predict(input_seq, verbose=0)

    # Decoding starts from the "start" token.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['start']

    decoded_tokens = []
    # Bound the loop by the number of decoding steps, not by the number of
    # words collected: an id without a word (e.g. padding id 0) adds no word,
    # so a word-count limit alone may never be reached.
    for _ in range(max_summary_len - 1):
        # verbose=0 suppresses the progress bar printed by every predict call.
        output_tokens, e_h, e_c = decoder_model.predict(
            [target_seq, e_out, e_h, e_c], verbose=0)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_token = reverse_target_word_index.get(sampled_token_index, '')
        if sampled_token == 'end':
            break
        if sampled_token:
            decoded_tokens.append(sampled_token)
        # Feed the sampled id back in as the next decoder input.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
    return ' '.join(decoded_tokens)

In [None]:
# Functions converting id sequences back to text (summaries and articles)
def seq2summary(input_seq):
    """Turn a sequence of target ids into text.

    Padding (id 0) and the ids of the "start" and "end" tokens are left out;
    ids without a known word contribute an empty string.
    """
    words = [
        reverse_target_word_index.get(token_id, '')
        for token_id in input_seq
        if not (token_id == 0
                or token_id == target_word_index['start']
                or token_id == target_word_index['end'])
    ]
    return ' '.join(words).strip()

def seq2text(input_seq):
    """Turn a sequence of source ids into text.

    Padding (id 0) is left out; ids without a known word contribute an empty
    string.
    """
    words = [
        reverse_source_word_index.get(token_id, '')
        for token_id in input_seq
        if token_id != 0
    ]
    return ' '.join(words).strip()

In [None]:
# Try the summarizer on the first five training examples
for sample_idx in range(5):
    source_row = train_texts_pad[sample_idx]
    print("Text:", seq2text(source_row))
    reference_row = train_summaries_pad[sample_idx]
    print("Rezumat Original:", seq2summary(reference_row))
    # The inference encoder expects a batch dimension in front.
    model_input = source_row.reshape(1, max_text_len)
    print("Rezumat Prezis:", decode_sequence(model_input))
    print("\n\n")