In [None]:
%%capture
%pip install transformers pandas keybert scikit-learn rouge_score bert_score hf_xet keybert

In [None]:
import pandas as pd
import numpy as np
import os

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForSequenceClassification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from keybert import KeyBERT

from rouge_score import rouge_scorer
from bert_score import score as bert_score

## Preprocess data

In [None]:
# Preprocess dataset to extract title keywords
def preprocess_dataset(input_csv, output_csv):
    """Add a ``title_keywords`` column built with KeyBERT and save the result.

    Args:
        input_csv: Path of a CSV file that contains a ``title`` column.
        output_csv: Path the augmented frame is written to (index omitted).

    Returns:
        The DataFrame read from ``input_csv`` with the extra ``title_keywords``
        column. Each value is the extracted key phrases joined by single
        spaces, or ``''`` when the title is missing/blank or extraction fails.
    """
    df = pd.read_csv(input_csv)
    kw_model = KeyBERT()
    failed_titles = []

    def extract_keywords(title):
        # Missing titles arrive as NaN (a float); there is nothing to extract
        # from them, so skip the model call entirely.
        if not isinstance(title, str) or not title.strip():
            return ''
        try:
            keywords = kw_model.extract_keywords(
                title, keyphrase_ngram_range=(1, 7), top_n=10
            )
        except Exception:
            # Best effort: one bad title must not abort the whole split, but
            # remember it so the failure is reported instead of hidden.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            failed_titles.append(title)
            return ''
        # Phrases are space-joined; downstream code splits on whitespace.
        return ' '.join([kw[0] for kw in keywords])

    df['title_keywords'] = df['title'].apply(extract_keywords)
    if failed_titles:
        print(
            f"Keyword extraction failed for {len(failed_titles)} title(s); "
            "their keywords were left empty."
        )
    df.to_csv(output_csv, index=False)
    return df

"""
def preprocess_dataset(input_csv, output_csv):
    df = pd.read_csv(input_csv)
    
    # Load Flan-T5-small model and tokenizer
    from transformers import AutoModelForSeq2SeqLM
    flan_t5_tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-small')
    flan_t5_model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-small')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    flan_t5_model.to(device)
    
    def extract_keywords(title):
        try:
            # Prompt to return only keywords, comma-separated
            prompted_title = f"Return ONLY KEYWORDS from the following text, separated by commas: {title}"
            # Tokenize input
            inputs = flan_t5_tokenizer(
                prompted_title,
                return_tensors='pt',
                max_length=512,
                truncation=True,
                padding=True
            ).to(device)
            
            # Generate output
            with torch.no_grad():
                outputs = flan_t5_model.generate(
                    **inputs,
                    max_length=100,
                    num_beams=5,
                    early_stopping=True
                )
            
            # Decode generated text
            keywords = flan_t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Ensure clean comma-separated format
            if not keywords.strip():
                return ''
            keyword_list = [kw.strip() for kw in keywords.split(',') if kw.strip()]
            return ','.join(keyword_list)
        except:
            return ''
    
    df['title_keywords'] = df['title'].apply(extract_keywords)
    df.to_csv(output_csv, index=False)
    return df
"""

# Custom Dataset for flan_t5
class KeywordDataset(Dataset):
    """Abstract -> multi-hot keyword vector dataset for the keyword classifier.

    Args:
        csv_file: CSV with ``abstract`` and ``title_keywords`` columns.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors of shape ``(1, max_length)``.
        mlb: Fitted binarizer whose ``transform`` maps a list of keyword lists
            to a multi-hot matrix.
        max_length: Sequence length abstracts are padded/truncated to.
    """

    def __init__(self, csv_file, tokenizer, mlb, max_length=512):
        self.df = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.mlb = mlb
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing cells (NaN) to ``''``."""
        return '' if pd.isna(value) else str(value)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and float32 ``labels`` for row ``idx``."""
        row = self.df.iloc[idx]
        abstract = self._as_text(row['abstract'])
        # An empty keyword string is stored as an empty CSV field and read back
        # as NaN; str(NaN) would otherwise become the bogus keyword 'nan'.
        keywords = self._as_text(row['title_keywords']).split()

        encoding = self.tokenizer(
            abstract,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = torch.tensor(self.mlb.transform([keywords])[0], dtype=torch.float32)

        return {
            # Drop only the batch dimension; a bare squeeze() would also
            # collapse the sequence dimension when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

# Training Function
def train_flan_t5(model, train_loader, val_loader, device, epochs=3, lr=2e-5):
    """Fine-tune a multi-label classifier with BCE-with-logits loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        train_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and float ``labels`` tensors) used for updates.
        val_loader: Iterable of batches of the same form, evaluated without
            gradients after every epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.

    Prints the average training and validation loss of every epoch. An empty
    loader is reported as ``nan`` instead of raising ``ZeroDivisionError``.
    """
    optimizer = AdamW(model.parameters(), lr=lr)
    criterion = nn.BCEWithLogitsLoss()

    def run_epoch(loader, training):
        """Run one pass over ``loader`` and return the mean batch loss."""
        total_loss = 0.0
        num_batches = 0
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            total_loss += loss.item()
            num_batches += 1

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        # Batches are counted rather than taken from len(loader) so that an
        # empty loader cannot divide by zero.
        return total_loss / num_batches if num_batches else float('nan')

    for epoch in range(epochs):
        model.train()
        avg_train_loss = run_epoch(train_loader, training=True)
        print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}")

        model.eval()
        with torch.no_grad():
            avg_val_loss = run_epoch(val_loader, training=False)
        print(f"Epoch {epoch + 1}, Average Validation Loss: {avg_val_loss:.4f}")


In [None]:
# File paths
train_csv = '/kaggle/input/springerjournal-450tk-0-7cosine/train.csv'
val_csv = '/kaggle/input/springerjournal-450tk-0-7cosine/val.csv'
processed_train_csv = '/kaggle/working/processed_train.csv'
processed_val_csv = '/kaggle/working/processed_val.csv'

# Preprocess datasets
preprocess_dataset(train_csv, processed_train_csv)
preprocess_dataset(val_csv, processed_val_csv)

# Initialize device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load keyword vocabulary
train_data = pd.read_csv(processed_train_csv)
# Rows without keywords are written as empty CSV fields and come back from
# read_csv as NaN. NaN is a truthy float, so calling .split() on it would raise
# AttributeError; normalise to '' first so such rows yield an empty list.
train_keywords = (
    train_data['title_keywords']
    .fillna('')
    .apply(lambda text: str(text).split())
    .values
)
# Limit vocabulary size to avoid memory issues: keep the 1000 most frequent
# keyword tokens of the training split.
keyword_counts = {}
for sublist in train_keywords:
    for kw in sublist:
        keyword_counts[kw] = keyword_counts.get(kw, 0) + 1
top_keywords = sorted(keyword_counts.items(), key=lambda x: x[1], reverse=True)[:1000]
keyword_vocab = [kw for kw, _ in top_keywords]
mlb = MultiLabelBinarizer(classes=keyword_vocab)
# Fit the MultiLabelBinarizer with training keywords
mlb.fit(train_keywords)

# Initialize tokenizer and datasets
# NOTE(review): the tokenizer checkpoint is flan-t5-base while the classifier
# trained later in this notebook is flan-t5-small; confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
train_dataset = KeywordDataset(processed_train_csv, tokenizer, mlb)
val_dataset = KeywordDataset(processed_val_csv, tokenizer, mlb)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

from IPython.display import clear_output
clear_output()

## Train keyword extractor

In [None]:
# Initialize flan_t5 model
model = AutoModelForSequenceClassification.from_pretrained(
    'google/flan-t5-small',
    num_labels=len(keyword_vocab),
    problem_type='multi_label_classification'
).to(device)

# Train flan_t5
train_flan_t5(model, train_loader, val_loader, device, epochs=3, lr=2e-5)

# Save the model and its tokenizer side by side so both can be reloaded from
# one directory. The path is named once so the three uses cannot drift apart.
flan_t5_output_dir = '/kaggle/working/finetuned_flan_t5_keywords'
os.makedirs(flan_t5_output_dir, exist_ok=True)
model.save_pretrained(flan_t5_output_dir)
tokenizer.save_pretrained(flan_t5_output_dir)

# Save keyword vocabulary, one keyword per line; line i is the keyword for
# classifier output index i.
keyword_vocab_path = '/kaggle/working/keyword_vocab.txt'
with open(keyword_vocab_path, 'w') as f:
    f.write('\n'.join(keyword_vocab))

In [None]:
# flan_t5-based keyword prediction function
def predict_keywords(abstract, flan_t5_model, flan_t5_tokenizer, keyword_vocab, max_len=512, device='cuda', threshold=0.5):
    """Predict the vocabulary keywords that apply to one abstract.

    Args:
        abstract: Text of a single abstract.
        flan_t5_model: Multi-label classifier whose output exposes ``.logits``
            with one row per input text.
        flan_t5_tokenizer: Tokenizer matching ``flan_t5_model``.
        keyword_vocab: Sequence of keywords; entry ``i`` names logit ``i``.
        max_len: Length the abstract is padded/truncated to.
        device: Device the encoded tensors are moved to (the model is expected
            to live there already).
        threshold: Minimum sigmoid probability for a keyword to be returned.

    Returns:
        List of keywords, in vocabulary order, whose probability exceeds
        ``threshold``.
    """
    flan_t5_model.eval()
    encoding = flan_t5_tokenizer(
        abstract,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        outputs = flan_t5_model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device)
        )

    # Index the single batch row instead of squeeze(): with a one-keyword
    # vocabulary squeeze() yields a 0-dim tensor that cannot be iterated.
    probabilities = torch.sigmoid(outputs.logits)[0].tolist()
    return [
        keyword
        for keyword, probability in zip(keyword_vocab, probabilities)
        if probability > threshold
    ]

# Custom Dataset
class ScientificPaperDataset(Dataset):
    """Abstract -> title dataset whose prompt is augmented with predicted keywords.

    Args:
        csv_file: CSV with ``abstract``, ``title`` and ``title_keywords`` columns.
        tokenizer: Tokenizer of the title-generation model; used for both the
            prompt and the target title.
        flan_t5_model: Keyword classifier passed to ``predict_keywords``.
        flan_t5_tokenizer: Tokenizer of the keyword classifier.
        keyword_vocab: Keyword list aligned with the classifier outputs.
        max_length: Length the prompt is padded/truncated to.
        device: Device used for keyword prediction.
    """

    def __init__(self, csv_file, tokenizer, flan_t5_model, flan_t5_tokenizer, keyword_vocab, max_length=512, device='cuda'):
        self.df = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.flan_t5_model = flan_t5_model
        self.flan_t5_tokenizer = flan_t5_tokenizer
        self.keyword_vocab = keyword_vocab
        self.max_length = max_length
        self.device = device
        # Row index -> predicted keyword text. The classifier is not updated
        # while this dataset is in use, so each row is predicted only once
        # instead of once per epoch.
        self._keyword_cache = {}

    def __len__(self):
        return len(self.df)

    def _predicted_keywords_text(self, idx, abstract):
        """Return the space-joined predicted keywords for row ``idx`` (cached)."""
        key = int(idx)
        if key not in self._keyword_cache:
            abstract_keywords = predict_keywords(
                abstract, self.flan_t5_model, self.flan_t5_tokenizer,
                self.keyword_vocab, device=self.device
            )
            self._keyword_cache[key] = ' '.join(abstract_keywords) if abstract_keywords else ''
        return self._keyword_cache[key]

    def __getitem__(self, idx):
        """Return encoded prompt, label ids and the raw strings for row ``idx``."""
        row = self.df.iloc[idx]
        abstract = str(row['abstract'])
        title = str(row['title'])
        # Empty keyword cells are read back from CSV as NaN; str(NaN) would
        # otherwise leak the literal word 'nan' into the keyword comparison.
        raw_title_keywords = row['title_keywords']
        title_keywords = '' if pd.isna(raw_title_keywords) else str(raw_title_keywords)

        abstract_keywords_text = self._predicted_keywords_text(idx, abstract)

        input_text = f"Generate a concise and informative title based on this abstract of scientific research:\n{abstract}\nKeywords: {abstract_keywords_text}"

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        title_encoding = self.tokenizer(
            title,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        title_attention_mask = title_encoding['attention_mask'].squeeze(0)
        # `title_input_ids` is consumed as the seq2seq `labels`. Hugging Face
        # losses ignore positions equal to -100, so padded positions are
        # masked; otherwise most of the loss would be spent on predicting
        # padding after the end of the title.
        title_input_ids = title_encoding['input_ids'].squeeze(0).clone()
        title_input_ids[title_attention_mask == 0] = -100

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'title_input_ids': title_input_ids,
            'title_attention_mask': title_attention_mask,
            'title_keywords': title_keywords,
            'abstract_keywords': abstract_keywords_text,
            'title': title
        }

from transformers import BartForConditionalGeneration, BartTokenizerFast


# Multi-task BART Model
class MultiTaskBART(nn.Module):
    """BART title generator that also reports a keyword-overlap loss.

    Attributes are the seq2seq model (``bart``), its matching tokenizer
    (``tokenizer``) and a TF-IDF vectorizer (``tfidf``) used to compare
    keyword strings.
    """

    def __init__(self, model_name='facebook/bart-base'):
        super(MultiTaskBART, self).__init__()
        self.bart = BartForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = BartTokenizerFast.from_pretrained(model_name)
        self.tfidf = TfidfVectorizer(max_features=1000)

    def forward(self, input_ids, attention_mask, title_input_ids, title_keywords, abstract_keywords):
        """Compute the title-generation loss and the keyword-overlap loss.

        Args:
            input_ids: Encoded prompts, shape ``(batch, seq)``.
            attention_mask: Attention mask for ``input_ids``.
            title_input_ids: Target token ids, passed to BART as ``labels``.
            title_keywords: Keyword string, or a batch (list) of keyword
                strings, extracted from the reference titles.
            abstract_keywords: Keyword string(s) predicted from the abstracts,
                aligned with ``title_keywords``.

        Returns:
            ``(title_loss, keyword_loss)``. ``keyword_loss`` is the mean of
            ``1 - cosine`` TF-IDF similarity over the batch.
        """
        outputs = self.bart(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=title_input_ids
        )
        title_loss = outputs.loss
        keyword_loss = self._keyword_loss(title_keywords, abstract_keywords, input_ids.device)
        return title_loss, keyword_loss

    def _keyword_loss(self, title_keywords, abstract_keywords, device):
        """Mean TF-IDF cosine distance between paired keyword strings.

        The value is built from strings only, so it carries no gradient: it
        changes the reported loss but not the parameter updates.
        """
        # A DataLoader collates per-sample strings into lists. Each pair has
        # to be vectorized on its own; handing the two lists to TF-IDF as if
        # they were two documents fails and used to yield a constant 0 loss.
        title_docs = [title_keywords] if isinstance(title_keywords, str) else list(title_keywords)
        abstract_docs = [abstract_keywords] if isinstance(abstract_keywords, str) else list(abstract_keywords)

        distances = []
        for title_doc, abstract_doc in zip(title_docs, abstract_docs):
            try:
                vectors = self.tfidf.fit_transform([title_doc, abstract_doc]).toarray()
            except ValueError:
                # Raised when neither string has a usable token (empty
                # vocabulary); such a pair carries no signal, so skip it.
                continue
            title_kw_vector = torch.tensor(vectors[0], dtype=torch.float32)
            abstract_kw_vector = torch.tensor(vectors[1], dtype=torch.float32)
            cosine_sim = nn.functional.cosine_similarity(title_kw_vector, abstract_kw_vector, dim=0)
            distances.append(1 - cosine_sim)

        if not distances:
            return torch.tensor(0.0, device=device)
        return torch.stack(distances).mean().to(device)

## Training and Validation

In [None]:
# Training and Validation Function
def train_and_validate(model, train_loader, val_loader, device, epochs=3, lr=5e-5, title_loss_weight=1.0, keyword_loss_weight=0.5):
    """Train the multi-task title model and report the loss after each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            title_input_ids, title_keywords, abstract_keywords)`` and
            returning ``(title_loss, keyword_loss)``.
        train_loader: Iterable of batches (dicts with those five keys) used
            for parameter updates.
        val_loader: Iterable of batches of the same form, evaluated without
            gradients after every epoch.
        device: Device the tensor inputs are moved to.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.
        title_loss_weight: Weight of the title-generation loss.
        keyword_loss_weight: Weight of the keyword loss.

    Prints the average weighted loss for training and validation each epoch.
    An empty loader is reported as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    optimizer = AdamW(model.parameters(), lr=lr)

    def run_epoch(loader, training):
        """Run one pass over ``loader`` and return the mean weighted loss."""
        total_loss = 0.0
        num_batches = 0
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            title_input_ids = batch['title_input_ids'].to(device)
            # Keyword fields are plain strings and stay on the host.
            title_keywords = batch['title_keywords']
            abstract_keywords = batch['abstract_keywords']

            title_loss, keyword_loss = model(
                input_ids, attention_mask, title_input_ids,
                title_keywords, abstract_keywords
            )
            loss = title_loss_weight * title_loss + keyword_loss_weight * keyword_loss
            total_loss += loss.item()
            num_batches += 1

            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        # Batches are counted rather than taken from len(loader) so that an
        # empty loader cannot divide by zero.
        return total_loss / num_batches if num_batches else float('nan')

    for epoch in range(epochs):
        model.train()
        avg_train_loss = run_epoch(train_loader, training=True)
        print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss:.4f}")

        model.eval()
        with torch.no_grad():
            avg_val_loss = run_epoch(val_loader, training=False)
        print(f"Epoch {epoch + 1}, Average Validation Loss: {avg_val_loss:.4f}")

# Evaluation Function with ROUGE and BERTScore
def evaluate_model(model, test_loader, device, output_csv='/kaggle/working/generated_titles.csv'):
    """Generate titles for the test set and score them with ROUGE and BERTScore.

    Args:
        model: Object exposing ``bart.generate`` and ``tokenizer.decode``.
        test_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and the reference ``title`` strings.
        device: Device the tensor inputs are moved to.
        output_csv: Where the reference/generated title pairs are written.

    Returns:
        ``(avg_rouge, bertscore_f1)``: a dict of mean ROUGE F-measures keyed
        by ``rouge1``/``rouge2``/``rougeL``/``rougeLsum`` and the mean
        BERTScore F1 (``0.0`` when BERTScore could not be computed).
    """
    model.eval()
    generated_titles = []
    reference_titles = []
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': [], 'rougeLsum': []}

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            titles = batch['title']

            # Generate titles
            generated_ids = model.bart.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=32,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )
            generated_title = [
                model.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for g in generated_ids
            ]

            generated_titles.extend(generated_title)
            reference_titles.extend(titles)

            # Compute ROUGE scores
            for gen, ref in zip(generated_title, titles):
                scores = scorer.score(ref, gen)
                for metric in rouge_scores:
                    rouge_scores[metric].append(scores[metric].fmeasure)

    # Compute BERTScore. Kept best-effort so a scorer failure does not discard
    # the generated titles, but the failure is reported rather than hidden
    # behind a silent 0.0 (and Ctrl-C is no longer swallowed).
    bertscore_f1 = 0.0
    if generated_titles:
        try:
            _, _, F1 = bert_score(generated_titles, reference_titles, lang="en", verbose=True)
            bertscore_f1 = F1.mean().item()
        except Exception as exc:
            print(f"BERTScore could not be computed ({exc!r}); reporting 0.0.")

    # Average ROUGE scores; an empty test set yields nan without the numpy
    # "mean of empty slice" warning.
    avg_rouge = {
        metric: (np.mean(scores) if scores else float('nan'))
        for metric, scores in rouge_scores.items()
    }

    # Print results
    print("\nEvaluation Results:")
    print(f"ROUGE-1: {avg_rouge['rouge1']:.4f}")
    print(f"ROUGE-2: {avg_rouge['rouge2']:.4f}")
    print(f"ROUGE-L: {avg_rouge['rougeL']:.4f}")
    print(f"ROUGE-Lsum: {avg_rouge['rougeLsum']:.4f}")
    print(f"BERTScore F1: {bertscore_f1:.4f}")

    # Save generated titles
    results_df = pd.DataFrame({
        'reference_title': reference_titles,
        'generated_title': generated_titles
    })
    results_df.to_csv(output_csv, index=False)

    return avg_rouge, bertscore_f1

In [None]:
# File paths
test_csv = '/kaggle/input/springerjournal-450tk-0-7cosine/test.csv'
processed_test_csv = '/kaggle/working/processed_test.csv'
keyword_vocab_path = '/kaggle/working/keyword_vocab.txt'

# Preprocess datasets. KeyBERT extraction is slow, so a split whose processed
# file already exists (train/val are written by an earlier cell) is reused.
for raw_csv, processed_csv in (
    (train_csv, processed_train_csv),
    (val_csv, processed_val_csv),
    (test_csv, processed_test_csv),
):
    if not os.path.exists(processed_csv):
        preprocess_dataset(raw_csv, processed_csv)

# Initialize device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load flan_t5 model and tokenizer for keyword prediction
flan_t5_model_path = '/kaggle/working/finetuned_flan_t5_keywords'
if not os.path.exists(flan_t5_model_path):
    raise FileNotFoundError(f"flan_t5 model directory {flan_t5_model_path} does not exist. Please run the fine-tuning script first.")
flan_t5_model = AutoModelForSequenceClassification.from_pretrained(flan_t5_model_path).to(device)
flan_t5_tokenizer = AutoTokenizer.from_pretrained(flan_t5_model_path)

# Load keyword vocabulary
if not os.path.exists(keyword_vocab_path):
    raise FileNotFoundError(f"Keyword vocabulary file {keyword_vocab_path} does not exist. Please run the fine-tuning script first.")
with open(keyword_vocab_path, 'r') as f:
    keyword_vocab = [line.strip() for line in f]

# Initialize BART model
bart_model = MultiTaskBART().to(device)

# The prompts and target titles must be encoded with the tokenizer that belongs
# to the model being trained. Encoding them with the flan-t5 tokenizer feeds
# BART token ids from a different vocabulary, and evaluation then decodes the
# generated ids with the BART tokenizer, which produces unrelated text.
bart_tokenizer = bart_model.tokenizer
# Old name kept as an alias in case later cells still refer to it.
t5_tokenizer = bart_tokenizer

# Initialize datasets
train_dataset = ScientificPaperDataset(processed_train_csv, bart_tokenizer, flan_t5_model, flan_t5_tokenizer, keyword_vocab, device=device)
val_dataset = ScientificPaperDataset(processed_val_csv, bart_tokenizer, flan_t5_model, flan_t5_tokenizer, keyword_vocab, device=device)
test_dataset = ScientificPaperDataset(processed_test_csv, bart_tokenizer, flan_t5_model, flan_t5_tokenizer, keyword_vocab, device=device)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Train and validate
train_and_validate(bart_model, train_loader, val_loader, device, epochs=3, lr=5e-5,
                    title_loss_weight=1.0, keyword_loss_weight=0.5)

# Evaluate on test set
rouge_results, bertscore_result = evaluate_model(bart_model, test_loader, device)

# Save the model together with its own tokenizer so the directory can be
# reloaded as a self-consistent checkpoint.
bart_model.bart.save_pretrained('/kaggle/working/multitask_bart_model')
bart_tokenizer.save_pretrained('/kaggle/working/multitask_bart_model')