# **Model from Claude.AI By Antropic**

In [None]:
# !pip install transformers torch pandas tqdm wandb scikit-learn
!pip install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-joqn2p9o
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-joqn2p9o
  Resolved https://github.com/huggingface/transformers to commit 19dabe96362803fb0a9ae7073d03533966598b17
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.47.0.dev0-py3-none-any.whl size=10106394 sha256=e63001c4acbba6c2d0885d6da334fadbe7d68d639825f63984581745389cf984
  Stored in directory: /tmp/pip-ephem-wheel-cache-ou5xc1n0/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformer

In [None]:
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
import numpy as np
from tqdm import tqdm
import wandb
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from typing import List, Dict

In [None]:
class EarlyStopping:
    """Early stopping to prevent overfitting"""
    def __init__(self, patience=3, min_delta=0, verbose=False):
        self.patience = patience
        self.min_delta = min_delta
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.val_loss_min = np.Inf

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0

class TrainingHistory:
    """Track and plot training metrics"""
    def __init__(self):
        self.train_losses: List[float] = []
        self.val_losses: List[float] = []
        self.learning_rates: List[float] = []

    def update(self, train_loss: float, val_loss: float, lr: float):
        self.train_losses.append(train_loss)
        self.val_losses.append(val_loss)
        self.learning_rates.append(lr)

    def plot_losses(self, save_path: str = 'learning_curves.png'):
        plt.figure(figsize=(10, 5))
        plt.plot(self.train_losses, label='Training Loss')
        plt.plot(self.val_losses, label='Validation Loss')
        plt.title('Training and Validation Losses')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True)
        plt.savefig(save_path)
        plt.close()

    def plot_lr(self, save_path: str = 'learning_rate.png'):
        plt.figure(figsize=(10, 5))
        plt.plot(self.learning_rates)
        plt.title('Learning Rate over Time')
        plt.xlabel('Epoch')
        plt.ylabel('Learning Rate')
        plt.grid(True)
        plt.savefig(save_path)
        plt.close()

In [None]:
class KeywordDataset(Dataset):
    def __init__(self, abstracts, keywords, tokenizer, max_length=512):
        self.abstracts = abstracts
        self.keywords = keywords
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.abstracts)

    def __getitem__(self, idx):
        abstract = f"Generate keywords: {self.abstracts[idx]}"
        keywords = self.keywords[idx]

        inputs = self.tokenizer(
            abstract,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        targets = self.tokenizer(
            keywords,
            padding='max_length',
            truncation=True,
            max_length=128,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }

def print_gpu_memory():
    """Print GPU memory usage"""
    if torch.cuda.is_available():
        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"GPU memory cached: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

In [None]:
def train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs,
                device, accumulation_steps=2, fp16=True, patience=3):
    """
    Training loop with early stopping and optimization techniques
    """
    print_gpu_memory()

    # Initialize wandb
    wandb.init(project="keyword-extraction")

    # Initialize training utilities
    scaler = GradScaler() if fp16 else None
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    history = TrainingHistory()
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_train_loss = 0
        optimizer.zero_grad()

        train_pbar = tqdm(train_loader, desc=f'Training Epoch {epoch+1}')
        for batch_idx, batch in enumerate(train_pbar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast(enabled=fp16):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss / accumulation_steps

            if fp16:
                scaler.scale(loss).backward()
                if (batch_idx + 1) % accumulation_steps == 0:
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    optimizer.zero_grad()
            else:
                loss.backward()
                if (batch_idx + 1) % accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

            total_train_loss += loss.item() * accumulation_steps
            train_pbar.set_postfix({'train_loss': loss.item() * accumulation_steps})

        avg_train_loss = total_train_loss / len(train_loader)
        print_gpu_memory()

        # Validation phase
        model.eval()
        total_val_loss = 0

        with torch.no_grad():
            val_pbar = tqdm(val_loader, desc=f'Validation Epoch {epoch+1}')
            for batch in val_pbar:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                total_val_loss += loss.item()
                val_pbar.set_postfix({'val_loss': loss.item()})

        avg_val_loss = total_val_loss / len(val_loader)
        current_lr = scheduler.get_last_lr()[0]

        # Update training history
        history.update(avg_train_loss, avg_val_loss, current_lr)

        # Log metrics
        wandb.log({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'learning_rate': current_lr
        })

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_val_loss': best_val_loss,
            }, 'best_model.pt')
            print(f'New best model saved with validation loss: {best_val_loss:.4f}')

        # Early stopping check
        early_stopping(avg_val_loss)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            # Plot learning curves before stopping
            history.plot_losses()
            history.plot_lr()
            break

        print(f'Epoch {epoch+1}:')
        print(f'Average training loss: {avg_train_loss:.4f}')
        print(f'Average validation loss: {avg_val_loss:.4f}')
        print(f'Best validation loss: {best_val_loss:.4f}')
        print(f'Current learning rate: {current_lr}')

    # Plot final learning curves
    history.plot_losses()
    history.plot_lr()
    return history

In [None]:
def main():
    # Set seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)

    # RTX 4050 Optimized Parameters
    BATCH_SIZE = 4
    ACCUMULATION_STEPS = 2
    LEARNING_RATE = 3e-5
    MODEL_NAME = "t5-small"  # Upgraded from t5-small

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    if torch.cuda.is_available():
        print(f"GPU: {torch.cuda.get_device_name()}")

    # Load data
    df = pd.read_csv('more_filtered_scopus_data.csv')

    # Split data into train, validation, and test sets (60/20/20)
    train_val_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    train_df, val_df = train_test_split(train_val_df, test_size=0.25, random_state=42)

    print(f"Training samples: {len(train_df)}")
    print(f"Validation samples: {len(val_df)}")
    print(f"Test samples: {len(test_df)}")

    # Initialize model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    model.to(device)

    # Create datasets and dataloaders
    train_dataset = KeywordDataset(train_df['abstract'].tolist(),
                                    train_df['keywords'].tolist(),
                                    tokenizer)
    val_dataset = KeywordDataset(val_df['abstract'].tolist(),
                                val_df['keywords'].tolist(),
                                tokenizer)

    train_loader = DataLoader(train_dataset,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=2,  # Parallel data loading
                            pin_memory=True)  # Faster data transfer to GPU

    val_loader = DataLoader(val_dataset,
                            batch_size=BATCH_SIZE,
                            num_workers=2,
                            pin_memory=True)

    # Optimizer with weight decay
    optimizer = torch.optim.AdamW(model.parameters(),
                                lr=LEARNING_RATE,
                                weight_decay=0.01)  # Added weight decay

    # Learning rate scheduler with warmup
    num_training_steps = len(train_loader) * 10  # 10 epochs
    num_warmup_steps = num_training_steps // 10
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    # Train model
    train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        num_epochs=10,
        device=device,
        accumulation_steps=ACCUMULATION_STEPS,
        fp16=True,
        patience=3  # Early stopping patience
    )

In [None]:
if __name__ == "__main__":
    main()

Using device: cuda
GPU: Tesla T4
Training samples: 9774
Validation samples: 3258
Test samples: 3258


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

GPU memory allocated: 230.81 MB
GPU memory cached: 238.00 MB


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  scaler = GradScaler() if fp16 else None
  with autocast(enabled=fp16):
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Training Epoch 1: 100%|██████████| 2444/2444 [05:03<00:00,  8.07it/s, train_loss=0.735]


GPU memory allocated: 741.71 MB
GPU memory cached: 2422.00 MB


Validation Epoch 1: 100%|██████████| 815/815 [00:53<00:00, 15.30it/s, val_loss=0.484]


New best model saved with validation loss: 0.5440
Epoch 1:
Average training loss: 3.4059
Average validation loss: 0.5440
Best validation loss: 0.5440
Current learning rate: 1.5e-05


Training Epoch 2: 100%|██████████| 2444/2444 [04:57<00:00,  8.22it/s, train_loss=1.07]


GPU memory allocated: 741.71 MB
GPU memory cached: 2422.00 MB


Validation Epoch 2: 100%|██████████| 815/815 [00:53<00:00, 15.23it/s, val_loss=0.433]


New best model saved with validation loss: 0.5032
Epoch 2:
Average training loss: 0.5884
Average validation loss: 0.5032
Best validation loss: 0.5032
Current learning rate: 3e-05


Training Epoch 3: 100%|██████████| 2444/2444 [04:51<00:00,  8.38it/s, train_loss=0.689]


GPU memory allocated: 741.71 MB
GPU memory cached: 2422.00 MB


Validation Epoch 3: 100%|██████████| 815/815 [00:52<00:00, 15.48it/s, val_loss=0.415]


New best model saved with validation loss: 0.4810
Epoch 3:
Average training loss: 0.5449
Average validation loss: 0.4810
Best validation loss: 0.4810
Current learning rate: 2.8333333333333332e-05


Training Epoch 4: 100%|██████████| 2444/2444 [04:51<00:00,  8.39it/s, train_loss=0.338]


GPU memory allocated: 741.40 MB
GPU memory cached: 2422.00 MB


Validation Epoch 4: 100%|██████████| 815/815 [00:52<00:00, 15.38it/s, val_loss=0.407]


New best model saved with validation loss: 0.4674
Epoch 4:
Average training loss: 0.5215
Average validation loss: 0.4674
Best validation loss: 0.4674
Current learning rate: 2.6666666666666667e-05


Training Epoch 5: 100%|██████████| 2444/2444 [04:52<00:00,  8.36it/s, train_loss=0.46]


GPU memory allocated: 741.40 MB
GPU memory cached: 2422.00 MB


Validation Epoch 5: 100%|██████████| 815/815 [00:52<00:00, 15.44it/s, val_loss=0.404]


New best model saved with validation loss: 0.4608
Epoch 5:
Average training loss: 0.5041
Average validation loss: 0.4608
Best validation loss: 0.4608
Current learning rate: 2.5e-05


Training Epoch 6: 100%|██████████| 2444/2444 [04:50<00:00,  8.41it/s, train_loss=0.864]


GPU memory allocated: 741.71 MB
GPU memory cached: 2422.00 MB


Validation Epoch 6: 100%|██████████| 815/815 [00:52<00:00, 15.42it/s, val_loss=0.397]


New best model saved with validation loss: 0.4519
Epoch 6:
Average training loss: 0.4911
Average validation loss: 0.4519
Best validation loss: 0.4519
Current learning rate: 2.3333333333333336e-05


Training Epoch 7: 100%|██████████| 2444/2444 [04:52<00:00,  8.34it/s, train_loss=0.929]


GPU memory allocated: 741.71 MB
GPU memory cached: 2422.00 MB


Validation Epoch 7: 100%|██████████| 815/815 [00:52<00:00, 15.41it/s, val_loss=0.402]


New best model saved with validation loss: 0.4471
Epoch 7:
Average training loss: 0.4808
Average validation loss: 0.4471
Best validation loss: 0.4471
Current learning rate: 2.1666666666666667e-05


Training Epoch 8: 100%|██████████| 2444/2444 [04:53<00:00,  8.31it/s, train_loss=0.476]


GPU memory allocated: 741.71 MB
GPU memory cached: 2422.00 MB


Validation Epoch 8: 100%|██████████| 815/815 [00:52<00:00, 15.44it/s, val_loss=0.399]


New best model saved with validation loss: 0.4423
Epoch 8:
Average training loss: 0.4722
Average validation loss: 0.4423
Best validation loss: 0.4423
Current learning rate: 1.9999999999999998e-05


Training Epoch 9: 100%|██████████| 2444/2444 [04:51<00:00,  8.38it/s, train_loss=0.504]


GPU memory allocated: 741.71 MB
GPU memory cached: 2422.00 MB


Validation Epoch 9: 100%|██████████| 815/815 [00:53<00:00, 15.30it/s, val_loss=0.407]


New best model saved with validation loss: 0.4395
Epoch 9:
Average training loss: 0.4654
Average validation loss: 0.4395
Best validation loss: 0.4395
Current learning rate: 1.8333333333333336e-05


Training Epoch 10: 100%|██████████| 2444/2444 [04:52<00:00,  8.35it/s, train_loss=0.529]


GPU memory allocated: 741.40 MB
GPU memory cached: 2422.00 MB


Validation Epoch 10: 100%|██████████| 815/815 [00:53<00:00, 15.33it/s, val_loss=0.396]


New best model saved with validation loss: 0.4363
Epoch 10:
Average training loss: 0.4582
Average validation loss: 0.4363
Best validation loss: 0.4363
Current learning rate: 1.6666666666666667e-05


In [24]:
!pip install nltk rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=ec23ba774fb90264b8476012af8c0a9a2b97386c2d0e925153d138d0bb6f1dc1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import pandas as pd
from tqdm import tqdm
import numpy as np

def load_trained_model(checkpoint_path: str, device: str = 'cuda'):
    """Load the trained model and tokenizer with the correct model size"""
    try:
        # Initialize the model and tokenizer with t5-small instead of t5-base
        model_name = "t5-small"  # Changed from t5-base to t5-small
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Load the checkpoint
        checkpoint = torch.load(checkpoint_path, map_location=device)

        # Load state dict
        if 'model_state_dict' in checkpoint:
            model.load_state_dict(checkpoint['model_state_dict'])
        else:
            raise KeyError("model_state_dict not found in checkpoint")

        model.to(device)
        model.eval()
        return model, tokenizer

    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

def extract_keywords(text: str, model, tokenizer, device: str = 'cuda'):
    """Extract keywords from a given text"""
    try:
        # Prepare input
        input_text = f"Generate keywords: {text}"
        inputs = tokenizer(
            input_text,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(device)

        # Generate keywords
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=128,
                num_beams=4,
                no_repeat_ngram_size=2
            )

        # Decode the generated keywords
        keywords = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return keywords

    except Exception as e:
        print(f"Error in keyword extraction: {str(e)}")
        return ""

def evaluate_model(model, tokenizer, test_data: pd.DataFrame, device: str = 'cuda'):
    """Evaluate the model and generate metrics"""
    true_keywords_list = []
    pred_keywords_list = []

    print("Generating predictions...")
    for _, row in tqdm(test_data.iterrows(), total=len(test_data)):
        # Get true keywords
        true_keywords = set(row['keyword'].lower().split(','))
        true_keywords_list.append(true_keywords)

        # Get predicted keywords
        pred_keywords = extract_keywords(row['abstract'], model, tokenizer, device)
        pred_keywords = set(pred_keywords.lower().split(','))
        pred_keywords_list.append(pred_keywords)

    # Calculate metrics
    precision_list = []
    recall_list = []
    f1_list = []

    for true_kw, pred_kw in zip(true_keywords_list, pred_keywords_list):
        # Calculate intersection of predicted and true keywords
        correct_predictions = len(true_kw.intersection(pred_kw))

        # Calculate precision, recall, and F1 score
        precision = correct_predictions / len(pred_kw) if pred_kw else 0
        recall = correct_predictions / len(true_kw) if true_kw else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    # Calculate average metrics
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)

    return {
        'Precision': avg_precision,
        'Recall': avg_recall,
        'F1-score': avg_f1,
        'Number of samples': len(test_data)
    }

def main():
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    try:
        # Load model and tokenizer
        print("Loading model...")
        model_path = 'best_model.pt'  # Make sure this path is correct
        model, tokenizer = load_trained_model(model_path, device)

        # Load your test data
        test_data = pd.read_csv('scrape_output.csv')  # Replace with your test data path

        # Evaluate model
        print("\nEvaluating model...")
        report = evaluate_model(model, tokenizer, test_data, device)

        # Print classification report
        print("\nClassification Report:")
        print("-" * 50)
        for metric, value in report.items():
            if metric != 'Number of samples':
                print(f"{metric:15} {value:.4f}")
            else:
                print(f"{metric:15} {value}")
        print("-" * 50)

        # Test with a sample text
        print("\nTesting with sample text...")
        sample_text = "Online recruitment platforms typically employ Person-Job Fit models in the core service that automatically match suitable job seekers with appropriate job positions. While existing works leverage historical or contextual information, they often disregard a crucial aspect: job seekers’ social relationships in professional networks. This paper emphasizes the importance of incorporating professional networks into the Person-Job Fit model. Our innovative approach consists of two stages: (1) defining a Workplace Heterogeneous Information Network (WHIN) to capture heterogeneous knowledge, including professional connections and pre-training representations of various entities using a heterogeneous graph neural network; (2) designing a Contextual Social Attention Graph Neural Network (CSAGNN) that supplements users’ missing information with professional connections’ contextual information. We introduce a job-specific attention mechanism in CSAGNN to handle noisy professional networks, leveraging pre-trained entity representations from WHIN. We demonstrate the effectiveness of our approach through experimental evaluations conducted across three real-world recruitment datasets from LinkedIn, showing superior performance compared to baseline models."
        keywords = extract_keywords(sample_text, model, tokenizer, device)
        print(f"Sample text: {sample_text}")
        print(f"Generated keywords: {keywords}")

    except Exception as e:
        print(f"An error occurred in main: {str(e)}")
        raise

if __name__ == "__main__":
    main()

Using device: cuda
Loading model...


  checkpoint = torch.load(checkpoint_path, map_location=device)



Evaluating model...
Generating predictions...


100%|██████████| 271/271 [01:53<00:00,  2.38it/s]



Classification Report:
--------------------------------------------------
Precision       0.0037
Recall          0.0009
F1-score        0.0015
Number of samples 271
--------------------------------------------------

Testing with sample text...
Sample text: Online recruitment platforms typically employ Person-Job Fit models in the core service that automatically match suitable job seekers with appropriate job positions. While existing works leverage historical or contextual information, they often disregard a crucial aspect: job seekers’ social relationships in professional networks. This paper emphasizes the importance of incorporating professional networks into the Person-Job Fit model. Our innovative approach consists of two stages: (1) defining a Workplace Heterogeneous Information Network (WHIN) to capture heterogeneous knowledge, including professional connections and pre-training representations of various entities using a heterogeneous graph neural network; (2) designing a Con