In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import numpy as np

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Load SenticNet Dataset
def load_senticnet(file_path):
    senticnet_df = pd.read_csv(file_path, sep=",")
    concept_dict = {}

    for _, row in senticnet_df.iterrows():
        concept = row['CONCEPT']
        primary_emotion = row['PRIMARY EMOTION']
        polarity_value = row['POLARITY VALUE']
        semantics = row['SEMANTICS'].split() if pd.notna(row['SEMANTICS']) else []
        concept_dict[concept] = {
            'primary_emotion': primary_emotion,
            'polarity_value': polarity_value,
            'semantics': semantics[:3]  # Top 3 related concepts
        }
    return concept_dict

# Inject Knowledge into Sentence
def inject_knowledge(sentence, senticnet):
    """
    Inject knowledge from SenticNet into a sentence and create a structured sentence tree.
    """
    words = sentence.split()
    sentence_tree = []

    for word in words:
        # Fetch knowledge for the word
        knowledge = senticnet.get(word, {})
        if knowledge:
            # Create a node with the word’s own emotion and polarity
            node = {
                'word': word,
                'primary_emotion': knowledge.get('primary_emotion'),
                'polarity_value': knowledge.get('polarity_value'),
                'semantics': []
            }
            # Attach up to three related semantic concepts, each with its own emotion/polarity
            for sem_word in knowledge.get('semantics', []):
                sem_know = senticnet.get(sem_word, {})
                node['semantics'].append({
                    'word': sem_word,
                    'primary_emotion': sem_know.get('primary_emotion'),
                    'polarity_value': sem_know.get('polarity_value'),
                })
        else:
            # If no SenticNet entry exists, fill in with None values
            node = {
                'word': word,
                'primary_emotion': None,
                'polarity_value': None,
                'semantics': []
            }

        sentence_tree.append(node)
    return sentence_tree


In [4]:
#for removing noise

def assign_soft_positions_and_reorder(sentence_tree):
    tokens = []
    soft_positions = []
    position = 0

    # Step 1: Add main words
    main_tokens = []
    main_positions = []

    # Step 2: Collect semantics separately
    semantics_tokens = []
    semantics_positions = []

    for node in sentence_tree:
        # Add main word
        main_tokens.append(node['word'])
        main_positions.append(position)

        # Collect semantics (related words)
        for semantic in node['semantics']:
            semantics_tokens.append(semantic['word'])
            semantics_positions.append(position)

        position += 1

    # Step 3: Reorder by appending semantics after main tokens
    tokens = main_tokens + semantics_tokens
    soft_positions = main_positions + semantics_positions

    return tokens, soft_positions



#Visible Matrix
def create_visible_matrix_from_tree(sentence_tree):
    total_tokens = sum(1 + len(node['semantics']) for node in sentence_tree)
    visible_matrix = torch.zeros((total_tokens, total_tokens))

    idx = 0
    for i, node in enumerate(sentence_tree):
        visible_matrix[idx, idx] = 1  # Main word visible to itself
        child_start_idx = idx + 1

        # Main word visible to its semantics
        for j, semantic in enumerate(node['semantics']):
            visible_matrix[idx, child_start_idx + j] = 1
            visible_matrix[child_start_idx + j, idx] = 1

        idx += 1 + len(node['semantics'])

    return visible_matrix


def pad_visible_matrix(visible_matrix, max_length):
    """Pad visible matrix to max_length."""
    padded_matrix = torch.zeros((max_length, max_length))
    current_length = visible_matrix.size(0)
    padded_matrix[:current_length, :current_length] = visible_matrix
    return padded_matrix


In [5]:
class K_BERTDataset(Dataset):
    def __init__(self, data, tokenizer, senticnet, max_length=128):
        self.data = data  # Accept DataFrame directly
        self.tokenizer = tokenizer
        self.senticnet = senticnet
        self.max_length = max_length

        # Create a label mapping
        unique_labels = sorted(self.data['label'].unique())
        self.label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
        self.data['label'] = self.data['label'].map(self.label_mapping)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data.iloc[idx]['content']
        label = self.data.iloc[idx]['label']

        # Inject SenticNet knowledge
        sentence_tree = inject_knowledge(sentence, self.senticnet)
        
        tokens, soft_positions = assign_soft_positions_and_reorder(sentence_tree)
        soft_positions = torch.tensor(soft_positions[:self.max_length], dtype=torch.long)
            
        # Tokenize inputs
        inputs = self.tokenizer(
            tokens,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            is_split_into_words=True
        )

        # Create visible matrix
        visible_matrix = create_visible_matrix_from_tree(sentence_tree)

        # Pad visible matrix to match max_length
        padded_visible_matrix = pad_visible_matrix(visible_matrix, self.max_length)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'visible_matrix': padded_visible_matrix,
            'soft_positions': soft_positions,
            'label': torch.tensor(label, dtype=torch.long)
        }


In [6]:
def expand_visible_matrix(visible_matrix, num_heads):
    batch_size, seq_len, seq_len_2 = visible_matrix.size()
    expanded_matrix = visible_matrix.unsqueeze(1).repeat(1, num_heads, 1, 1)
    return expanded_matrix.view(batch_size * num_heads, seq_len, seq_len)

class MaskedSelfAttention(nn.Module):
    def __init__(self, hidden_size, num_attention_heads):
        super().__init__()
        self.num_attention_heads = num_attention_heads
        self.attention = nn.MultiheadAttention(hidden_size, num_attention_heads)
    
    def forward(self, inputs, attention_mask, visible_matrix):
        batch_size, seq_len, hidden_size = inputs.size()

        # Incorporate attention_mask into visible_matrix:
        # attention_mask: [batch_size, seq_len], 1 for real tokens, 0 for padded
        # Make broadcastable
        valid_tokens = attention_mask.unsqueeze(-1) * attention_mask.unsqueeze(1)
        combined_matrix = visible_matrix * valid_tokens

        # Expand for multi-head
        combined_matrix = expand_visible_matrix(combined_matrix, self.num_attention_heads)

        # Create attn_mask with a large negative number instead of -inf
        attn_mask = torch.zeros_like(combined_matrix, dtype=torch.float)
        attn_mask[combined_matrix == 0] = -1e9  # large negative number

        # inputs: [batch_size, seq_len, hidden_size] -> [seq_len, batch_size, hidden_size]
        inputs = inputs.permute(1, 0, 2)

        # Apply multi-head attention
        # IMPORTANT: multihead attention expects attn_mask to be [L, S] or [N*num_heads, L, S].
        # We have [N*num_heads, seq_len, seq_len], which should be correct since we repeated for heads.
        outputs, _ = self.attention(inputs, inputs, inputs, attn_mask=attn_mask)
        
        # [seq_len, batch_size, hidden_size] -> [batch_size, seq_len, hidden_size]
        outputs = outputs.permute(1, 0, 2)
        return outputs


In [7]:
class K_RoBERTa(nn.Module):
    def __init__(self, transformer, hidden_size, num_attention_heads, num_labels, max_length=128):
        super().__init__()
        self.transformer = transformer  # Use RobertaForSequenceClassification
        self.masked_attention = MaskedSelfAttention(hidden_size, num_attention_heads)
        self.classifier = nn.Linear(hidden_size, num_labels)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.soft_position_embeddings = nn.Embedding(max_length, hidden_size)

    def forward(self, input_ids, attention_mask, visible_matrix, soft_positions):
        outputs = self.transformer(input_ids, attention_mask=attention_mask)
        # print("After transformer:", outputs.logits if hasattr(outputs, 'logits') else outputs.hidden_states[-1])

        hidden_states = outputs.hidden_states[-1]
        # print("hidden_states check NaN:", torch.isnan(hidden_states).any())
        soft_position_embeds = self.soft_position_embeddings(soft_positions)

        normalized_hidden_states = self.layer_norm(hidden_states)
        # print("normalized hidden_states check NaN:", torch.isnan(normalized_hidden_states).any())

        masked_outputs = self.masked_attention(normalized_hidden_states, attention_mask, visible_matrix)
        # print("masked_outputs check NaN:", torch.isnan(masked_outputs).any())

        pooled_output = masked_outputs.mean(dim=1)
        # print("pooled_output check NaN:", torch.isnan(pooled_output).any())

        logits = self.classifier(pooled_output)
        # print("classifier logits check NaN:", torch.isnan(logits).any())
        
        return logits

In [8]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    visible_matrices = [item['visible_matrix'] for item in batch]
    soft_positions = [item['soft_positions'] for item in batch]
    labels = [item['label'] for item in batch]

    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    visible_matrices = pad_sequence(visible_matrices, batch_first=True, padding_value=0)
    soft_positions = pad_sequence(soft_positions, batch_first=True, padding_value=0)
    labels = torch.stack(labels)  # stack them into a single tensor

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'visible_matrix': visible_matrices,
        'soft_positions': soft_positions,
        'labels': labels
    }

In [9]:
import os 

root_path = "cleaned_full_isear_data_with_sentiment.csv"
df = pd.read_csv(os.path.join(root_path))

In [10]:
unique_labels = sorted(df['label'].unique())
label_mapping = {original_label: idx for idx, original_label in enumerate(unique_labels)}
df['label'] = df['label'].map(label_mapping)

# Print to verify the mapping
print("Label mapping:", label_mapping)
print("Mapped labels:", df['label'].unique())

num_labels = len(unique_labels)
print(num_labels)

Label mapping: {2: 0, 11: 1, 14: 2, 17: 3, 25: 4, 28: 5, 29: 6}
Mapped labels: [3 2 0 4 1 5 6]
7


In [11]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# 1. Load the SenticNet Data
senticnet_path = "senticnet/senticnet.csv"
senticnet_data = load_senticnet(senticnet_path)

# 2. Load and Split the Dataset
data_path = "cleaned_full_isear_data_with_sentiment.csv"
full_data = pd.read_csv(data_path)

# Split into train, validation, and test sets
train_data, test_data = train_test_split(full_data, test_size=0.2, random_state=42)
train_data, dev_data = train_test_split(train_data, test_size=0.1, random_state=42)

print(f"Train size: {len(train_data)}, Validation size: {len(dev_data)}, Test size: {len(test_data)}")

#Tokenizer
pretrained_model_path = "results/checkpoint-3250"
tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_path)
transformer = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_path, num_labels=num_labels, output_hidden_states=True
)

#Create K_BERTDataset Instances
train_dataset = K_BERTDataset(train_data, tokenizer, senticnet_data)
dev_dataset = K_BERTDataset(dev_data, tokenizer, senticnet_data)
test_dataset = K_BERTDataset(test_data, tokenizer, senticnet_data)

#Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

num_attention_heads = 12  # Default number of attention heads for Roberta


# Initialize the model
model = K_RoBERTa(
    transformer = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_path, num_labels=num_labels, output_hidden_states=True
),
    num_attention_heads=num_attention_heads,
    num_labels=num_labels,
    hidden_size = transformer.config.hidden_size
)

#Optimizer and Loss Function
optimizer = AdamW(
    model.parameters(),
    lr=3e-7,  # Reduced learning rate for stability
    weight_decay=1e-4,
    eps=1e-8
)

criterion = nn.CrossEntropyLoss()

#Training Mode
model.train()


Train size: 5334, Validation size: 593, Test size: 1482




K_RoBERTa(
  (transformer): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-11): 12 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense)

In [12]:
model = model.to(device)

In [13]:
#torch.autograd.set_detect_anomaly(True)

for epoch in range(50):
    total_loss = 0
    model.train()
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        visible_matrix = batch['visible_matrix'].to(device)
        soft_positions = batch['soft_positions'].to(device)
        labels = batch['labels'].to(device)

#         if torch.isnan(input_ids).any() or torch.isnan(visible_matrix).any():
#             print("NaN in inputs before forward pass")
        
        optimizer.zero_grad()
        
        logits = model(input_ids, attention_mask, visible_matrix, soft_positions)
        
#         if torch.isnan(logits).any():
#             print("NaN detected in logits before loss computation")
        
        loss = criterion(logits, labels)

        # Gradient clipping to avoid parameter explosion
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1} Average Loss: {avg_loss}")


Training Epoch 1: 100%|██████████| 334/334 [00:12<00:00, 26.05it/s]


Epoch 1 Average Loss: 1.742286432645992


Training Epoch 2: 100%|██████████| 334/334 [00:12<00:00, 27.34it/s]


Epoch 2 Average Loss: 1.3846035565801724


Training Epoch 3: 100%|██████████| 334/334 [00:12<00:00, 27.32it/s]


Epoch 3 Average Loss: 1.1133766085087895


Training Epoch 4: 100%|██████████| 334/334 [00:12<00:00, 27.30it/s]


Epoch 4 Average Loss: 0.9318083389076645


Training Epoch 5: 100%|██████████| 334/334 [00:12<00:00, 27.38it/s]


Epoch 5 Average Loss: 0.816101782603892


Training Epoch 6: 100%|██████████| 334/334 [00:12<00:00, 27.30it/s]


Epoch 6 Average Loss: 0.741339543800868


Training Epoch 7: 100%|██████████| 334/334 [00:12<00:00, 27.34it/s]


Epoch 7 Average Loss: 0.6913469489106161


Training Epoch 8: 100%|██████████| 334/334 [00:12<00:00, 27.34it/s]


Epoch 8 Average Loss: 0.6466747961358396


Training Epoch 9: 100%|██████████| 334/334 [00:12<00:00, 27.29it/s]


Epoch 9 Average Loss: 0.6112514674485087


Training Epoch 10: 100%|██████████| 334/334 [00:12<00:00, 27.31it/s]


Epoch 10 Average Loss: 0.5946847325313591


Training Epoch 11: 100%|██████████| 334/334 [00:12<00:00, 27.21it/s]


Epoch 11 Average Loss: 0.5662804308854891


Training Epoch 12: 100%|██████████| 334/334 [00:12<00:00, 27.37it/s]


Epoch 12 Average Loss: 0.5387972978551587


Training Epoch 13: 100%|██████████| 334/334 [00:12<00:00, 27.25it/s]


Epoch 13 Average Loss: 0.5312360629305511


Training Epoch 14: 100%|██████████| 334/334 [00:12<00:00, 27.29it/s]


Epoch 14 Average Loss: 0.5186238767590351


Training Epoch 15: 100%|██████████| 334/334 [00:12<00:00, 26.95it/s]


Epoch 15 Average Loss: 0.5037548653558342


Training Epoch 16: 100%|██████████| 334/334 [00:12<00:00, 27.26it/s]


Epoch 16 Average Loss: 0.49079915609009966


Training Epoch 17: 100%|██████████| 334/334 [00:12<00:00, 27.40it/s]


Epoch 17 Average Loss: 0.48228908624656186


Training Epoch 18: 100%|██████████| 334/334 [00:12<00:00, 27.34it/s]


Epoch 18 Average Loss: 0.4600169605012247


Training Epoch 19: 100%|██████████| 334/334 [00:12<00:00, 27.36it/s]


Epoch 19 Average Loss: 0.46345898573716243


Training Epoch 20: 100%|██████████| 334/334 [00:12<00:00, 27.37it/s]


Epoch 20 Average Loss: 0.44164441151176387


Training Epoch 21: 100%|██████████| 334/334 [00:12<00:00, 27.25it/s]


Epoch 21 Average Loss: 0.4351989479211276


Training Epoch 22: 100%|██████████| 334/334 [00:12<00:00, 27.35it/s]


Epoch 22 Average Loss: 0.42569702150980515


Training Epoch 23: 100%|██████████| 334/334 [00:12<00:00, 27.27it/s]


Epoch 23 Average Loss: 0.4174324171673395


Training Epoch 24: 100%|██████████| 334/334 [00:12<00:00, 27.35it/s]


Epoch 24 Average Loss: 0.41034732544850444


Training Epoch 25: 100%|██████████| 334/334 [00:12<00:00, 27.36it/s]


Epoch 25 Average Loss: 0.4030585873002064


Training Epoch 26: 100%|██████████| 334/334 [00:12<00:00, 27.27it/s]


Epoch 26 Average Loss: 0.3900498673877495


Training Epoch 27: 100%|██████████| 334/334 [00:12<00:00, 27.35it/s]


Epoch 27 Average Loss: 0.380885520816117


Training Epoch 28: 100%|██████████| 334/334 [00:12<00:00, 27.24it/s]


Epoch 28 Average Loss: 0.3775050260215819


Training Epoch 29: 100%|██████████| 334/334 [00:12<00:00, 27.31it/s]


Epoch 29 Average Loss: 0.3646011140509815


Training Epoch 30: 100%|██████████| 334/334 [00:12<00:00, 27.31it/s]


Epoch 30 Average Loss: 0.36227137453764857


Training Epoch 31: 100%|██████████| 334/334 [00:12<00:00, 27.23it/s]


Epoch 31 Average Loss: 0.3662828447486826


Training Epoch 32: 100%|██████████| 334/334 [00:12<00:00, 27.33it/s]


Epoch 32 Average Loss: 0.34460380800559137


Training Epoch 33: 100%|██████████| 334/334 [00:12<00:00, 27.22it/s]


Epoch 33 Average Loss: 0.3422045748859287


Training Epoch 34: 100%|██████████| 334/334 [00:12<00:00, 27.20it/s]


Epoch 34 Average Loss: 0.34626887467673084


Training Epoch 35: 100%|██████████| 334/334 [00:12<00:00, 27.05it/s]


Epoch 35 Average Loss: 0.3296638093554153


Training Epoch 36: 100%|██████████| 334/334 [00:12<00:00, 27.27it/s]


Epoch 36 Average Loss: 0.31420751386126894


Training Epoch 37: 100%|██████████| 334/334 [00:12<00:00, 27.34it/s]


Epoch 37 Average Loss: 0.31880813682016856


Training Epoch 38: 100%|██████████| 334/334 [00:12<00:00, 27.23it/s]


Epoch 38 Average Loss: 0.31266475636735114


Training Epoch 39: 100%|██████████| 334/334 [00:12<00:00, 27.31it/s]


Epoch 39 Average Loss: 0.30393786936953754


Training Epoch 40: 100%|██████████| 334/334 [00:12<00:00, 27.22it/s]


Epoch 40 Average Loss: 0.3062571483123267


Training Epoch 41: 100%|██████████| 334/334 [00:12<00:00, 27.29it/s]


Epoch 41 Average Loss: 0.29052861909309546


Training Epoch 42: 100%|██████████| 334/334 [00:12<00:00, 27.35it/s]


Epoch 42 Average Loss: 0.2945202805525707


Training Epoch 43: 100%|██████████| 334/334 [00:12<00:00, 27.23it/s]


Epoch 43 Average Loss: 0.291099814675793


Training Epoch 44: 100%|██████████| 334/334 [00:12<00:00, 27.30it/s]


Epoch 44 Average Loss: 0.28643781445853544


Training Epoch 45: 100%|██████████| 334/334 [00:12<00:00, 27.22it/s]


Epoch 45 Average Loss: 0.27569911700344374


Training Epoch 46: 100%|██████████| 334/334 [00:12<00:00, 27.31it/s]


Epoch 46 Average Loss: 0.2709143208674417


Training Epoch 47: 100%|██████████| 334/334 [00:12<00:00, 27.14it/s]


Epoch 47 Average Loss: 0.2608676276666051


Training Epoch 48: 100%|██████████| 334/334 [00:12<00:00, 26.97it/s]


Epoch 48 Average Loss: 0.25702353410177425


Training Epoch 49: 100%|██████████| 334/334 [00:12<00:00, 27.14it/s]


Epoch 49 Average Loss: 0.25774772021874875


Training Epoch 50: 100%|██████████| 334/334 [00:12<00:00, 27.15it/s]

Epoch 50 Average Loss: 0.2617207909571732





In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, dataloader):
    model.eval()
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            visible_matrix = batch['visible_matrix'].to(device)
            soft_positions = batch['soft_positions'].to(device)
            labels = batch['labels'].to(device)
            
            
            logits = model(input_ids, attention_mask, visible_matrix, soft_positions)

            preds = torch.argmax(logits, dim=1)

            all_labels.extend(labels.tolist())
            all_preds.extend(preds.tolist())
      

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted') 
    recall = recall_score(all_labels, all_preds, average='weighted')        
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return accuracy, precision, recall, f1

In [15]:
kbert_accuracy, kbert_precision, kbert_recall, kbert_f1 = evaluate_model(model, test_loader)
print(f"K-BERT with SenticNet Accuracy: {kbert_accuracy * 100:.2f}%")
print(f"K-BERT Mean Precision: {kbert_precision * 100:.2f}%")
print(f"K-BERT Mean Recall: {kbert_recall * 100:.2f}%")
print(f"K-BERT Mean F1 Score: {kbert_f1 * 100:.2f}%")

Evaluating: 100%|██████████| 93/93 [00:01<00:00, 59.21batch/s]

K-BERT with SenticNet Accuracy: 88.06%
K-BERT Mean Precision: 88.09%
K-BERT Mean Recall: 88.06%
K-BERT Mean F1 Score: 88.06%



