In [1]:
# Mount Google Drive inside the Colab VM so that files under
# /content/drive/... can be opened like local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install torch transformers scikit-learn

In [2]:
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertModel, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

def load_data(filename):
    """
    Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        filename (str): Path of the JSON file to read.

    Returns:
        The Python object produced by parsing the file with ``json.load``.
    """
    with open(filename, encoding="utf-8") as handle:
        return json.load(handle)

def preprocess_data(data):
    """
    Preprocess the relation extraction dataset by formatting entity mentions
    with the textual markers [E1], [/E1], [E2], [/E2].

    The input is never modified: markers are inserted into a copy of each
    instance's token list, so calling this function repeatedly on the same
    data always yields the same result.

    Args:
        data (dict): A dictionary where keys are relation labels and values
                     are lists of instances. Each instance is a mapping with
                     a "tokens" list and "h" / "t" entries, each of which
                     unpacks into three items whose last item is a nested
                     list of token indices.

    Returns:
        list: A list of (sentence, relation) tuples, where the sentence is
              the marked tokens joined by single spaces.
    """
    samples = []
    for relation, instances in data.items():
        for instance in instances:
            # Copy the tokens: writing markers into instance["tokens"] itself
            # would corrupt the caller's data and stack markers on re-runs.
            tokens = list(instance["tokens"])
            _, _, h_pos = instance["h"]  # Head entity: only the positions are needed
            _, _, t_pos = instance["t"]  # Tail entity: only the positions are needed

            # The opening marker goes before the first index of the first
            # position list, the closing marker after the last index of the
            # last position list.
            tokens[h_pos[0][0]] = "[E1] " + tokens[h_pos[0][0]]
            tokens[h_pos[-1][-1]] = tokens[h_pos[-1][-1]] + " [/E1]"
            tokens[t_pos[0][0]] = "[E2] " + tokens[t_pos[0][0]]
            tokens[t_pos[-1][-1]] = tokens[t_pos[-1][-1]] + " [/E2]"

            samples.append((" ".join(tokens), relation))

    return samples


In [3]:
class REDataset(Dataset):
    """
    Map-style PyTorch dataset that turns (sentence, relation) pairs into
    model-ready tensors on demand.
    """

    def __init__(self, data, tokenizer, label_encoder, max_len=128):
        """
        Store the samples and the objects needed to encode them.

        Args:
            data (list): Sequence of (sentence, relation) tuples.
            tokenizer: Callable tokenizer used to encode each sentence.
            label_encoder: Object whose ``transform`` maps relation labels to integer IDs.
            max_len (int, optional): Length every sequence is padded/truncated to. Default is 128.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.label_encoder = label_encoder
        self.max_len = max_len

    def __len__(self):
        """Number of (sentence, relation) samples held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """
        Encode and return the sample stored at position ``idx``.

        Args:
            idx (int): Index of the sample.

        Returns:
            tuple: (input_ids, attention_mask, label_id) as PyTorch tensors.
        """
        text, relation = self.data[idx]

        # Pad/truncate to a fixed length so samples can be stacked into batches
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        # The tokenizer returns a leading batch dimension of size 1; drop it
        ids = encoded["input_ids"].squeeze(0)
        mask = encoded["attention_mask"].squeeze(0)

        # transform() works on sequences, so wrap the label and unwrap the result
        encoded_label = self.label_encoder.transform([relation])[0]
        target = torch.tensor(encoded_label, dtype=torch.long)

        return ids, mask, target

# BERT relation extraction model
class BERTRE(nn.Module):
    """
    Relation classifier: a pre-trained "bert-base-uncased" encoder followed by
    dropout and a single linear layer applied to the first-token representation.
    """

    def __init__(self, num_labels):
        """
        Args:
            num_labels (int): Number of output classes of the linear classifier.
        """
        super().__init__()

        # Pre-trained encoder
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # Regularisation applied to the sentence representation
        self.dropout = nn.Dropout(0.3)
        # Maps the encoder's hidden size to one logit per relation
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """
        Compute relation logits for a batch.

        Args:
            input_ids: Token ID tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            Tensor of logits produced by the classifier layer.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis holds the [CLS] token representation
        first_token_state = encoded.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(first_token_state))

In [4]:
def train(model, dataloader, optimizer, loss_fn, device):
    """
    Trains the model for one epoch.

    Accuracy is computed over all samples seen in the epoch (correct / total),
    so a smaller final batch does not get the same weight as a full batch.
    The loss is the mean of the per-batch loss values.

    Args:
        model: The neural network model; called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer for updating model weights.
        loss_fn: Loss function called as ``loss_fn(logits, labels)``.
        device: The device (CPU/GPU) where computations will run.

    Returns:
        Tuple of (average loss, accuracy) for the epoch, both Python floats.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0   # counted manually so the dataloader does not need __len__
    num_correct = 0
    num_samples = 0

    # Iterate over batches in the dataloader
    for input_ids, attention_mask, labels in dataloader:
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()  # Clear previous gradients
        logits = model(input_ids, attention_mask)  # Forward pass: get model predictions
        loss = loss_fn(logits, labels)
        loss.backward()  # Backpropagation: compute gradients
        optimizer.step()  # Update model parameters

        preds = torch.argmax(logits, dim=1)  # Get predicted labels (highest logit)

        total_loss += loss.item()
        num_batches += 1
        num_correct += (preds == labels).sum().item()
        num_samples += labels.size(0)

    if num_batches == 0 or num_samples == 0:
        raise ValueError("train() received an empty dataloader; nothing to train on")

    return total_loss / num_batches, num_correct / num_samples

In [5]:
def evaluate(model, dataloader, device):
    """
    Run the model over a dataloader in evaluation mode and score its predictions.

    Args:
        model: The neural network model; called as ``model(input_ids, attention_mask)``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, labels)`` batches.
        device: The device (CPU/GPU) where computations will run.

    Returns:
        Tuple of (accuracy, classification report) computed over every sample
        seen, as returned by ``accuracy_score`` and ``classification_report``.
    """
    model.eval()
    predictions = []
    targets = []

    # No gradients are needed for scoring
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # The predicted class is the index of the largest logit
            batch_logits = model(input_ids, attention_mask)
            batch_preds = batch_logits.argmax(dim=1)

            # Accumulate plain Python ints for the metric functions
            predictions += batch_preds.cpu().tolist()
            targets += labels.cpu().tolist()

    accuracy = accuracy_score(targets, predictions)
    report = classification_report(targets, predictions)
    return accuracy, report

In [6]:
# Prefer the GPU when CUDA is available, otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the raw training file and turn it into (sentence, relation) samples
train_data = load_data("/content/drive/MyDrive/textmining/train_wiki.json")
samples = preprocess_data(train_data)

# Collect the distinct relation labels and fit the label encoder on them
relations = list({relation for _, relation in samples})
label_encoder = LabelEncoder().fit(relations)

# Tokenizer matching the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:

# Create dataset
dataset = REDataset(samples, tokenizer, label_encoder)

# Split dataset into 80% training and 20% validation.
# The split is driven by a seeded generator so that re-running the notebook
# produces the same train/validation partition (and comparable metrics).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize BERT-based relation extraction model
model = BERTRE(num_labels=len(relations)).to(device)

# Define optimizer and loss function.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimisation itself is unchanged.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(3):
    loss, acc = train(model, train_loader, optimizer, loss_fn, device)
    print(f"Epoch {epoch+1}: Loss={loss:.4f}, Acc={acc:.4f}")

# Evaluation on the held-out validation split (this is not a separate test set)
acc, report = evaluate(model, val_loader, device)
print(f"Validation Accuracy: {acc:.4f}")
print(report)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Epoch 1: Loss=1.0234, Acc=0.7475
Epoch 2: Loss=0.3308, Acc=0.9064
Epoch 3: Loss=0.2064, Acc=0.9415
Test Accuracy: 0.8905
              precision    recall  f1-score   support

           0       0.95      0.83      0.88       168
           1       0.90      0.85      0.88       151
           2       0.99      0.99      0.99       141
           3       1.00      1.00      1.00       138
           4       0.94      0.89      0.91       135
           5       0.98      0.99      0.98       128
           6       0.76      0.82      0.79       154
           7       0.65      0.56      0.60       144
           8       0.98      1.00      0.99       142
           9       0.79      0.82      0.80       130
          10       0.99      0.95      0.97       129
          11       0.80      0.86      0.83       142
          12       0.98      0.95      0.96       144
          13       0.95      0.93      0.94       149
          14       0.84      0.74      0.78       167
          15  

In [8]:
# torch.save(model, "/content/drive/MyDrive/textmining/relation_extraction_model.pth")

In [9]:
# model=torch.load('/content/drive/MyDrive/textmining/relation_extraction_model.pth')

In [10]:
def infer(model, tokenizer, label_encoder, sentence, h_pos, t_pos, device, max_len=128):
    """
    Predict the relation between two marked entities in a single sentence.

    The sentence is split on whitespace, the entity spans are wrapped in
    [E1] ... [/E1] and [E2] ... [/E2] markers, and the marked sentence is
    encoded and classified by the model.

    Args:
        model: Trained relation extraction model; called as ``model(input_ids, attention_mask)``.
        tokenizer: Tokenizer used to encode the marked sentence.
        label_encoder: Label encoder used for decoding the predicted class index.
        sentence (str): The input sentence containing two entities.
        h_pos (tuple): (start, end) indices of the head entity, both inclusive,
                       into ``sentence.split()`` (whitespace tokens, so
                       punctuation stays attached to its word).
        t_pos (tuple): (start, end) indices of the tail entity, same convention.
        device: The device (CPU/GPU) to run the model on.
        max_len (int, optional): Length the encoded sequence is padded/truncated
                       to. Should match the value used for training. Default is 128.

    Returns:
        str: Predicted relation label.
    """
    model.eval()
    tokens = sentence.split()

    # Add entity markers
    tokens[h_pos[0]] = "[E1] " + tokens[h_pos[0]]
    tokens[h_pos[1]] = tokens[h_pos[1]] + " [/E1]"
    tokens[t_pos[0]] = "[E2] " + tokens[t_pos[0]]
    tokens[t_pos[1]] = tokens[t_pos[1]] + " [/E2]"
    processed_sentence = " ".join(tokens)

    # Tokenize and encode sentence (tensors keep their batch dimension of 1)
    encoding = tokenizer(processed_sentence, truncation=True, padding="max_length", max_length=max_len, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        pred_label_id = torch.argmax(logits, dim=1).cpu().item()

    # inverse_transform works on sequences, so wrap the ID and unwrap the result
    predicted_relation = label_encoder.inverse_transform([pred_label_id])[0]
    return predicted_relation


In [11]:
# Lookup table loaded from pid2name.json; it is indexed below with the
# predicted relation label (Wikidata PID, name and description for each relation).
pid2name = load_data("/content/drive/MyDrive/textmining/pid2name.json")

# Example usage.
# Positions are inclusive (start, end) indices into sentence.split(), i.e.
# whitespace tokens: 0="Barack", 1="Obama", ..., 5="Honolulu," (the trailing
# comma stays attached to the token), 6="Hawaii."
sentence = "Barack Obama was born in Honolulu, Hawaii."
h_pos = (0, 1)  # "Barack Obama"
t_pos = (5, 5)  # "Honolulu," (whitespace token, comma included)

predicted_relation = infer(model, tokenizer, label_encoder, sentence, h_pos, t_pos, device)
print(f"Predicted Relation: {pid2name[predicted_relation]}")

Predicted Relation: ['residence', 'the place where the person is or has been, resident']
