# **1. Import Necessary Libraries**

In [5]:
import pandas as pd
import numpy as np
import spacy
import json
import torch
import joblib
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import FunctionTransformer,MaxAbsScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from transformers import BertTokenizer, BertModel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# **2. Loading and Data Preprocessing of FewRel Dataset**
This section retrieves and preprocesses the FewRel dataset, a benchmark for few-shot RE, chosen for its public availability, size (11,200 instances across 80 relations), and annotation reliability. 

The dataset is sourced by cloning the FewRel GitHub repository (`!git clone`), navigating to its directory (`%cd FewRel`), and loading `train_wiki.json` (8,400 instances) and `val_wiki.json` (2,800 instances) into memory as JSON dictionaries. Each instance contains tokenized sentences, head/tail entity details (names and token indices), and relation labels (e.g., "P276"). 

The `process_fewrel_json` function transforms these into a pandas DataFrame, extracting text, relations, entities, and indices (`h_seq`, `t_seq`). An adaptation is made by concatenating the training and validation sets into `df_merged` (11,200 instances), maximizing data for supervised learning—a departure from typical train/validation splits to leverage FewRel’s full corpus. The spaCy `en_core_web_sm` model is initialized for subsequent linguistic processing, setting the stage for feature extraction.

In [6]:
# Clone FewRel repository from GitHub if not already present
!git clone https://github.com/thunlp/FewRel
# Change working directory to FewRel for file access
%cd FewRel

# Load training set (8,400 instances) from JSON
with open('./data/train_wiki.json', 'r') as file:
    fewrel_train = json.load(file)

# Load validation set (2,800 instances) from JSON
with open('./data/val_wiki.json', 'r') as file:
    fewrel_val = json.load(file)

def process_fewrel_json(fewrel_json):
    """Convert FewRel JSON data into a structured DataFrame."""
    texts, relations, heads, tails, h_seq, t_seq = [], [], [], [], [], []  # Initialize lists for DataFrame columns

    for relation, instances in fewrel_json.items(): # Iterate over relation types
        for instance in instances: # Process each instance within a relation
            texts.append(' '.join(instance['tokens'])) # Join tokens into a full sentence
            relations.append(relation) # Store relation ID (e.g., "P276")
            heads.append(instance['h'][0]) # Extract head entity name
            tails.append(instance['t'][0]) # Extract tail entity name
            h_seq.append(instance['h'][2][0]) # Store head entity token indices
            t_seq.append(instance['t'][2][0]) # Store tail entity token indices
    # Return DataFrame with extracted features
    return pd.DataFrame({
        'text': texts,
        'relation': relations,
        'head': heads,
        'tail': tails,
        'h_seq': h_seq,
        't_seq': t_seq
    })

# Process train and validation datasets into DataFrames
df_train = process_fewrel_json(fewrel_train)
df_val = process_fewrel_json(fewrel_val)

# Merge train and val sets (11,200 instances) to maximize supervised learning data
df_merged = pd.concat([df_train, df_val], axis=0).reset_index(drop=True)

# Initialize spaCy model for linguistic feature extraction
nlp = spacy.load('en_core_web_sm')

fatal: destination path 'FewRel' already exists and is not an empty directory.
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


c:\Users\wathi\OneDrive\Desktop\Text Mining\Assignment\FewRel


# **3. Feature Extraction**
This section defines functions to extract a hybrid feature set, a key enhancement over traditional SVM-based RE, combining linguistic and semantic features from FewRel instances. 

The `extract_spacy_features` function uses spaCy to derive Named Entity Recognition (NER) labels (e.g., "PERSON"), Part-of-Speech (POS) tags (e.g., "NOUN"), and dependency relations (e.g., "nsubj") for head and tail entity spans, identified by `h_seq` and `t_seq`. These features capture structural context critical for relation classification.

The `calculate_distance` function computes the token distance between entities, handling cases where entities are adjacent (distance = 0) or separated, encoding spatial relationships (e.g., for "location" relations). 
 
The `get_avg_vector` function generates semantic representations by averaging 100-dimensional GloVe embeddings (`glove-wiki-gigaword-100`) over entity tokens, with zero vectors for out-of-vocabulary words. 
 
This trio of functions forms the foundation of a custom feature extractor, blending traditional linguistic analysis with modern embeddings to improve SVM performance on FewRel’s diverse relations.

In [7]:
def extract_spacy_features(text, h_seq, t_seq):
    """Extract NER, POS, and dependency features for head and tail entities using spaCy."""
    
    doc = nlp(text) # Process text with spaCy pipeline
    
    # Extract NER labels for head and tail spans (empty if no entities detected)
    head_ner = [ent.label_ for ent in doc[h_seq[0]:h_seq[-1]+1].ents]
    tail_ner = [ent.label_ for ent in doc[t_seq[0]:t_seq[-1]+1].ents]
    
    # Extract POS tags for head and tail spans
    head_pos = [token.pos_ for token in doc[h_seq[0]:h_seq[-1]+1]]
    tail_pos = [token.pos_ for token in doc[t_seq[0]:t_seq[-1]+1]]
    
    # Extract dependency relations for head and tail spans
    head_dep = [token.dep_ for token in doc[h_seq[0]:h_seq[-1]+1]]
    tail_dep = [token.dep_ for token in doc[t_seq[0]:t_seq[-1]+1]]
    
    return head_ner, tail_ner, head_pos, tail_pos, head_dep, tail_dep

def calculate_distance(h_seq, t_seq):
    """Compute token distance between head and tail entities."""
    
    if h_seq and t_seq: # Ensure both sequences are non-empty
        head_end = max(h_seq) # End of head
        tail_start = min(t_seq) # Start of tail
        head_start = min(h_seq) # Start of head
        tail_end = max(t_seq) # End of tail
        
        # Calculate distance based on entity positions
        if head_end < tail_start:
            distance = tail_start - head_end - 1 # Tokens between head and tail
        elif tail_end < head_start:
            distance = head_start - tail_end - 1 # Tokens between tail and head
        else:
            distance = 0 # Entities overlap or are adjacent

    return distance


def get_avg_vector(phrase, word_vectors):
    """Generate average GloVe embedding for a phrase."""
    
    words = phrase.split() # Split phrase into tokens
    # Fetch GloVe vectors, use zero vector for out-of-vocabulary words
    vectors = [word_vectors[word] if word in word_vectors else np.zeros(word_vectors.vector_size) for word in words]
    
    # Return mean vector or zero vector if no valid embeddings
    return np.mean(vectors, axis=0) if vectors else np.zeros(word_vectors.vector_size)

# **4. Model Pipeline and Training**
This section constructs and trains the SVM-based RE model using a scikit-learn `Pipeline`, integrating feature extraction, scaling, and classification.

Pre-trained GloVe embeddings (`glove-wiki-gigaword-100`) are loaded to support semantic feature extraction. The `extract_features` function combines spaCy-derived features (NER/POS/dependency counts, distance) with GloVe embeddings (200D concatenated for head/tail), wrapped in a `CustomFeatureExtractor` transformer.

Another novel adaptation, the `BertEmbedder` transformer, leverages `bert-base-uncased` to generate 768D mean-pooled sentence embeddings, enhancing contextual understanding beyond entity-specific features. 

These are unified via a `ColumnTransformer`, producing a 975D feature vector (207D custom + 768D BERT). The pipeline scales features with `MaxAbsScaler` to preserve sparsity and trains an SVM with an RBF kernel (`SVC(kernel='rbf', probability=True)`), chosen for its ability to model non-linear interactions among complex features.

The merged dataset (`df_merged`) is split into 8,960 training and 2,240 testing instances (80-20, stratified by relation), and the pipeline is fitted to the training subset, optimizing for FewRel’s 80 relations.

In [None]:
# Load pre-trained GloVe embeddings (100D) for semantic feature extractio
word_vectors = api.load('glove-wiki-gigaword-100')

def extract_features(df):
    """Extract hybrid feature set combining linguistic and GloVe embeddings."""
    
    features = []
    for _, row in df.iterrows():
        # Extract linguistic features using spaCy
        head_ner, tail_ner, head_pos, tail_pos, head_dep, tail_dep = extract_spacy_features(
            row['text'], row['h_seq'], row['t_seq']
        )        
        
        distance = calculate_distance(row['h_seq'], row['t_seq']) # Compute entity distance
        
        # Generate GloVe embeddings for head and tail entities
        head_vector = get_avg_vector(row['head'], word_vectors)
        tail_vector = get_avg_vector(row['tail'], word_vectors)
        combined_vector = np.concatenate([head_vector, tail_vector]) # Concatenate to 200D
        
        # Combine all features into a 207D vector
        feature_vector = np.concatenate([
            np.array([len(head_ner), len(tail_ner)]),       # NER counts
            np.array([len(head_pos), len(tail_pos)]),       # POS counts
            np.array([len(head_dep), len(tail_dep)]),       # Dependency counts
            np.array([distance]),                           # Distance feature
            combined_vector                                 # Word embeddings (from GloVe)
        ])
        
        features.append(feature_vector)
    
    return np.vstack(features) # Stack into a feature matrix

class CustomFeatureExtractor(BaseEstimator, TransformerMixin):
    """Custom transformer for hybrid feature extraction."""
    
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self # No fitting required

    def transform(self, X):
        return extract_features(X) # Apply feature extraction to DataFrame

# Define a BERT embedder transformer
class BertEmbedder(BaseEstimator, TransformerMixin):
    """Transformer to generate BERT embeddings from text."""
    
    def __init__(self, model_name='bert-base-uncased', device='cuda', pooling='mean'):
        self.model_name = model_name
        self.device = device
        self.pooling = pooling
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval() # Set to evaluation mode

    def fit(self, X, y=None):
        return self # No fitting required

    def transform(self, X):
        embeddings = []
        with torch.no_grad(): # Disable gradient computation for inference
            for text in X:
                # Tokenize and encode text for BERT (max 512 tokens)
                encoded_input = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
                encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
                output = self.model(**encoded_input)
                
                # Apply mean pooling to get 768D sentence embedding
                if self.pooling == 'cls':
                    emb = output.last_hidden_state[:, 0, :]
                elif self.pooling == 'mean':
                    emb = output.last_hidden_state.mean(dim=1)
                else:
                    raise ValueError("Pooling must be either 'cls' or 'mean'")

                embeddings.append(emb.squeeze().cpu().numpy())
        
        return np.vstack(embeddings) # Stack into embedding matrix

# Define pipelines for custom features and BERT embeddings
custom_features_pipeline = Pipeline([
    ('extractor', CustomFeatureExtractor())
])
bert_pipeline = Pipeline([
    ('bert', BertEmbedder(model_name='bert-base-uncased', device='cuda', pooling='mean'))
])

# Combine features using ColumnTransformer (975D total: 207D custom + 768D BERT)
combined_features = ColumnTransformer([
    ('custom', custom_features_pipeline, ['text', 'h_seq', 't_seq', 'head', 'tail']),
    ('bert', bert_pipeline, 'text')
])

# Build final pipeline with scaling and SVM classifier
pipeline = Pipeline([
    ('features', combined_features), # Extract and combine features
    ('scaler', MaxAbsScaler()), # Scale features, preserving sparsity
    ('svm', SVC(kernel='rbf',probability=True, random_state=42)) # Train RBF SVM
])

# Prepare data for training
X = df_merged  
y = df_merged['relation']

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Fit the pipeline to training data
pipeline.fit(X_train, y_train)

In [9]:
# Export trained pipeline to file for reuse
joblib.dump(pipeline, 'svm_re_pipeline.pkl')
print("Model exported to 'svm_re_pipeline.pkl'")

Model exported to 'svm_re_pipeline.pkl'


# **5. Model Evaluation**

This section assesses the trained SVM pipeline’s performance on the test subset (2,240 instances). Predictions are generated using `pipeline.predict(X_test)`, and a `classification_report` computes precision, recall, and F1-score across 80 relations. 

The macro averaged F1-score of 0.78 reflects robust generalization, with per-relation metrics highlighting strengths (e.g., P105: 0.99 F1) and challenges (e.g., P40: 0.19 F1). This evaluation validates the efficacy of the hybrid feature set and RBF kernel, providing a quantitative basis for comparing this approach to the second RE method.

In [10]:
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

       P1001       0.68      0.90      0.78       140
        P101       0.78      0.75      0.77       140
        P102       0.89      0.97      0.93       140
        P105       0.98      1.00      0.99       140
        P106       0.94      0.80      0.86       140
        P118       0.91      0.98      0.94       140
        P123       0.73      0.63      0.67       140
        P127       0.59      0.49      0.53       140
       P1303       0.91      0.97      0.94       140
        P131       0.64      0.61      0.62       140
       P1344       0.94      0.96      0.95       140
       P1346       0.80      0.84      0.82       140
        P135       0.82      0.92      0.87       140
        P136       0.88      0.71      0.79       140
        P137       0.61      0.75      0.68       140
        P140       0.91      0.96      0.93       140
       P1408       0.91      0.99      0.95       140
       P1411       0.95    

# **6. Inference Mode**
This section prepares the inference environment and implements real-time RE by loading the exported model (`svm_re_pipeline.pkl`) with `joblib` and `pid2name.json` for relation metadata.

If the FewRel repository was not cloned in Section 2, Uncomment and execute the next cell to clone the repository (`!git clone`) and navigate to its directory (`%cd FewRel`) to access `pid2name.json`. This file maps relation IDs to names and descriptions, enriching inference outputs. This step is optional if Section 2 was executed, ensuring flexibility for standalone inference runs.

In [None]:
# Clone FewRel repository if not already present (optional if run earlier)
#!git clone https://github.com/thunlp/FewRel

# Navigate to FewRel directory to access pid2name.json
#%cd FewRel

c:\Users\wathi\OneDrive\Desktop\Text Mining\Assignment\FewRel\FewRel


fatal: destination path 'FewRel' already exists and is not an empty directory.
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


Inference mode relies on feature extraction utilities (`extract_spacy_features`, `calculate_distance`, `get_avg_vector`) and transformer classes (`CustomFeatureExtractor`, `BertEmbedder`), originally defined in Section 4. If those cells were not executed, the subsequent following cell must be run to define these components, ensuring the pipeline deserializes correctly.

In [24]:
def extract_spacy_features(text, h_seq, t_seq):
    """Extract NER, POS, and dependency features for head and tail entities using spaCy."""
    
    doc = nlp(text) # Process text with spaCy pipeline
    
    # Extract NER labels for head and tail spans (empty if no entities detected)
    head_ner = [ent.label_ for ent in doc[h_seq[0]:h_seq[-1]+1].ents]
    tail_ner = [ent.label_ for ent in doc[t_seq[0]:t_seq[-1]+1].ents]
    
    # Extract POS tags for head and tail spans
    head_pos = [token.pos_ for token in doc[h_seq[0]:h_seq[-1]+1]]
    tail_pos = [token.pos_ for token in doc[t_seq[0]:t_seq[-1]+1]]
    
    # Extract dependency relations for head and tail spans
    head_dep = [token.dep_ for token in doc[h_seq[0]:h_seq[-1]+1]]
    tail_dep = [token.dep_ for token in doc[t_seq[0]:t_seq[-1]+1]]
    
    return head_ner, tail_ner, head_pos, tail_pos, head_dep, tail_dep

def calculate_distance(h_seq, t_seq):
    """Compute token distance between head and tail entities."""
    
    if h_seq and t_seq: # Ensure both sequences are non-empty
        head_end = max(h_seq) # End of head
        tail_start = min(t_seq) # Start of tail
        head_start = min(h_seq) # Start of head
        tail_end = max(t_seq) # End of tail
        
        # Calculate distance based on entity positions
        if head_end < tail_start:
            distance = tail_start - head_end - 1 # Tokens between head and tail
        elif tail_end < head_start:
            distance = head_start - tail_end - 1 # Tokens between tail and head
        else:
            distance = 0 # Entities overlap or are adjacent

    return distance

def get_avg_vector(phrase, word_vectors):
    """Generate average GloVe embedding for a phrase."""
    
    words = phrase.split() # Split phrase into tokens
    # Fetch GloVe vectors, use zero vector for out-of-vocabulary words
    vectors = [word_vectors[word] if word in word_vectors else np.zeros(word_vectors.vector_size) for word in words]
    
    # Return mean vector or zero vector if no valid embeddings
    return np.mean(vectors, axis=0) if vectors else np.zeros(word_vectors.vector_size)

# Initialize spaCy model (needed for extract_relation)
nlp = spacy.load('en_core_web_sm')

# Load pre-trained GloVe embeddings (100D) for semantic feature extractio
word_vectors = api.load('glove-wiki-gigaword-100')

def extract_features(df):
    """Extract hybrid feature set combining linguistic and GloVe embeddings."""
    
    features = []
    for _, row in df.iterrows():
        # Extract linguistic features using spaCy
        head_ner, tail_ner, head_pos, tail_pos, head_dep, tail_dep = extract_spacy_features(
            row['text'], row['h_seq'], row['t_seq']
        )        
        
        distance = calculate_distance(row['h_seq'], row['t_seq']) # Compute entity distance
        
        # Generate GloVe embeddings for head and tail entities
        head_vector = get_avg_vector(row['head'], word_vectors)
        tail_vector = get_avg_vector(row['tail'], word_vectors)
        combined_vector = np.concatenate([head_vector, tail_vector]) # Concatenate to 200D
        
        # Combine all features into a 207D vector
        feature_vector = np.concatenate([
            np.array([len(head_ner), len(tail_ner)]),       # NER counts
            np.array([len(head_pos), len(tail_pos)]),       # POS counts
            np.array([len(head_dep), len(tail_dep)]),       # Dependency counts
            np.array([distance]),                           # Distance feature
            combined_vector                                 # Word embeddings (from GloVe)
        ])
        
        features.append(feature_vector)
    
    return np.vstack(features) # Stack into a feature matrix

class CustomFeatureExtractor(BaseEstimator, TransformerMixin):
    """Custom transformer for hybrid feature extraction."""
    
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self # No fitting required

    def transform(self, X):
        return extract_features(X) # Apply feature extraction to DataFrame

# Define a BERT embedder transformer
class BertEmbedder(BaseEstimator, TransformerMixin):
    """Transformer to generate BERT embeddings from text."""
    
    def __init__(self, model_name='bert-base-uncased', device='cuda', pooling='mean'):
        self.model_name = model_name
        self.device = device
        self.pooling = pooling
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval() # Set to evaluation mode

    def fit(self, X, y=None):
        return self # No fitting required

    def transform(self, X):
        embeddings = []
        with torch.no_grad(): # Disable gradient computation for inference
            for text in X:
                # Tokenize and encode text for BERT (max 512 tokens)
                encoded_input = self.tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
                encoded_input = {k: v.to(self.device) for k, v in encoded_input.items()}
                output = self.model(**encoded_input)
                
                # Apply mean pooling to get 768D sentence embedding
                if self.pooling == 'cls':
                    emb = output.last_hidden_state[:, 0, :]
                elif self.pooling == 'mean':
                    emb = output.last_hidden_state.mean(dim=1)
                else:
                    raise ValueError("Pooling must be either 'cls' or 'mean'")

                embeddings.append(emb.squeeze().cpu().numpy())
        
        return np.vstack(embeddings) # Stack into embedding matrix

If you have not done so yet, Upload the model file 'svm_re_pipeline.pkl' into the FewRel Directory.

The `extract_relation` function processes a user-supplied sentence and entities, using spaCy to tokenize and locate entity indices (`h_seq`, `t_seq`). It constructs a DataFrame, predicts the relation and probability via the loaded pipeline, and outputs the result with descriptive text.

An example demonstrates this with "London Heathrow Airport serves the greater London area efficiently." (head: "London Heathrow Airport", tail: "London"), correctly predicting "P931" with a probability of 0.48.

In [23]:
# Load the trained SVM model
loaded_pipeline = joblib.load('svm_re_pipeline.pkl')
print("Model loaded from 'svm_re_pipeline.pkl'")

# Load relation metadata for descriptive outputs
with open('./data/pid2name.json', 'r') as file:
    pid2name = json.load(file)
    
def extract_relation(sentence, head, tail):
    """Predict relation between head and tail entities in a sentence."""
    
    doc = nlp(sentence)# Tokenize sentence with spaCy
    tokens = [token.text for token in doc] # Convert to token list
    
    # Find consecutive tokens matching 'head' and 'tail'
    def find_token_indices(entity_text):
        """Locate token indices for an entity in the sentence."""
        
        entity_words = entity_text.lower().split() # Split entity into words
        n = len(entity_words)
        tokens_lower = [t.lower() for t in tokens] # Lowercase tokens
        
        for i in range(len(tokens_lower) - n + 1): # Search for consecutive match
            if tokens_lower[i:i+n] == entity_words:
                return list(range(i, i+n)) # Return token indices
        return [] # Return empty list if not found
    
    # Find token indices for head and tail entities
    h_seq = find_token_indices(head)
    t_seq = find_token_indices(tail)
    
    # If we couldn't find a match for either entity, we can't proceed
    if not h_seq or not t_seq:
        print("Could not locate head or tail entity in the tokenized sentence. Please ensure exact match.")
        return
    
    # Create input DataFrame for pipeline prediction
    df_input = pd.DataFrame({
        'text': [sentence],
        'head': [head],
        'tail': [tail],
        'h_seq': [h_seq],
        't_seq': [t_seq]
    })
    
    # Predict relation and probability
    pred_rel = loaded_pipeline.predict(df_input)[0]
    proba = loaded_pipeline.predict_proba(df_input)[0]
    classes = loaded_pipeline.named_steps['svm'].classes_
    rel_idx = list(classes).index(pred_rel)
    rel_prob = proba[rel_idx]
    
    # Attempt to retrieve relation info from pid2name (which has [name, description])
    if pred_rel in pid2name:
        rel_data = pid2name[pred_rel]  # e.g. ["place served by transport hub", "..."]
        # Unpack the two-element list
        if len(rel_data) == 2:
            rel_name, rel_desc = rel_data
        else:
            # If for some reason the list isn't exactly length 2
            rel_name = rel_data[0] if len(rel_data) > 0 else "Unknown relation name"
            rel_desc = rel_data[1] if len(rel_data) > 1 else "No description available."
    else:
        rel_name = "Unknown relation name"
        rel_desc = "No description available."
    
    # Display prediction results
    print(f"The relation of '{head}' and '{tail}' in the sentence")
    print(f"'{sentence}' is '{pred_rel}'")
    print(f"- Relation name: {rel_name}")
    print(f"- Description: {rel_desc}")
    print(f"- Probability of relation: {rel_prob:.2f}\n")

# Prompt user for input
# Static example for submission
sentence = "London Heathrow Airport serves London daily."
head = "London Heathrow Airport"
tail = "London"

extract_relation(sentence, head, tail)

Model loaded from 'svm_re_pipeline.pkl'
The relation of 'London Heathrow Airport' and 'London' in the sentence
'London Heathrow Airport serves London daily.' is 'P931'
- Relation name: place served by transport hub
- Description: territorial entity or entities served by this transport hub (airport, train station, etc.)
- Probability of relation: 0.54



