In [None]:
# Mount Google Drive (Colab only) so the dataset files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### Loading the dataset (from Google Drive)

In [None]:
import json

# Paths to the JSON files stored on the mounted Google Drive.
train_data_path = '/content/drive/MyDrive/Text_Mining/Datasets/train_wiki.json'
# NOTE: this path points at the validation file (val_wiki.json); the variable name is kept as-is.
test_data_path = '/content/drive/MyDrive/Text_Mining/Datasets/val_wiki.json'
wiki_pid_path = '/content/drive/MyDrive/Text_Mining/Datasets/pid2name.json'

# JSON is UTF-8 by specification, so state the encoding explicitly instead of
# relying on the platform default.
with open(train_data_path, 'r', encoding='utf-8') as file:
    train_data = json.load(file)
with open(test_data_path, 'r', encoding='utf-8') as file:
    val_data = json.load(file)
with open(wiki_pid_path, 'r', encoding='utf-8') as file:
    wiki_pid = json.load(file)

# Print compact summaries instead of dumping the complete datasets into the
# notebook output (which bloats the file and hides the narrative).
print(f"train_data: {len(train_data)} relations, "
      f"{sum(len(samples) for samples in train_data.values())} samples")
print(f"val_data: {len(val_data)} top-level entries")
print(f"wiki_pid: {len(wiki_pid)} relation descriptions")


In [None]:
# Inspect the first training sample of one relation.
# The relation id is kept in a single variable so the printed id and the
# looked-up relation name can never disagree (the original looked up 'P740'
# while labelling the line as 'P101').
relation_id = 'P101'
sample = train_data[relation_id][0]
print("Sentence:", " ".join(sample["tokens"]))
print("Head Entity:", sample["h"])
print("Tail Entity:", sample["t"])
print("Relation:", relation_id, "=", wiki_pid[relation_id][0])

Sentence: It was first described by German botanist Conrad Moench .
Head Entity: ['conrad moench', 'Q60948', [[7, 8]]]
Tail Entity: ['botanist', 'Q441', [[6]]]
Relation: P101 = location of formation


In [None]:
# Show the raw sample dict: 'tokens' plus the head ('h') and tail ('t') entity entries.
print(sample)

{'tokens': ['It', 'was', 'first', 'described', 'by', 'German', 'botanist', 'Conrad', 'Moench', '.'], 'h': ['conrad moench', 'Q60948', [[7, 8]]], 't': ['botanist', 'Q441', [[6]]]}


#### Training

In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Loading dataset
def load_fewrel_data1(file_path):
    """Read a FewRel-style JSON file and flatten it into parallel lists.

    The file maps each relation id to a list of samples; every sample carries
    a "tokens" list. Each sample becomes one space-joined sentence, labelled
    with the relation id it was listed under.

    Args:
        file_path: path to the JSON file.

    Returns:
        (sentences, labels): two lists of equal length.
    """
    with open(file_path, "r") as handle:
        relation_to_samples = json.load(handle)

    # One (sentence, relation) pair per sample, in file order.
    pairs = [
        (" ".join(item["tokens"]), relation_id)
        for relation_id, items in relation_to_samples.items()
        for item in items
    ]
    sentences = [text for text, _ in pairs]
    labels = [relation_id for _, relation_id in pairs]
    return sentences, labels

# Split the dataset
# Load the plain (unmarked) sentences and hold out 20% as a test split,
# stratified by relation label with a fixed random_state.
sentences,labels = load_fewrel_data1(train_data_path)
train_sentences,test_sentences,train_labels,test_labels = train_test_split(sentences,labels,test_size=0.2,stratify=labels,random_state=42)

# Calculate Tf-Idf
# The vocabulary is fitted on the training split only; the test split is
# transformed with that fitted vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_sentences)
X_test = vectorizer.transform(test_sentences)

# SVM training
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train, train_labels)

# Evaluation
# Per-relation precision/recall/F1 on the held-out split.
y_pred = svm_model.predict(X_test)
print("Classification Report:\n", classification_report(test_labels, y_pred))

# Inference
def infer(sentence):
    """Predict the relation id for one raw sentence with the baseline model.

    Uses the module-level `vectorizer` and `svm_model` fitted in this cell.
    """
    features = vectorizer.transform([sentence])
    return svm_model.predict(features)[0]

# Testing
# Predict the relation for one hand-written sentence and look the predicted
# id up in wiki_pid to show its entry.
test_sentence = "Google was founded in California."
predicted_relation = infer(test_sentence)
print(f"Predicted Relation: {predicted_relation}, {predicted_relation} = {wiki_pid[predicted_relation]}")

Classification Report:
               precision    recall  f1-score   support

       P1001       0.47      0.52      0.49       140
        P101       0.46      0.48      0.47       140
        P102       0.59      0.69      0.64       140
        P105       0.89      0.97      0.93       140
        P106       0.52      0.54      0.53       140
        P118       0.60      0.73      0.66       140
        P123       0.44      0.51      0.47       140
        P127       0.32      0.32      0.32       140
       P1303       0.76      0.71      0.74       140
        P131       0.32      0.23      0.27       140
       P1344       0.67      0.59      0.63       140
       P1346       0.54      0.69      0.60       140
        P135       0.57      0.64      0.60       140
        P136       0.45      0.38      0.41       140
        P137       0.58      0.62      0.60       140
        P140       0.73      0.70      0.71       140
       P1408       0.86      0.81      0.84       140
   

## Enhancement

In this section, I perform entity marking: the head and tail entities of each sentence are wrapped in `[E1]…[/E1]` and `[E2]…[/E2]` markers so that the model focuses on the entity context. This way, the model does not need to infer the entity locations on its own and can focus directly on the semantic link between the two entities.

In [None]:
from scipy.sparse import hstack
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

def entity_marking(sample):
    """Return the sample's sentence with head and tail entities wrapped in markers.

    The head entity is wrapped as ``[E1]...[/E1]`` and the tail entity as
    ``[E2]...[/E2]``; the tokens are then joined with single spaces.

    Example input (FewRel format):
        {'tokens': ['Merpati', 'flight', '106', 'departed', 'Jakarta', '(', 'CGK', ')',
                    'on', 'a', 'domestic', 'flight', 'to', 'Tanjung', 'Pandan', '(', 'TJQ', ')', '.'],
         'h': ['tjq', 'Q1331049', [[16]]],
         't': ['tanjung pandan', 'Q3056359', [[13, 14]]]}

    Args:
        sample: dict with "tokens" (list of str) and "h"/"t" entries of the
            form ``[name, id, [[token indices]]]``. Only the first position
            list of each entity is used.

    Returns:
        The marked sentence as a single string.

    The input ``sample`` is left unmodified, so calling this function twice on
    the same sample yields the same result instead of nesting the markers.
    """
    # Work on a copy: the original aliased sample["tokens"] and mutated it.
    tokens = list(sample["tokens"])

    head_span = sample["h"][2][0]
    tail_span = sample["t"][2][0]
    # For a single-token entity, start and end are the same index.
    h_start, h_end = head_span[0], head_span[-1]
    t_start, t_end = tail_span[0], tail_span[-1]

    # Add entity markers around the first and last token of each entity.
    tokens[h_start] = "[E1]" + tokens[h_start]
    tokens[h_end] += "[/E1]"
    tokens[t_start] = "[E2]" + tokens[t_start]
    tokens[t_end] += "[/E2]"
    return " ".join(tokens)

def load_fewrel_data2(file_path):
    """Read a FewRel-style JSON file and return entity-marked sentences with labels.

    Every sample is turned into a marked sentence by entity_marking() and
    labelled with the relation id it was listed under.

    Args:
        file_path: path to the JSON file.

    Returns:
        (sentences, labels): two lists of equal length.
    """
    with open(file_path, "r") as handle:
        relation_to_samples = json.load(handle)

    marked_pairs = [
        (entity_marking(item), relation_id)
        for relation_id, items in relation_to_samples.items()
        for item in items
    ]
    sentences = [text for text, _ in marked_pairs]
    labels = [relation_id for _, relation_id in marked_pairs]
    return sentences, labels


def enrich_text(text):
    """Annotate every spaCy token of `text` with its POS tag and dependency label.

    Each token is rendered as ``<text>_<pos>_<dep>`` and the pieces are joined
    with single spaces.
    """
    return " ".join(
        f"{tok.text}_{tok.pos_}_{tok.dep_}" for tok in nlp(text)
    )

def ProcessSentenceForPre(text):
    """
    Convert a plain sentence into a FewRel-style sample for inference.

    spaCy tokenizes the sentence and runs NER; the first two detected
    entities become the head and the tail entity respectively.

    The returned "tokens" are the raw tokens WITHOUT entity markers, matching
    the structure of the dataset samples. The markers are added afterwards by
    entity_marking(); marking here as well made the inference path wrap every
    entity twice ("[E1][E1]Google[/E1][/E1]"), unlike the training sentences.

    Args:
        text: the raw sentence.

    Returns:
        A dict ``{"tokens": [...], "h": [text, label, [[start, end]]],
        "t": [text, label, [[start, end]]]}`` with inclusive token indices,
        or an explanatory string when fewer than two entities are detected.
    """
    doc = nlp(text)
    entities = doc.ents

    if len(entities) < 2:
        return "Not enough entities detected for relation extraction."
    head, tail = entities[0], entities[1]

    # Span.end is exclusive; the sample format stores the last token index inclusively.
    return {
        "tokens": [token.text for token in doc],
        "h": [head.text, head.label_, [[head.start, head.end - 1]]],  # Head entity
        "t": [tail.text, tail.label_, [[tail.start, tail.end - 1]]],  # Tail entity
    }

# Example
# Convert one raw sentence and print the resulting FewRel-style dict.
test_sentence = "Google was founded in California."
fewrel_format = ProcessSentenceForPre(test_sentence)
print("FewRel-Formatted Output:\n", fewrel_format)

FewRel-Formatted Output:
 {'tokens': ['[E1]Google[/E1]', 'was', 'founded', 'in', '[E2]California[/E2]', '.'], 'h': ['Google', 'ORG', [[0, 0]]], 't': ['California', 'GPE', [[4, 4]]]}


In [None]:
# Try enrich_text on an already entity-marked sentence.
# NOTE: in the output below spaCy splits the bracket markers into separate
# tokens (e.g. "[" and "E2]Tanjung"), so the markers do not survive as units.
s = "Merpati flight 106 departed Jakarta ( CGK ) on a domestic flight to [E2]Tanjung Pandan[/E2] ( [E1]TJQ[/E1] ) ."
print(enrich_text(s))

Merpati_PROPN_compound flight_NOUN_nsubj 106_NUM_nummod departed_VERB_ROOT Jakarta_PROPN_npadvmod (_PUNCT_punct CGK_PROPN_appos )_PUNCT_punct on_ADP_prep a_DET_det domestic_ADJ_amod flight_NOUN_pobj to_ADP_prep [_PUNCT_dep E2]Tanjung_PROPN_compound Pandan[/E2_PROPN_pobj ]_PUNCT_punct (_PUNCT_punct [_X_nmod E1]TJQ[/E1_NOUN_ROOT ]_PUNCT_punct )_PUNCT_punct ._PUNCT_punct


In [None]:
# Split the dataset
# Entity-marked sentences, split 80/20 and stratified by relation label
# (fixed random_state for a repeatable split).
sentences,labels = load_fewrel_data2(train_data_path)
# print(sentences)
# Optional linguistic enrichment, currently disabled:
# sentences = [enrich_text(s) for s in sentences]
train_sentences,test_sentences,train_labels,test_labels = train_test_split(sentences,labels,test_size=0.2,stratify=labels,random_state=42)

In [None]:
# TF-IDF
# Compared with the baseline vectorizer: larger vocabulary, n-grams up to
# length 3, English stop-word removal and sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),  # Capture word combinations of size 1-3
    stop_words='english',
    sublinear_tf=True    # Use 1+log(tf) instead of raw term frequency
)

# Fit the vocabulary on the training split only, then reuse it for the test split.
X_train = vectorizer.fit_transform(train_sentences)
X_test = vectorizer.transform(test_sentences)

# SVM training
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train, train_labels)

# Hyper-parameter search, currently disabled:
# param_grid = {
#     'C': [0.1, 1, 10, 100],
#     'gamma': ['scale', 'auto', 0.01, 0.1],
#     'kernel': ['rbf', 'linear']
# }

# grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='f1_macro')
# grid_search.fit(X_train, train_labels)
# print("Best parameters:", grid_search.best_params_)
# svm_model = grid_search.best_estimator_

In [None]:
# Evaluation, print the classification performance on training set and testing set
# NOTE(review): the first heading says "Training accuracy" but what is printed
# is the full classification report for the training split.
y_pred = svm_model.predict(X_test)
print("Training accuracy:\n",classification_report(train_labels,svm_model.predict(X_train)))
print("Classification Report:\n", classification_report(test_labels, y_pred))

Training accuracy:
               precision    recall  f1-score   support

       P1001       0.90      0.90      0.90       560
        P101       0.90      0.93      0.91       560
        P102       0.95      0.96      0.95       560
        P105       1.00      0.99      0.99       560
        P106       0.96      0.93      0.95       560
        P118       0.97      0.98      0.97       560
        P123       0.90      0.91      0.91       560
        P127       0.85      0.80      0.83       560
       P1303       0.99      0.97      0.98       560
        P131       0.89      0.80      0.84       560
       P1344       0.95      0.98      0.96       560
       P1346       0.90      0.94      0.92       560
        P135       0.94      0.95      0.95       560
        P136       0.98      0.90      0.94       560
        P137       0.90      0.91      0.90       560
        P140       0.96      0.95      0.96       560
       P1408       0.95      0.98      0.96       560
       

In [None]:
import joblib
# Save the model
# Both the classifier and the fitted vectorizer are persisted: inference needs
# the same vocabulary the model was trained with.
joblib.dump(svm_model, 'svm_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
# Reload the persisted classifier and vectorizer from the working directory.
# NOTE: joblib files are pickle-based; only load files you created yourself.
loaded_svm_model = joblib.load('svm_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [None]:
# Inference
def infer(sentence,model,vectorizer):
    """Predict the relation id between the first two entities of a raw sentence.

    Args:
        sentence: the raw input sentence.
        model: fitted classifier exposing ``predict``.
        vectorizer: fitted vectorizer exposing ``transform``.

    Returns:
        The predicted relation id.

    Raises:
        ValueError: if ProcessSentenceForPre could not build a sample
            (it returns an explanatory string when fewer than two entities
            are detected).
    """
    processed = ProcessSentenceForPre(sentence)
    # Fail with the helper's own message instead of an opaque TypeError
    # when entity_marking tries to index a string.
    if not isinstance(processed, dict):
        raise ValueError(processed)
    marked_sentence = entity_marking(processed)
    sentence_vector = vectorizer.transform([marked_sentence])
    return model.predict(sentence_vector)[0]

In [None]:
# Reload the persisted artifacts and run inference on one hand-written sentence;
# the predicted id is looked up in wiki_pid to show its entry.
loaded_svm_model = joblib.load('svm_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')
test_sentence = "Barack Obama was born in Honolulu, Hawaii."
predicted_relation = infer(test_sentence,loaded_svm_model,loaded_vectorizer)
print(f"Predicted Relation: {predicted_relation}, {predicted_relation} = {wiki_pid[predicted_relation]}")

Predicted Relation: P551, P551 = ['residence', 'the place where the person is or has been, resident']


### SVM implemented by deep learning
In this section, I try a different way of implementing the `SVC` model, using the deep learning framework `PyTorch`. The advantage of implementing a support vector classifier in PyTorch is that it combines the SVM objective with the flexibility of deep learning tooling. This idea was inspired by ChatGPT and DeepSeek.

Code reference:

https://github.com/kazuto1011/svm-pytorch

https://github.com/USHAHANE/T/blob/185bd7e5bf02b2b44b7e4e2fb35349eab4bd7d20/SVM.txt#L39

In [None]:
import json
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset, random_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import rbf_kernel

In [None]:
print(torch.cuda.is_available())

False


In [None]:
# Split the dataset
# Entity-marked sentences, split 80/20 and stratified by relation label
# (fixed random_state for a repeatable split).
sentences,labels = load_fewrel_data2(train_data_path)
# Optional linguistic enrichment, currently disabled:
# sentences = [enrich_text(s) for s in sentences]
train_sentences,test_sentences,train_labels,test_labels = train_test_split(sentences,labels,test_size=0.2,stratify=labels,random_state=42)

In [None]:
# TF-IDF
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),  # Capture word combinations of size 1-3
    stop_words='english',
    sublinear_tf=True    # Use 1+log(tf) instead of raw term frequency
)

# The sparse TF-IDF matrices are converted to dense float32 arrays because
# they are turned into torch tensors in a later cell.
X_train = vectorizer.fit_transform(train_sentences).toarray().astype(np.float32)
X_test = vectorizer.transform(test_sentences).toarray().astype(np.float32)

In [None]:
# Convert data to PyTorch tensors
# Sort the relation ids before enumerating them: iterating a bare set of
# strings depends on hash randomization, so the label -> index mapping (and
# therefore the class numbers in the reports) would change between sessions.
label_to_idx = {label: idx for idx, label in enumerate(sorted(set(labels)))}
y_train = torch.tensor([label_to_idx[label] for label in train_labels], dtype=torch.long)
y_test = torch.tensor([label_to_idx[label] for label in test_labels], dtype=torch.long)

In [None]:
# Create DataLoader
# Features and integer labels are paired per sample; only the training loader
# shuffles, the test loader keeps the original order.
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), y_train)
test_dataset = TensorDataset(torch.tensor(X_test, dtype=torch.float32), y_test)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
# Define the architecture of SVC
class SVCDL(nn.Module):
    """Linear classifier: a single fully connected layer producing one score per class.

    Args:
        input_dim: number of input features.
        num_classes: number of output scores.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, x):
        """Return the raw class scores for the batch `x`."""
        return self.fc1(x)

In [None]:
# Training
# Seed torch's global RNG so that the weight initialisation and the shuffling
# of train_loader are repeatable across runs (42 matches the random_state
# used for the train/test split).
torch.manual_seed(42)

num_labels = len(set(labels))
model = SVCDL(X_train.shape[1], num_labels)
# SGD with momentum; weight_decay adds an L2 penalty on the parameters.
optimizer = optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.001)
# Multi-class margin (hinge) loss with margin 1.
criterion = nn.MultiMarginLoss(margin=1.0, p=1)
model.train()
epochs = 100
for epoch in range(epochs):
    total_loss = 0.0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report the mean batch loss every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 10, Loss: 0.6977
Epoch 20, Loss: 0.5853
Epoch 30, Loss: 0.5429
Epoch 40, Loss: 0.5253
Epoch 50, Loss: 0.5171
Epoch 60, Loss: 0.5131
Epoch 70, Loss: 0.5111
Epoch 80, Loss: 0.5099
Epoch 90, Loss: 0.5093
Epoch 100, Loss: 0.5090


In [None]:
# Print the classification performance on the training set and the testing set
def _collect_predictions(loader):
    """Run `model` over every batch of `loader`.

    Returns:
        (true_labels, predicted_labels): two lists of class indices.
    """
    expected, predicted = [], []
    model.eval()
    with torch.no_grad():
        for batch_x, batch_y in loader:
            scores = model(batch_x)
            # The predicted class is the one with the highest score.
            predicted.extend(torch.argmax(scores, dim=1).cpu().numpy())
            expected.extend(batch_y.cpu().numpy())
    return expected, predicted


all_labels, all_preds = _collect_predictions(train_loader)
print("Classification report on training set\n")
print(classification_report(all_labels, all_preds))

all_labels, all_preds = _collect_predictions(test_loader)
print("Classification report on testing set\n")
print(classification_report(all_labels, all_preds))

Classification report on training set

              precision    recall  f1-score   support

           0       0.47      0.29      0.36       560
           1       0.70      0.73      0.72       560
           2       0.80      0.61      0.69       560
           3       0.76      0.70      0.72       560
           4       0.87      0.71      0.78       560
           5       1.00      0.84      0.91       560
           6       0.93      0.60      0.73       560
           7       0.52      0.70      0.60       560
           8       0.59      0.52      0.55       560
           9       0.65      0.70      0.67       560
          10       0.42      0.41      0.41       560
          11       0.47      0.65      0.54       560
          12       0.81      0.49      0.61       560
          13       0.68      0.52      0.59       560
          14       0.65      0.65      0.65       560
          15       0.80      0.51      0.62       560
          16       0.68      0.75      0.7

In [None]:
# Inference
def infer(sentence):
    '''
    Infer the relation between the first two entities detected in a sentence.

    The sentence is converted to a FewRel-style sample, entity-marked,
    vectorized with the module-level `vectorizer` and scored by `model`.

    Returns:
        The relation id whose class score is highest.

    Raises:
        ValueError: if ProcessSentenceForPre could not build a sample
            (it returns an explanatory string when fewer than two entities
            are detected).
    '''
    processed = ProcessSentenceForPre(sentence)
    # Fail with the helper's own message instead of an opaque TypeError
    # when entity_marking tries to index a string.
    if not isinstance(processed, dict):
        raise ValueError(processed)
    marked_sentence = entity_marking(processed)
    sentence_vector = vectorizer.transform([marked_sentence]).toarray().astype(np.float32)
    sentence_tensor = torch.tensor(sentence_vector, dtype=torch.float32)
    model.eval()
    with torch.no_grad():
        scores = model(sentence_tensor)
        # Softmax is monotonic, so the arg-max of the raw scores is the same
        # class; the previously computed "confidence" was never used.
        predicted_idx = torch.argmax(scores, dim=1).item()
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}
    return idx_to_label[predicted_idx]

In [None]:
# Testing
# Predict the relation for one hand-written sentence; wiki_pid.get falls back
# to 'Unknown' when the predicted id has no entry.
test_sentence = "Google was founded in California."
predicted_relation = infer(test_sentence)
print(f"Predicted Relation: {predicted_relation}, {predicted_relation} = {wiki_pid.get(predicted_relation, 'Unknown')}")

Predicted Relation: P159, P159 = ['headquarters location', 'specific location where an organization\'s headquarters is or has been situated. Inverse property of "occupant" (P466).']
