## Separating entities according to their type

In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV , train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, make_scorer
import numpy as np
import pandas as pd
from tqdm import tqdm
from copy import deepcopy
import torch
from kge.util.io import load_checkpoint

## Load entities with their embeddings

In [2]:
# Entity/type table; the first line of the CSV is the header row.
entities_df = pd.read_csv(
    'data/entities_types.csv',
    sep=',',
    header=0,
)

# Trained embedding checkpoint (change the path below to evaluate another model).
checkpoint = load_checkpoint('../embeddings/yago3-10-complex.pt')
model = checkpoint["model"]

# The first element of the stored model is turned into a dict, from which the
# entity-embedder weight tensor is read by key and exposed as a numpy array.
embedder_state = dict(model[0])
tensor_embed: torch.Tensor = embedder_state['_entity_embedder._embeddings.weight']
all_arrays = tensor_embed.numpy()

### Load dataset

In [3]:
# Read the train/test/valid splits and stack them into a single triples table
print("Loading triples...")
GLOBAL_PATH = "data/"
NAME_SUBGRAPH = ["train", "test", "valid"]
cols_name = ["head", "relation", "tail"]

# Each split is a headerless, tab-separated file named <split>.txt under GLOBAL_PATH
split_frames = [
    pd.read_csv(GLOBAL_PATH + split + '.txt', sep="\t", names=cols_name)
    for split in NAME_SUBGRAPH
]
triples_df = pd.concat(split_frames, ignore_index=True)

Loading triples...


In [4]:
# Hyperparameter grid for GridSearch: only the regularisation strength C varies,
# the kernel is fixed to RBF.
param_grid = [
    dict(C=[0.1, 1, 2.5, 5, 7.5, 10], kernel=['rbf']),
]

### Utils

In [5]:
def similarity_score(ref, neighbours, df_with_labels, mode: str = "all"):
    """Return the fraction of ``neighbours`` whose labels agree with those of ``ref``.

    Parameters
    ----------
    ref
        Entity name of the reference, looked up in ``df_with_labels['Entity']``.
    neighbours
        Iterable of entity names compared against the reference.
    df_with_labels
        DataFrame with an ``'Entity'`` column and a ``'labels'`` column holding one
        label vector per row. If a name occurs several times, its first row is used.
    mode
        ``"all"``: a neighbour counts when its whole label vector equals the
        reference's. ``"one"``: a neighbour counts when at least one position is
        truthy in both its label vector and the reference's.

    Returns
    -------
    float
        Number of matching neighbours divided by the number of neighbours;
        ``0.0`` when ``neighbours`` is empty.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``"one"`` nor ``"all"``.
    IndexError
        If ``ref`` or one of the neighbours is absent from ``df_with_labels``.
    """
    # Fail fast on a bad mode, before any (costly) lookup is performed
    if mode not in ("one", "all"):
        raise ValueError("This is not a valid mode.")

    neighbour_names = list(neighbours)
    if not neighbour_names:
        # Nothing to compare against: avoid dividing by zero
        return 0.0

    # Scan the full frame only once, keeping just the rows that are needed;
    # the per-name lookups below then run on this small subset.
    wanted = df_with_labels['Entity'].isin(neighbour_names + [ref])
    relevant = df_with_labels[wanted]

    def _labels_of(name):
        # First matching row wins; an unknown name raises IndexError
        return relevant[relevant['Entity'] == name]['labels'].to_list()[0]

    # Determine the label of the reference
    label_ref = _labels_of(ref)

    # Determine labels for all neighbours (one row per neighbour)
    all_neighbours_labels = np.array([_labels_of(name) for name in neighbour_names])

    if mode == "one":
        intersect_labels = np.logical_and(label_ref, all_neighbours_labels)
        # Number of shared truthy positions for each neighbour
        non_zero = np.count_nonzero(intersect_labels, axis=1)
        # Neighbours sharing at least one label with the reference
        score = np.count_nonzero(non_zero)
    else:
        # Neighbours whose label vector is identical to the reference's
        all_each_line = np.all(all_neighbours_labels == label_ref, axis=1)
        score = np.count_nonzero(all_each_line)

    return score / len(neighbour_names)

def closest_neighbours(ref_entity, other_entities, df_with_labels, k, mode: str = "euclidean"):
    """Return the names of the ``k`` entities closest to ``ref_entity``.

    The single best match in ``other_entities`` is always discarded, on the
    assumption that it is the reference itself (``other_entities`` is expected
    to contain ``ref_entity``).

    Parameters
    ----------
    ref_entity
        1-D embedding of the reference entity.
    other_entities
        2-D array of candidate embeddings, one row per candidate.
    df_with_labels
        DataFrame whose ``'Entity'`` column is positionally aligned with the rows
        of ``other_entities``.
    k
        Number of neighbours to return.
    mode
        ``"euclidean"`` (smallest L2 distance) or ``"cosine"`` (highest cosine
        similarity).

    Returns
    -------
    pandas.Series
        Entity names of the selected neighbours, ordered from closest to
        farthest in both modes.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported similarity modes.
    """
    if mode not in ("euclidean", "cosine"):
        raise ValueError("This is not a valid mode of similarity")

    if mode == "euclidean":
        dist_to_ref = np.linalg.norm(other_entities - ref_entity, axis=1)
        # Ascending distances: position 0 is the best match (dropped), then the k next
        k_closest = np.argsort(dist_to_ref)[1:(k + 1)]
    else:
        ref_norm = ref_entity / np.linalg.norm(ref_entity)
        norm_other = np.linalg.norm(other_entities, axis=1)
        normalized_other = other_entities / norm_other.reshape(-1, 1)  # row-wise normalisation
        cosine_sim = normalized_other.dot(ref_norm.T).reshape(-1,)
        # argsort is ascending: keep the k entries just before the last one (the
        # best match, dropped), then reverse so the most similar entity comes
        # first, consistently with the euclidean branch.
        k_closest = np.argsort(cosine_sim)[-(k + 1):-1][::-1]

    return df_with_labels['Entity'].iloc[k_closest]

## Apply SVM on Persons

### Only keep person type

In [6]:
# Keep only the entities whose `type` value contains the person type
entity_type = "person_100007846"

is_person = entities_df["type"].apply(lambda type_value: entity_type in type_value)
person_name = entities_df[is_person]

# Index labels of the kept rows, reused below to slice embeddings and entities
person_index = np.array(person_name.index, dtype=int)

In [7]:
# Work on a private copy of the embeddings and standardise only the person rows;
# the rows of all other entities are left as they are.
scale_person_data = deepcopy(all_arrays)
scale = StandardScaler()
person_rows_scaled = scale.fit_transform(scale_person_data[person_index])
scale_person_data[person_index] = person_rows_scaled

In [8]:
# Person sub-types for which one binary label (and one classifier) is built
all_person_types = [
    "player_110439851",
    "artist_109812338",
    "politician_110450303",
    "scientist_110560637",
    "officeholder_110371450",
    "writer_110794014",
]

In [9]:
# Map each entity name to its type value so the loop below does a keyed lookup
entity_types = entities_df.set_index('Entity')['type']

# One list of booleans per person entity, one boolean per entry of all_person_types
labels_person = []

# Iterate over the names of the person entities
person_entity_names = entities_df['Entity'][person_index]
for entity in tqdm(person_entity_names, total=len(person_entity_names)):
    # NOTE: deliberately not called `entity_type`: that notebook-level name holds
    # the person type selected earlier and must not be overwritten by this loop.
    types_of_entity = entity_types.get(entity, None)
    if types_of_entity is not None:
        # True wherever the sub-type is contained in the entity's type value
        labels = [person_type in types_of_entity for person_type in all_person_types]
        labels_person.append(labels)
    else:
        # If entity not found, fill with False labels
        labels_person.append([False] * len(all_person_types))

# Converting labels_person to numpy array
labels_person = np.array(labels_person)

# We select only the embeddings of the person entities
embeddings_person = scale_person_data[person_index]


  0%|          | 0/67476 [00:00<?, ?it/s]


 27%|██▋       | 17911/67476 [00:00<00:00, 104075.34it/s]


 75%|███████▍  | 50359/67476 [00:00<00:00, 204040.06it/s]


100%|██████████| 67476/67476 [00:00<00:00, 207225.46it/s]




In [10]:
# Hold out 20 % of the person entities for testing; the fixed random_state makes
# the split repeatable across runs.
person_split = train_test_split(
    embeddings_person,
    labels_person,
    test_size=0.2,
    random_state=42,
)
X_person_train, X_person_test, y_person_train, y_person_test = person_split

In [11]:
# Class balance for every column of labels_person (one column per person sub-type)

for idx in range(labels_person.shape[1]):
    column = labels_person[:, idx]
    nb_condition = np.count_nonzero(column == True)
    nb_not_condition = np.count_nonzero(column == False)

    nb_total = nb_condition + nb_not_condition

    print(f"The number of entities following the condition is {nb_condition}, which is {round(100*nb_condition/nb_total,2)} %")
    print(f"The number of entities not following the condition is {nb_not_condition}, which is {round(100*nb_not_condition/nb_total,2)} %")
print("The total number of entities is ", nb_total)

The number of entities following the condition is 44820, which is 66.42 %
The number of entities not following the condition is 22656, which is 33.58 %
The number of entities following the condition is 5251, which is 7.78 %
The number of entities not following the condition is 62225, which is 92.22 %
The number of entities following the condition is 2769, which is 4.1 %
The number of entities not following the condition is 64707, which is 95.9 %
The number of entities following the condition is 2137, which is 3.17 %
The number of entities not following the condition is 65339, which is 96.83 %
The number of entities following the condition is 1890, which is 2.8 %
The number of entities not following the condition is 65586, which is 97.2 %
The number of entities following the condition is 5152, which is 7.64 %
The number of entities not following the condition is 62324, which is 92.36 %
The total number of entities is  67476


In [12]:
# Fit one binary SVM per person sub-type (one column of the label matrix each).
# The searches are built with a comprehension so that every sub-type owns its
# own GridSearchCV: `[GridSearchCV(...)] * n` would repeat a reference to a single
# object, and each fit would overwrite the results of the previous one.
grid_person = [
    GridSearchCV(
        SVC(class_weight="balanced", gamma="scale"),
        param_grid,
        refit=True,
        verbose=1,
        scoring=make_scorer(cohen_kappa_score),
        n_jobs=16,
    )
    for _ in all_person_types
]

# Per sub-type decision values of the best estimator, one (n_entities, 1) column each
distances_svm = []
for idx, svm in enumerate(grid_person):
    svm.fit(X_person_train, y_person_train[:,idx])

    clf_person = svm.best_estimator_
    print("The best parameters are: ", svm.best_params_)
    print("The training accuracy is: {:.2f} %".format(clf_person.score(X_person_train, y_person_train[:,idx])*100))
    print("The testing accuracy is: {:.2f} %".format(clf_person.score(X_person_test, y_person_test[:,idx])*100))
    print("The kappa score (testing set) is {:.2f}".format(cohen_kappa_score(clf_person.predict(X_person_test), y_person_test[:,idx])))

    # Evaluated on every row of scale_person_data so the result can later be
    # indexed with the same indices as the embeddings.
    distances_svm.append(clf_person.decision_function(scale_person_data).reshape(-1,1))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 2.5, 'kernel': 'rbf'}


The training accuracy is: 99.31 %


The testing accuracy is: 97.29 %


The kappa score (testing set) is 0.94


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 7.5, 'kernel': 'rbf'}


The training accuracy is: 99.15 %


The testing accuracy is: 95.70 %


The kappa score (testing set) is 0.69


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 2.5, 'kernel': 'rbf'}


The training accuracy is: 99.73 %


The testing accuracy is: 97.96 %


The kappa score (testing set) is 0.71


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 2.5, 'kernel': 'rbf'}


The training accuracy is: 99.94 %


The testing accuracy is: 99.30 %


The kappa score (testing set) is 0.88


Fitting 5 folds for each of 6 candidates, totalling 30 fits




The best parameters are:  {'C': 1, 'kernel': 'rbf'}


The training accuracy is: 99.17 %


The testing accuracy is: 98.32 %


The kappa score (testing set) is 0.70


Fitting 5 folds for each of 6 candidates, totalling 30 fits




The best parameters are:  {'C': 2.5, 'kernel': 'rbf'}


The training accuracy is: 97.62 %


The testing accuracy is: 92.60 %


The kappa score (testing set) is 0.51


### Projection into a smaller space

In [13]:
# Projection: stack the per-type SVM decision values side by side, giving one
# row per entity and one column per person sub-type.
entity_person_proj = np.hstack(distances_svm)

### Evaluation

In [14]:
# Person rows of the entity table, with their label vector attached as a column.
# The explicit .copy() yields an independent frame, so adding the column does not
# raise pandas' SettingWithCopyWarning and cannot touch entities_df.
person_entities = entities_df.iloc[person_index].copy()
person_entities["labels"] = labels_person.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  person_entities["labels"] = labels_person.tolist()


In [None]:
# k parameter: number of nearest neighbours inspected for every reference entity
k=10

# Loop-invariant candidate sets. Fancy indexing with person_index copies the
# selected rows, so the copies are made once here rather than on every iteration.
person_embeddings_raw = scale_person_data[person_index]
person_embeddings_proj = entity_person_proj[person_index,:]

# Global similarity score (accumulated over all references, averaged below)
scores_raw = 0
scores_proj = 0
for idx in tqdm(person_index):
    # Choose a ref
    ref_name = person_entities['Entity'][idx]

    # Before projection
    closest_entity_raw = closest_neighbours(scale_person_data[idx], person_embeddings_raw, person_entities, k)
    scores_raw += similarity_score(ref_name, closest_entity_raw, person_entities)

    # After projection
    closest_entity = closest_neighbours(entity_person_proj[idx,:], person_embeddings_proj, person_entities, k)
    scores_proj += similarity_score(ref_name, closest_entity, person_entities)

scores_raw /= len(person_index)
scores_proj /= len(person_index)

In [17]:
# Report the averaged top-k similarity scores before and after the projection
report_raw = f"The similarity score before projection of the top{k} entity is: {round(scores_raw,3)}"
report_proj = f"The similarity score after projection of the top{k} entity is: {round(scores_proj,3)}"
print(report_raw, report_proj, sep="\n")

The similarity score before projection of the top10 entity is: 0.662
The similarity score after projection of the top10 entity is: 0.938
