# Person analysis: relations playsFor and wasBornIn

In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV , train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, make_scorer
import numpy as np
import pandas as pd
from tqdm import tqdm
from copy import deepcopy
import torch
from kge.util.io import load_checkpoint

## Load entities with their embbedings

In [2]:
# Load entity type Dataframe
entities_df = pd.read_csv('data/entities_types.csv', sep=',', header=0)

#load YAGO Embeddings
checkpoint = load_checkpoint('../embeddings/yago3-10-complex.pt') # Change the model here
model = checkpoint["model"]
tensor_embed: torch.Tensor = dict(checkpoint['model'][0])['_entity_embedder._embeddings.weight']
all_arrays = tensor_embed.numpy()

## Load dataset

In [3]:
# Load dataset to get the relations
print("Loading triples...")
GLOBAL_PATH = "data/"
NAME_SUBGRAPH = ["train", "test", "valid"]
cols_name = ["head", "relation", "tail"]
triples_df = pd.concat(
    (pd.read_csv(
        GLOBAL_PATH + f + '.txt',
        sep="\t",
        names=cols_name) for f in NAME_SUBGRAPH),
    ignore_index=True)

Loading triples...


In [4]:
print("Loading dataframe city to continent")
# Load the Dataframe of city to continent
city_continent_df = pd.read_csv("data/isLocated_global.csv", index_col=0)

Loading dataframe city to continent


In [5]:
# defining parameter range ofr GridSearch
# Define the hyperparameter grid
param_grid = [
  {'C': [0.1, 1, 2.5, 5, 7.5, 10], 'kernel': ['rbf']},
 ]

## Utils

In [6]:
def similarity_score(ref, neighbours, df_with_labels, mode: str = "all"):
    # Determine the label of the reference
    label_ref = df_with_labels[df_with_labels['Entity']== ref]['labels'].to_list()[0]
    
    # Determine labels for all neighbors at once
    all_neighbours_labels = np.array([df_with_labels[df_with_labels['Entity'] == name]['labels'].to_list()[0] for name in neighbours])

    # Calculate intersection scores

    if mode == "one":
        intersect_labels = np.logical_and(label_ref, all_neighbours_labels)
        # Sum the intersect labels along axis 1 (columns) and count non-zero elements
        non_zero = np.count_nonzero(intersect_labels, axis=1)

        # Count number of rows with non-zero elements
        score = np.count_nonzero(non_zero)
    elif mode == "all":
        all_each_line = np.all(all_neighbours_labels == label_ref, axis=1)
        # Count number of rows with non-zero elements
        score = np.count_nonzero(all_each_line)
    else:
        raise ValueError("This is not a valid mode.")

    return score / len(neighbours)

def closest_neighbours(ref_entity, other_entities, df_with_labels, k, mode: str = "euclidean"):
    if mode == "euclidean":
        relative_ref = other_entities - ref_entity
        dist_to_ref = np.linalg.norm(relative_ref, axis=1)
        k_closest_entity = np.argsort(dist_to_ref)[1:(k+1)]
        return df_with_labels['Entity'].iloc[k_closest_entity]
    elif mode == "cosine":
        norm_ref = np.linalg.norm(ref_entity)
        ref_norm = ref_entity / norm_ref
        norm_other = np.linalg.norm(other_entities, axis=1)
        normalized_other = np.divide(other_entities, norm_other.reshape(-1,1)) # division elementwise
        cosine_sim = normalized_other.dot(ref_norm.T).reshape(-1,)
        # We take the k elemnt with the highest score
        k_closest_values = np.argsort(cosine_sim)[-(k+1):-1]
        return df_with_labels['Entity'].iloc[k_closest_values]
    raise ValueError("This is not a valid mode of similarity")

## Apply SVM on Persons

#### We filter the entities (only person type)

In [7]:
# Loads all the data corresponding to person type
entity_type = "person_100007846"

# Triples with the relation wasBornIn
triples_birth = triples_df[(triples_df['relation'] == 'wasBornIn')]
birth_head = set(triples_birth['head'].to_list())
birth_name = entities_df[entities_df['Entity'].isin(birth_head) & entities_df["type"].apply(lambda x: entity_type in x)]
birth_index = np.array(birth_name.index, dtype=int)

In [8]:
# Adapt scale embeddings to only concerned data
scale_data = deepcopy(all_arrays)
scale =StandardScaler()
scale_data[birth_index] = scale.fit_transform(scale_data[birth_index])

In [9]:
all_continents = ['Europe', 'Asia', 'North America']

In [None]:
def is_in_continent(city: str, continent: str) -> bool:
    """Return True if the city is in the specified continent, otherwise False"""
    city_row = city_continent_df[city_continent_df['cities'] == city]
    return (city_row['continents'] == continent).any()

# We label the data
labels_birth = np.array([
     [is_in_continent(triples_birth[triples_birth['head'] == entity]['tail'].item(), continent)
     for continent in all_continents]
     for entity in tqdm(entities_df['Entity'][birth_index], total=len(entities_df['Entity'][birth_index]))
])

# We select only the embeddings of the entities
embeddings_birth = scale_data[birth_index,:]

In [11]:
# We split the entities
X_birth_train, X_birth_test, y_birth_train, y_birth_test  = train_test_split(embeddings_birth, labels_birth, test_size=0.2, random_state=42)

In [12]:
#Statistics place
# Europe
nb_europe = len(labels_birth[labels_birth[:,0] == True])
nb_not_europe = len(labels_birth[labels_birth[:,0] == False])

nb_total = nb_europe + nb_not_europe

# Asia
nb_asia = len(labels_birth[labels_birth[:,1] == True])
nb_not_asia = len(labels_birth[labels_birth[:,1] == False])

# North America
nb_north_america = len(labels_birth[labels_birth[:,2] == True])
nb_not_north_america = len(labels_birth[labels_birth[:,2] == False])

print(f"The total number of entities is {nb_total}")
print(f"The number of European entities is {nb_europe}, which is {round(100*nb_europe/nb_total,2)} %")
print(f"The number of non European entities is {nb_not_europe}, which is {round(100*nb_not_europe/nb_total,2)} %")
print(f"The number of Asian entities is {nb_asia}, which is {round(100*nb_asia/nb_total,2)} %")
print(f"The number of non Asian entities is {nb_not_asia}, which is {round(100*nb_not_asia/nb_total,2)} %")
print(f"The number of North American entities is {nb_north_america}, which is {round(100*nb_north_america/nb_total,2)} %")
print(f"The number of non North American entities is {nb_not_north_america}, which is {round(100*nb_not_north_america/nb_total,2)} %")

The total number of entities is 43017
The number of European entities is 23986, which is 55.76 %
The number of non European entities is 19031, which is 44.24 %
The number of Asian entities is 3257, which is 7.57 %
The number of non Asian entities is 39760, which is 92.43 %
The number of North American entities is 9769, which is 22.71 %
The number of non North American entities is 33248, which is 77.29 %


In [13]:
# Process SVM for all continents
grid_birth = [GridSearchCV(SVC(class_weight="balanced", gamma="scale"), param_grid, refit=True, verbose=1, scoring = make_scorer(cohen_kappa_score), n_jobs=16)]*len(all_continents)

decision_birth = []
for idx, svm in enumerate(grid_birth):
    svm.fit(X_birth_train, y_birth_train[:,idx])

    clf_birth = svm.best_estimator_
    print("The best parameters are: ", svm.best_params_)
    print("The training accuracy is: {:.2f} %".format(clf_birth.score(X_birth_train, y_birth_train[:,idx])*100))
    print("The testing accuracy is: {:.2f} %".format(clf_birth.score(X_birth_test, y_birth_test[:,idx])*100))
    print("The kappa score (testing set) is {:.2f}".format(cohen_kappa_score(clf_birth.predict(X_birth_test), y_birth_test[:,idx])))
    decision_birth.append(svm.best_estimator_.decision_function(scale_data).reshape(-1,1))


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 2.5, 'kernel': 'rbf'}


The training accuracy is: 99.60 %


The testing accuracy is: 94.85 %


The kappa score (testing set) is 0.90


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 2.5, 'kernel': 'rbf'}


The training accuracy is: 99.98 %


The testing accuracy is: 98.35 %


The kappa score (testing set) is 0.88


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 10, 'kernel': 'rbf'}


The training accuracy is: 100.00 %


The testing accuracy is: 97.00 %


The kappa score (testing set) is 0.91


### Projection into a smaller space

In [14]:
# Project and add bias through decision function
entity_proj = np.concatenate(decision_birth, axis=1)

### Evaluation

In [15]:
# Concatenate all labels for each axis
intersect_entities = entities_df.iloc[birth_index]
intersect_entities["labels"] = labels_birth.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intersect_entities["labels"] = labels_birth.tolist()


In [None]:
# k parameter
k=10

# Global similarity score
scores_raw = 0
scores_proj = 0
for idx in tqdm(birth_index):
    # Choose a ref
    ref_name = intersect_entities['Entity'][idx]

    # Before projection
    closest_entity_raw = closest_neighbours(scale_data[idx,:], scale_data[birth_index], intersect_entities, k)
    scores_raw += similarity_score(ref_name, closest_entity_raw, intersect_entities)

    # After projection
    closest_entity = closest_neighbours(entity_proj[idx,:], entity_proj[birth_index], intersect_entities, k)
    scores_proj += similarity_score(ref_name, closest_entity, intersect_entities)

scores_raw /= len(birth_index)
scores_proj /= len(birth_index)

In [18]:
print(f"The similarity score before projection of the top{k} entity is: {round(scores_raw,3)}")
print(f"The similarity score after projection of the top{k} entity is: {round(scores_proj,3)}")

The similarity score before projection of the top10 entity is: 0.831
The similarity score after projection of the top10 entity is: 0.979


### We repeat this process for the relation playsFor

In [19]:
# Triples with the relation playsFor
triples_play = triples_df[(triples_df['relation'] == 'playsFor')]
play_tail = set(triples_play['tail'].to_list())

triples_location = triples_df[triples_df["head"].isin(play_tail) & (triples_df["relation"] == "isLocatedIn")]
location_head = set(triples_location["head"].to_list())

# We take the intersection of the triples due to the lack of some triples in Yago3-10
triples_play_location = triples_df[(triples_df['relation'] == 'playsFor') & (triples_df['tail'].isin(location_head))] 
play_location_head = set(triples_play_location["head"].to_list()) 

play_name = entities_df[entities_df['Entity'].isin(play_location_head) & entities_df["type"].apply(lambda x: entity_type in x)]
play_name_index = play_name.index
play_index = np.array(play_name_index, dtype=int)



In [20]:
# Adapt scale embeddings to only concerned data
scale_play_data = deepcopy(all_arrays)
scale =StandardScaler()
scale_play_data[play_index] = scale.fit_transform(scale_play_data[play_index])

In [21]:
all_countries = ['United Kingdom', 'Germany', 'Italy', 'United States']

In [None]:
def play_is_in_continent(entity: str, country: str) -> bool:
    """Return True if the city is in the specified country, otherwise False"""
    play = triples_play_location[triples_play_location['head'] == entity]['tail'].to_list()[0]
    location = triples_location[(triples_location['head']==play) & (triples_location["relation"]=="isLocatedIn")]["tail"].to_list()[0]
    city_row = city_continent_df[city_continent_df['cities'] == location]
    return (city_row['countries'] == country).any()

# We label the data
labels_play = np.array([
     [play_is_in_continent(entity, country)
     for country in all_countries]
     for entity in tqdm(entities_df['Entity'][play_index], total=len(entities_df['Entity'][play_index]))
])

# We select only the embeddings of the entities
embeddings_play = scale_play_data[play_index]

In [23]:
# We split the entities
X_play_train, X_play_test, y_play_train, y_play_test  = train_test_split(embeddings_play, labels_play, test_size=0.2, random_state=42)

In [24]:
# Process SVM for all continents
grid_play = [GridSearchCV(SVC(class_weight="balanced", gamma="scale"), param_grid, refit=True, verbose=1, scoring = make_scorer(cohen_kappa_score), n_jobs=-1)]*len(all_countries)

decision_play = []
for idx, svm in enumerate(grid_play):
    svm.fit(X_play_train, y_play_train[:,idx])

    clf_play = svm.best_estimator_
    print("The best parameters are: ", svm.best_params_)
    print("The training accuracy is: {:.2f} %".format(clf_play.score(X_play_train, y_play_train[:,idx])*100))
    print("The testing accuracy is: {:.2f} %".format(clf_play.score(X_play_test, y_play_test[:,idx])*100))
    print("The kappa score (testing set) is {:.2f}".format(cohen_kappa_score(clf_play.predict(X_play_test), y_play_test[:,idx])))
    decision_play.append(svm.best_estimator_.decision_function(scale_data).reshape(-1,1))


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 2.5, 'kernel': 'rbf'}


The training accuracy is: 98.77 %


The testing accuracy is: 95.23 %


The kappa score (testing set) is 0.86


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 1, 'kernel': 'rbf'}


The training accuracy is: 98.61 %


The testing accuracy is: 96.89 %


The kappa score (testing set) is 0.83


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 1, 'kernel': 'rbf'}


The training accuracy is: 99.24 %


The testing accuracy is: 97.83 %


The kappa score (testing set) is 0.83


Fitting 5 folds for each of 6 candidates, totalling 30 fits


The best parameters are:  {'C': 2.5, 'kernel': 'rbf'}


The training accuracy is: 99.76 %


The testing accuracy is: 96.95 %


The kappa score (testing set) is 0.70


### Projecting into a smaller space

In [25]:
# Project and add bias through decision function
entity_play_proj = np.concatenate(decision_play, axis=1)

### Evaluation

In [26]:
# Concatenate all labels for each axis
play_entities = entities_df.iloc[play_index]
play_entities["labels"] = labels_play.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  play_entities["labels"] = labels_play.tolist()


In [None]:
# k parameter
k=10

# Global similarity score
scores_raw = 0
scores_proj = 0
for idx in tqdm(play_index):
    # Choose a ref
    ref_name = play_entities['Entity'][idx]

    # Before projection
    closest_entity_raw = closest_neighbours(scale_play_data[idx,:], scale_play_data[play_index], play_entities, k)
    scores_raw += similarity_score(ref_name, closest_entity_raw, play_entities)

    # After projection
    closest_entity = closest_neighbours(entity_play_proj[idx,:], entity_play_proj[play_index], play_entities, k)
    scores_proj += similarity_score(ref_name, closest_entity, play_entities)

scores_raw /= len(play_index)
scores_proj /= len(play_index)

In [29]:
print(f"The similarity score before projection of the top{k} entity is: {round(scores_raw,3)}")
print(f"The similarity score after projection of the top{k} entity is: {round(scores_proj,3)}")

The similarity score before projection of the top5 entity is: 0.854
The similarity score after projection of the top5 entity is: 0.943
