# Playing with embeddings: gender + wasBornIn, city isLocatedIn, scientists 

In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV , train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, make_scorer
import numpy as np
import pandas as pd
from tqdm import tqdm
from copy import deepcopy
import torch
from kge.util.io import load_checkpoint

## Load entities and their embeddings

In [2]:
# Load entity type Dataframe
entities_df = pd.read_csv('data/entities_types.csv', sep=',', header=0)

#load YAGO Embeddings
checkpoint = load_checkpoint('../embeddings/yago3-10-complex.pt') # Change the model here
model = checkpoint["model"]
tensor_embed: torch.Tensor = dict(checkpoint['model'][0])['_entity_embedder._embeddings.weight']
all_arrays = tensor_embed.numpy()

Loading Embeddings...


## Load dataset

In [3]:
# Load dataset to get the relations
print("Loading triples...")
GLOBAL_PATH = "data/"
NAME_SUBGRAPH = ["train", "test", "valid"]
cols_name = ["head", "relation", "tail"]
triples_df = pd.concat(
    (pd.read_csv(
        GLOBAL_PATH + f + '.txt',
        sep="\t",
        names=cols_name) for f in NAME_SUBGRAPH),
    ignore_index=True)
# Get distinct relations
distinct_relations = triples_df["relation"].unique()

Loading triples...


In [4]:
# Dictionnary for all the boolean value of relation regarding entitiy
value_relations = {relation: set() for relation in distinct_relations}

print("Check if the relations exists...")
for _, row in triples_df.iterrows():
    # Extract entity and relations
    head = row["head"]
    relation = row["relation"]
    # Add the entity to the corresponding set
    value_relations[relation].add(head)

Check if the relations exists...


In [5]:
print("Generate a global DataFrame with all the data...")
# Convert the sets into a boolean lists
value_relations = {relation: [entity in value_relations[relation]
                              for entity in entities_df["Entity"]] for relation in distinct_relations}
# Convert to a Pandas Dataframe
relations_df = pd.DataFrame(value_relations)
global_df = pd.concat((entities_df, relations_df), axis=1)

Generate a global DataFrame with all the data...


In [6]:
print("Loading dataframe city to continent")
# Load the Dataframe of city to continent
city_continent_df = pd.read_csv("data/city_global_location.csv", index_col=0)

Loading dataframe city to continent


In [7]:
# defining parameter range ofr GridSearch
# Define the hyperparameter grid
param_grid = [
  {'C': [0.1, 1, 2.5, 5, 7.5, 10], 'kernel': ['rbf']},
 ]

## Utils 

In [8]:
def similarity_score(ref, neighbours, df_with_labels, mode: str = "one"):
    # Determine the label of the reference
    label_ref = df_with_labels[df_with_labels['Entity']== ref]['labels'].to_list()[0]
    
    # Determine labels for all neighbors at once
    all_neighbours_labels = np.array([df_with_labels[df_with_labels['Entity'] == name]['labels'].to_list()[0] for name in neighbours])

    # Calculate intersection scores

    if mode == "one":
        intersect_labels = np.logical_and(label_ref, all_neighbours_labels)
        # Sum the intersect labels along axis 1 (columns) and count non-zero elements
        non_zero = np.count_nonzero(intersect_labels, axis=1)

        # Count number of rows with non-zero elements
        score = np.count_nonzero(non_zero)
    elif mode == "all":
        all_each_line = np.all(all_neighbours_labels == label_ref, axis=1)
        # Count number of rows with non-zero elements
        score = np.count_nonzero(all_each_line)
    else:
        raise ValueError("This is not a valid mode.")

    return score / len(neighbours)

def closest_neighbours(ref_entity, other_entities, df_with_labels, k, mode: str = "euclidean"):
    if mode == "euclidean":
        relative_ref = other_entities - ref_entity
        dist_to_ref = np.linalg.norm(relative_ref, axis=1)
        k_closest_entity = np.argsort(dist_to_ref)[1:(k+1)]
        return df_with_labels['Entity'].iloc[k_closest_entity]
    elif mode == "cosine":
        norm_ref = np.linalg.norm(ref_entity)
        ref_norm = ref_entity / norm_ref
        norm_other = np.linalg.norm(other_entities, axis=1)
        normalized_other = np.divide(other_entities, norm_other.reshape(-1,1)) # division elementwise
        cosine_sim = normalized_other.dot(ref_norm.T).reshape(-1,)
        # We take the k elemnt with the highest score
        k_closest_values = np.argsort(cosine_sim)[-(k+1):-1]
        return df_with_labels['Entity'].iloc[k_closest_values]
    raise ValueError("This is not a valid mode of similarity")

## Apply SVM on gender and place of birth (Person)

### SVM with preprocess for gender entities and location of birth

In [None]:
# Loads all the data corresponding to person type
entity_type = "person_100007846"

# Triples with the relation wasBornIn
triples_birth = triples_df[(triples_df['relation'] == 'wasBornIn')]
birth_head = set(triples_birth['head'].to_list())
birth_name = entities_df[entities_df['Entity'].isin(birth_head) & entities_df["type"].apply(lambda x: entity_type in x)]
birth_index = np.array(birth_name.index, dtype=int)

In [None]:
# Load index for gender
male_head = set(triples_df[(triples_df['relation'] == 'hasGender') & (triples_df['tail'] == 'male')]['head'].to_list())
male_index = np.array(entities_df[entities_df['Entity'].isin(male_head) & (entities_df["type"].apply(lambda x: entity_type in x))].index, dtype=int)
female_head = set(triples_df[(triples_df['relation'] == 'hasGender') & (triples_df['tail'] == 'female')]['head'].to_list())
female_index = np.array(entities_df[entities_df['Entity'].isin(female_head) & (entities_df["type"].apply(lambda x: entity_type in x))].index, dtype=int)

gender_head = set(triples_df[(triples_df['relation'] == 'hasGender')]['head'].to_list())
gender_index = np.array(entities_df[entities_df['Entity'].isin(gender_head) & entities_df["type"].apply(lambda x: entity_type in x)].index, dtype=int)

In [None]:
# Intersection of both indexes
intersect_index = np.intersect1d(birth_index, gender_index)

# Update the previous indexes
male_index = np.intersect1d(intersect_index, male_index)
female_index = np.intersect1d(intersect_index, female_index)
birth_index = np.intersect1d(intersect_index, birth_index)

In [None]:
# Adapt scale embeddings to only concerned data
scale_data = deepcopy(all_arrays)
scale =StandardScaler()
scale_data[intersect_index] = scale.fit_transform(scale_data[intersect_index])

In [None]:
# Get the correponding embedding
male_embeddings = scale_data[male_index,:]
female_embeddings = scale_data[female_index,:]

gender_embeddings = np.concatenate((male_embeddings, female_embeddings))

# Generate the labels
labels_gender = np.array([0]*len(male_embeddings) + [1]*len(female_embeddings))

# We split the entities
X_gender_train, X_gender_test, y_gender_train, y_gender_test  = train_test_split(gender_embeddings, labels_gender, test_size=0.2, random_state=42)

def isInEurope(city:str) -> bool:
    """Retrun True if the city is in Europe, otherwise False"""
    city_row = city_continent_df[city_continent_df['cities'] == city]
    return any(city_row['continents'] == 'Europe')

# We select only the embeddings with the place of birth
embeddings_city = scale_data[birth_index,:]

# We generate the labels
labels_place = np.array([isInEurope(triples_birth[triples_birth['head'] == entity]['tail'].item()) for entity in tqdm(entities_df['Entity'][intersect_index], total=len(entities_df['Entity'][intersect_index]))])

# We store the indexes of places verifying the condition 
condition_index = np.where(labels_place == True)[0]
not_condition_index = np.where(labels_place == False)[0]

# We split the entities
X_place_train, X_place_test, y_place_train, y_place_test  = train_test_split(embeddings_city, labels_place, test_size=0.2, random_state=42)

In [None]:
#Statistics gender
nb_male = len(male_embeddings)
nb_female = len(female_embeddings)
nb_person = nb_male + nb_female
print(f"The number of male entities is {nb_male}, which is {round(100*nb_male/nb_person,2)} %")
print(f"The number of female entities is {nb_female}, which is {round(100*nb_female/nb_person,2)} %")
print(f"The total number of persons is {nb_person}")

#Statistics place
nb_c = len(labels_place[labels_place == True])
nb_nc = len(labels_place[labels_place == False])
nb_condition = nb_c + nb_nc
print(f"The number of European entities is {nb_c}, which is {round(100*nb_c/nb_condition,2)} %")
print(f"The number of non European entities is {nb_nc}, which is {round(100*nb_nc/nb_condition,2)} %")
print(f"The total number of birth place is {nb_condition}")

In [None]:
# Process SVM with GridSearch Cross validation
grid_gender = GridSearchCV(SVC(class_weight='balanced', gamma='scale'), param_grid, refit = True, verbose = 1, scoring = make_scorer(cohen_kappa_score), n_jobs=16) 
grid_gender.fit(X_gender_train, y_gender_train)

# Get best estimator
clf_gender = grid_gender.best_estimator_
print("The training accuracy is: {:.2f} %".format(clf_gender.score(X_gender_train, y_gender_train)*100))
print("The testing accuracy is: {:.2f} %".format(clf_gender.score(X_gender_test, y_gender_test)*100))
print("The kappa score (testing set) is {:.2f}".format(cohen_kappa_score(clf_gender.predict(X_gender_test), y_gender_test)))

In [None]:
print(grid_gender.best_params_)

In [None]:
# Process SVM
grid_c = GridSearchCV(SVC(class_weight='balanced', gamma='scale'), param_grid, refit = True, verbose = 1, scoring = make_scorer(cohen_kappa_score), n_jobs=16) 
grid_c.fit(X_place_train, y_place_train)

# Get best estimator
clf_c = grid_c.best_estimator_
print("The training accuracy is: {:.2f} %".format(clf_c.score(X_place_train, y_place_train)*100))
print("The testing accuracy is: {:.2f} %".format(clf_c.score(X_place_test, y_place_test)*100))
print("The kappa score (testing set) is {:.2f}".format(cohen_kappa_score(clf_c.predict(X_place_test), y_place_test)))

In [None]:
print(grid_c.best_params_)

#### Projection in the new dimensional space

In [None]:
# Project and add bias through decision function
entity_proj = np.concatenate((clf_gender.decision_function(scale_data).reshape(-1,1), clf_c.decision_function(scale_data).reshape(-1,1)), axis=1)

### Evaluation

In [None]:
# Concatenate all labels for each axis
all_labels_gender = np.concatenate((labels_gender.reshape(-1,1), labels_place.reshape(-1,1)), axis=1)
intersect_entities = entities_df.iloc[intersect_index]
intersect_entities["labels"] = all_labels_gender.tolist()

In [None]:
# k parameter
k=10

# Global similarity score
scores_raw = 0
scores_proj = 0
for idx in tqdm(intersect_index):
    # Choose a ref
    ref_name = intersect_entities['Entity'][idx]

    # Before projection
    closest_entity_raw = closest_neighbours(scale_data[idx,:], scale_data[intersect_index], intersect_entities, k)
    scores_raw += similarity_score(ref_name, closest_entity_raw, intersect_entities)

    # After projection
    closest_entity = closest_neighbours(entity_proj[idx,:], entity_proj[intersect_index], intersect_entities, k)
    scores_proj += similarity_score(ref_name, closest_entity, intersect_entities)

scores_raw /= len(intersect_index)
scores_proj /= len(intersect_index)

In [None]:
print(f"The similarity score before projection of the top{k} entity is: {round(scores_raw,3)}")
print(f"The similarity score after projection of the top{k} entity is: {round(scores_proj,3)}")

## Apply SVM on the relation isLocatedIn

In [None]:
# Loads all the data converted to a continent
entity_type = "city_108524735"

isLocated_df = pd.read_csv('data/isLocated_global.csv', index_col=0)
all_unknown_location = set(isLocated_df[isLocated_df['continents'] == 'Unknown']['cities'].to_list())
triples_location = triples_df[(triples_df['relation'] == 'isLocatedIn') & (~triples_df['head'].isin(all_unknown_location))]
location_head = set(triples_location['head'].to_list())

location_name = entities_df[entities_df['Entity'].isin(location_head) & entities_df["type"].apply(lambda x: entity_type in x)]
location_index = np.array(location_name.index, dtype=int)

In [None]:
# Scale embeddings only to concerned data
scale_location_data = deepcopy(all_arrays)
scale = StandardScaler()
scale_location_data[location_index] = scale.fit_transform(scale_location_data[location_index])

### Label continent

In [None]:
def isinEuropeOrAsia(location) -> bool:
    continent = isLocated_df[isLocated_df['cities'] == location]['continents'].item()
    return continent == 'Europe' or continent == 'Asia'

def isinEuropeOrNorthAmerica(location) -> bool:
    continent = isLocated_df[isLocated_df['cities'] == location]['continents'].item()
    return continent == 'Europe' or continent == 'North America'

def isinNorthOrSouthAmerica(location) -> bool:
    continent = isLocated_df[isLocated_df['cities'] == location]['continents'].item()
    return continent == 'North America' or continent == 'South America'

# Label entities according to the continent
labels_EuAsia = np.array([isinEuropeOrAsia(loc) for loc in entities_df['Entity'][location_index]])
X_EuAsia_train, X_EuAsia_test, y_EuAsia_train, y_EuAsia_test  = train_test_split(scale_location_data[location_index], labels_EuAsia, test_size=0.2, random_state=42)

# Label entities according to the continent
labels_EuNAme = np.array([isinEuropeOrNorthAmerica(loc) for loc in entities_df['Entity'][location_index]])
X_EuNAme_train, X_EuNAme_test, y_EuNAme_train, y_EuNAme_test  = train_test_split(scale_location_data[location_index], labels_EuNAme, test_size=0.2, random_state=42)

# Label entities according to the continent
labels_NSAme = np.array([isinNorthOrSouthAmerica(loc) for loc in entities_df['Entity'][location_index]])
X_NSAme_train, X_NSAme_test, y_NSAme_train, y_NSAme_test  = train_test_split(scale_location_data[location_index], labels_NSAme, test_size=0.2, random_state=42)

In [None]:
#Statistics Europe or Asia
nb_EuAsia = len(labels_EuAsia[labels_EuAsia == True])
nb_total_loc = len(labels_EuAsia)
print(f"The number of entities in Europe or Asia is {nb_EuAsia}, which is {round(100*nb_EuAsia/nb_total_loc,2)} %")

#Statistics Europe or North America
nb_EuNAme = len(labels_EuNAme[labels_EuNAme == True])
print(f"The number of elements in Europe or North America is {nb_EuNAme}, which is {round(100*nb_EuNAme/nb_total_loc,2)} %")

#Statistics North America or South America
nb_NSAme = len(labels_NSAme[labels_NSAme == True])
print(f"The number of elements in Europe or North America is {nb_NSAme}, which is {round(100*nb_NSAme/nb_total_loc,2)} %")


print(f"The total number of concerned places is {nb_total_loc}")

### Train SVM

In [None]:
# Process SVM
grid_EuAsia = GridSearchCV(SVC(class_weight='balanced', gamma='scale'), param_grid, refit = True, verbose = 1, scoring = make_scorer(cohen_kappa_score), n_jobs=16)
grid_EuAsia.fit(X_EuAsia_train, y_EuAsia_train)
# Get best estimator
clf_EuAsia: SVC = grid_EuAsia.best_estimator_

print("The training accuracy is: {:.2f} %".format(clf_EuAsia.score(X_EuAsia_train, y_EuAsia_train)*100))
print("The testing accuracy is: {:.2f} %".format(clf_EuAsia.score(X_EuAsia_test, y_EuAsia_test)*100))
print("The kappa score (testing set) is {:.2f}".format(cohen_kappa_score(clf_EuAsia.predict(X_EuAsia_test), y_EuAsia_test)))

In [None]:
print(grid_EuAsia.best_params_)

In [None]:
# Process SVM

grid_EuNAme = GridSearchCV(SVC(class_weight='balanced', gamma='scale'), param_grid, refit = True, verbose = 1, scoring = make_scorer(cohen_kappa_score), n_jobs=16) 
grid_EuNAme.fit(X_EuNAme_train, y_EuNAme_train)
# Get best estimator
clf_EuNAme: SVC = grid_EuNAme.best_estimator_

print("The training accuracy is: {:.2f} %".format(clf_EuNAme.score(X_EuNAme_train, y_EuNAme_train)*100))
print("The testing accuracy is: {:.2f} %".format(clf_EuNAme.score(X_EuNAme_test, y_EuNAme_test)*100))
print("The kappa score (testing set) is {:.2f}".format(cohen_kappa_score(clf_EuNAme.predict(X_EuNAme_test), y_EuNAme_test)))

In [None]:
print(grid_EuNAme.best_params_)

In [None]:
# Process SVM
grid_NSAme = GridSearchCV(SVC(class_weight='balanced', gamma='scale'), param_grid, refit = True, verbose = 1, scoring = make_scorer(cohen_kappa_score), n_jobs=16) 
grid_NSAme.fit(X_NSAme_train, y_NSAme_train)

# Get best estimator
clf_NSAme: SVC = grid_NSAme.best_estimator_
print("The training accuracy is: {:.2f} %".format(clf_NSAme.score(X_NSAme_train, y_NSAme_train)*100))
print("The testing accuracy is: {:.2f} %".format(clf_NSAme.score(X_NSAme_test, y_NSAme_test)*100))
print("The kappa score (testing set) is {:.2f}".format(cohen_kappa_score(clf_NSAme.predict(X_NSAme_test), y_NSAme_test)))

In [None]:
print(grid_NSAme.best_params_)

In [None]:
# Project and add bias through decision function
entity_continent_proj = np.concatenate((clf_EuAsia.decision_function(scale_location_data).reshape(-1,1), clf_EuNAme.decision_function(scale_location_data).reshape(-1,1), clf_NSAme.decision_function(scale_location_data).reshape(-1,1)), axis=1)

In [None]:
europe_asia_index = np.where(labels_EuAsia == True)[0]
europe_namerica_index = np.where(labels_EuNAme == True)[0]
north_souh_america_index = np.where(labels_NSAme == True)[0]
not_north_souh_america_index = np.where(labels_NSAme == False)[0]
not_europe_asia_index = np.where(labels_EuAsia == False)[0]
not_europe_namerica_index = np.where(labels_EuNAme == False)[0]

europe_index = np.intersect1d(europe_asia_index, europe_namerica_index)
asia_index = np.intersect1d(europe_asia_index, not_europe_namerica_index)
namerica_index = np.intersect1d(europe_namerica_index, not_europe_asia_index)
samerica_index = np.intersect1d(north_souh_america_index, not_europe_namerica_index)

other_index = np.intersect1d(np.intersect1d(not_europe_asia_index, not_europe_namerica_index), not_north_souh_america_index)

### Evaluation

In [None]:
# Concatenate all labels for each axis
all_labels_location = np.concatenate((labels_EuAsia.reshape(-1,1), labels_EuNAme.reshape(-1,1), labels_NSAme.reshape(-1,1)), axis=1)
location_name["labels"] = all_labels_location.tolist()

In [None]:
print(all_labels_location[:,1].shape)

In [None]:
# k parameter
k=10

# Global similarity score
scores_raw = 0
scores_proj = 0
for idx in tqdm(location_index):
    # Choose a ref
    ref_proj = entity_continent_proj[idx,:]
    ref_name = location_name['Entity'][idx]

    # Before projection
    closest_entity_raw = closest_neighbours(scale_location_data[idx,:], scale_location_data[location_index], location_name, k, mode='cosine')
    scores_raw += similarity_score(ref_name, closest_entity_raw, location_name, mode="all")

    # After projection
    closest_entity = closest_neighbours(ref_proj, entity_continent_proj[location_index], location_name, k, mode='cosine')
    scores_proj += similarity_score(ref_name, closest_entity, location_name, mode="all")

scores_raw /= len(location_index)
scores_proj /= len(location_index)

In [None]:
print(f"The similarity score before projection of the top{k} entity is: {round(scores_raw,3)}")
print(f"The similarity score after projection of the top{k} entity is: {round(scores_proj,3)}")

## SVM on Scientists awards

In [9]:
# Loads all the data converted to a scientist
entity_type = "scientist_110560637"

triples_award = triples_df[(triples_df['relation'] == 'hasWonPrize')]
award_head = set(triples_award['head'].to_list())
scientist_award_head = entities_df[entities_df['Entity'].isin(award_head) & entities_df["type"].apply(lambda x: entity_type in x)]
scientist_award_index = np.array(scientist_award_head.index, dtype=int)

In [10]:
# Scale embeddings only to concerned data
scale_scientist_data = deepcopy(all_arrays)
scale = StandardScaler()
scale_scientist_data[scientist_award_index] = scale.fit_transform(scale_scientist_data[scientist_award_index])

In [11]:
top_awards = ["National_Medal_of_Science", "Nobel_Prize_in_Physics", "Copley_Medal", "Royal_Medal", "Nobel_Prize_in_Physiology_or_Medicine", "Nobel_Prize_in_Chemistry"]
SVC_awards: list[GridSearchCV] = [GridSearchCV(SVC(class_weight='balanced'), param_grid, refit = True, verbose = 1, scoring = make_scorer(cohen_kappa_score), n_jobs=16)]*len(top_awards)

### Label data

In [12]:
labels_award = [np.array([award in set(triples_award[triples_award['head'] == scientist]['tail'].to_list()) for scientist in entities_df['Entity'][scientist_award_index]]) for award in top_awards]

### Train SVM

In [13]:
decision_scientist = []
for idx, svm in enumerate(SVC_awards):
    # We split the entities
    X_award_train, X_award_test, y_award_train, y_award_test  = train_test_split(scale_scientist_data[scientist_award_index], labels_award[idx], test_size=0.2, random_state=42)
    svm.fit(X_award_train, y_award_train)
    print("The training accuracy is: {:.2f} %".format(svm.best_estimator_.score(X_award_train, y_award_train)*100))
    print("The testing accuracy is: {:.2f} %".format(svm.best_estimator_.score(X_award_test, y_award_test)*100))
    print("The testing kappa score is {:.2f}".format(cohen_kappa_score(svm.best_estimator_.predict(X_award_test), y_award_test)))
    print(svm.best_params_)
    decision_scientist.append(svm.best_estimator_.decision_function(scale_scientist_data).reshape(-1,1))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 100.00 %
The testing accuracy is: 100.00 %
The testing kappa score is 1.00
{'C': 2.5, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 100.00 %
The testing accuracy is: 99.62 %
The testing kappa score is 0.98
{'C': 1, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 100.00 %
The testing accuracy is: 99.25 %
The testing kappa score is 0.95
{'C': 2.5, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 100.00 %
The testing accuracy is: 100.00 %
The testing kappa score is 1.00
{'C': 2.5, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 100.00 %
The testing accuracy is: 99.25 %
The testing kappa score is 0.94
{'C': 1, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Th

### Project into the 2d space

In [14]:
# Project and add bias through decision function
scientist_award_proj = np.concatenate(decision_scientist, axis=1)

### Evaluation

In [16]:
# Add a new column with all labels
labels_award = np.array(labels_award).T
scientist_award_head["labels"] = labels_award.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scientist_award_head["labels"] = labels_award.tolist()


In [19]:
# Global similarity score
k=10
scores_raw = 0
scores_proj = 0
for idx in tqdm(scientist_award_index):
    # Choose a ref
    ref_proj = scientist_award_proj[idx,:]
    ref_name = scientist_award_head['Entity'][idx]

    # Before projection
    ref_entity = scale_scientist_data[idx,:]
    closest_entity_raw = closest_neighbours(ref_entity, scale_scientist_data[scientist_award_index], scientist_award_head, k, mode='cosine')
    scores_raw += similarity_score(ref_name, closest_entity_raw, scientist_award_head, mode='all')

    # After projection
    closest_entity = closest_neighbours(ref_proj, scientist_award_proj[scientist_award_index], scientist_award_head, k, mode='cosine')
    scores_proj += similarity_score(ref_name, closest_entity, scientist_award_head, mode='all')

scores_raw /= len(scientist_award_index)
scores_proj /= len(scientist_award_index)

100%|██████████| 1323/1323 [00:18<00:00, 73.04it/s] 


In [20]:
print(f"The similarity score before projection of the top{k} entity is: {round(scores_raw,3)}")
print(f"The similarity score after projection of the top{k} entity is: {round(scores_proj,3)}")

The similarity score before projection of the top10 entity is: 0.578
The similarity score after projection of the top10 entity is: 0.972
