# Playing with organizations locations and persons with worksAt

In [91]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV , train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, make_scorer
import numpy as np
import pandas as pd
from tqdm import tqdm
from copy import deepcopy
import torch
from kge.util.io import load_checkpoint

## Load entities and their embeddings

In [92]:
PATH_EMBED = '../embeddings/'

# Load entity type Dataframe
entities_df = pd.read_csv('data/entities_types.csv', sep=',', header=0)

#load YAGO Embeddings
checkpoint = load_checkpoint(PATH_EMBED + 'yago3-10-complex.pt') # Change the model here
model = checkpoint["model"]
tensor_embed: torch.Tensor = dict(checkpoint['model'][0])['_entity_embedder._embeddings.weight']
all_arrays = tensor_embed.numpy()

Loading Embeddings...


## Load dataset

In [93]:
# Load dataset to get the relations
print("Loading triples...")
GLOBAL_PATH = "data/"
NAME_SUBGRAPH = ["train", "test", "valid"]
cols_name = ["head", "relation", "tail"]
triples_df = pd.concat(
    (pd.read_csv(
        GLOBAL_PATH + f + '.txt',
        sep="\t",
        names=cols_name) for f in NAME_SUBGRAPH),
    ignore_index=True)
# Get distinct relations
distinct_relations = triples_df["relation"].unique()

Loading triples...


In [94]:
# Dictionnary for all the boolean value of relation regarding entitiy
value_relations = {relation: set() for relation in distinct_relations}

print("Check if the relations exists...")
for _, row in triples_df.iterrows():
    # Extract entity and relations
    head = row["head"]
    relation = row["relation"]
    # Add the entity to the corresponding set
    value_relations[relation].add(head)

Check if the relations exists...


In [95]:
print("Generate a global DataFrame with all the data...")
# Convert the sets into a boolean lists
value_relations = {relation: [entity in value_relations[relation]
                              for entity in entities_df["Entity"]] for relation in distinct_relations}
# Convert to a Pandas Dataframe
relations_df = pd.DataFrame(value_relations)
global_df = pd.concat((entities_df, relations_df), axis=1)

Generate a global DataFrame with all the data...


In [96]:
print("Loading dataframe city to continent")
# Load the Dataframe of city to continent
city_continent_df = pd.read_csv("data/city_global_location.csv", index_col=0)

Loading dataframe city to continent


In [97]:
# defining parameter range ofr GridSearch
# Define the hyperparameter grid
param_grid = [
  {'C': [0.1, 1, 2.5, 5, 7.5, 10], 'kernel': ['rbf']},
 ]

## Utils 

In [98]:
def similarity_score(ref, neighbours, df_with_labels, mode: str = "one"):
    # Determine the label of the reference
    label_ref = df_with_labels[df_with_labels['Entity']== ref]['labels'].to_list()[0]
    
    # Determine labels for all neighbors at once
    all_neighbours_labels = np.array([df_with_labels[df_with_labels['Entity'] == name]['labels'].to_list()[0] for name in neighbours])

    # Calculate intersection scores

    if mode == "one":
        intersect_labels = np.logical_and(label_ref, all_neighbours_labels)
        # Sum the intersect labels along axis 1 (columns) and count non-zero elements
        non_zero = np.count_nonzero(intersect_labels, axis=1)

        # Count number of rows with non-zero elements
        score = np.count_nonzero(non_zero)
    elif mode == "all":
        all_each_line = np.all(all_neighbours_labels == label_ref, axis=1)
        # Count number of rows with non-zero elements
        score = np.count_nonzero(all_each_line)
    else:
        raise ValueError("This is not a valid mode.")

    return score / len(neighbours)

def closest_neighbours(ref_entity, other_entities, df_with_labels, k, mode: str = "euclidean"):
    if mode == "euclidean":
        relative_ref = other_entities - ref_entity
        dist_to_ref = np.linalg.norm(relative_ref, axis=1)
        k_closest_entity = np.argsort(dist_to_ref)[1:(k+1)]
        return df_with_labels['Entity'].iloc[k_closest_entity]
    elif mode == "cosine":
        norm_ref = np.linalg.norm(ref_entity)
        ref_norm = ref_entity / norm_ref
        norm_other = np.linalg.norm(other_entities, axis=1)
        normalized_other = np.divide(other_entities, norm_other.reshape(-1,1)) # division elementwise
        cosine_sim = normalized_other.dot(ref_norm.T).reshape(-1,)
        # We take the k elemnt with the highest score
        k_closest_values = np.argsort(cosine_sim)[-(k+1):-1]
        return df_with_labels['Entity'].iloc[k_closest_values]
    raise ValueError("This is not a valid mode of similarity")

### Organization location

In [99]:
# Loads all the data related to humans
entity_type = "organization_108008335"

triples_organization_location = triples_df[(triples_df['relation'] == 'isLocatedIn')]
organization_head = set(triples_organization_location['head'].to_list())


organization_head = entities_df[entities_df['Entity'].isin(organization_head) & entities_df["type"].apply(lambda x: entity_type in x)]
organization_index = np.array(organization_head.index, dtype=int)

In [100]:
dict_locations = dict(city_continent_df[["cities","countries"]].values)

In [101]:
# Scale embeddings only to concerned data
scale_organization_data = deepcopy(all_arrays)
scale = StandardScaler()
scale_organization_data[organization_index] = scale.fit_transform(scale_organization_data[organization_index])

In [102]:
top_locations = ["United States", "United Kingdom", "Canada", "Japan", "France"]
SVC_organization: list[GridSearchCV] = [GridSearchCV(SVC(class_weight='balanced'), param_grid, refit = True, verbose = 1, scoring = make_scorer(cohen_kappa_score), n_jobs=-1)]*len(top_locations)

In [103]:
def isInCountry(location, country_ref):
    if location in dict_locations.keys():
        country = dict_locations[location]
        return country == country_ref
    return False

labels_organization = [np.array([isInCountry(triples_organization_location[triples_organization_location['head'] == organization]['tail'].to_list()[0], location) for organization in entities_df['Entity'][organization_index]]) for location in tqdm(top_locations)]

100%|██████████| 5/5 [01:27<00:00, 17.59s/it]


In [104]:
decision_organization = []
for idx, svm in enumerate(SVC_organization):
    # We split the entities
    X_organization_train, X_organization_test, y_organization_train, y_organization_test  = train_test_split(scale_organization_data[organization_index], labels_organization[idx], test_size=0.2, random_state=42)
    svm.fit(X_organization_train, y_organization_train)
    print("The training accuracy is: {:.2f} %".format(svm.score(X_organization_train, y_organization_train)*100))
    print("The testing accuracy is: {:.2f} %".format(svm.score(X_organization_test, y_organization_test)*100))
    print("The testing kappa score is {:.2f}".format(cohen_kappa_score(svm.predict(X_organization_test), y_organization_test)))
    print("The best parameters are", svm.best_params_)
    decision_organization.append(svm.best_estimator_.decision_function(scale_organization_data).reshape(-1, 1))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 78.19 %
The testing accuracy is: 43.37 %
The testing kappa score is 0.43
The best parameters are {'C': 1, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 87.68 %
The testing accuracy is: 58.43 %
The testing kappa score is 0.58
The best parameters are {'C': 1, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 73.40 %
The testing accuracy is: 46.93 %
The testing kappa score is 0.47
The best parameters are {'C': 0.1, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 86.35 %
The testing accuracy is: 60.18 %
The testing kappa score is 0.60
The best parameters are {'C': 1, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 80.28 %
The testing accuracy is: 61.77 %
The testing kappa score is 0.62
The

### Projecting into the new space

In [105]:
# Project and add bias through decision function
organization_proj = np.concatenate(decision_organization, axis=1)

### Evaluation

In [106]:
# Add a new column with all labels
labels_organization = np.array(labels_organization).T
organization_head["labels"] = labels_organization.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  organization_head["labels"] = labels_organization.tolist()


### Neighbourhood similarity

In [108]:
# Global similarity score
k=10
scores_raw = 0
scores_proj = 0
for idx in tqdm(organization_index):
    # Choose a ref
    ref_proj = organization_proj[idx,:]
    ref_name = organization_head['Entity'][idx]
    # Before projection
    ref_entity = scale_organization_data[idx,:]
    closest_entity_raw = closest_neighbours(ref_entity, scale_organization_data[organization_index], organization_head, k)
    scores_raw += similarity_score(ref_name, closest_entity_raw, organization_head, mode="all")

    # After projection
    closest_entity = closest_neighbours(ref_proj, organization_proj[organization_index], organization_head, k)
    scores_proj += similarity_score(ref_name, closest_entity, organization_head, mode="all")

scores_raw /= len(organization_index)
scores_proj /= len(organization_index)

100%|██████████| 2773/2773 [00:41<00:00, 66.43it/s]


In [109]:
print(f"The similarity score before projection of the top{k} entity is: {round(scores_raw,3)}")
print(f"The similarity score after projection of the top{k} entity is: {round(scores_proj,3)}")

The similarity score before projection of the top10 entity is: 0.703
The similarity score after projection of the top10 entity is: 0.897


### WorksAt + type

In [110]:
# Loads all the data related to humans
entity_type = "person_100007846"

triples_worksat = triples_df[(triples_df['relation'] == 'worksAt')]
worksat_head = set(triples_worksat['head'].to_list())


worksat_head = entities_df[entities_df['Entity'].isin(worksat_head) & entities_df["type"].apply(lambda x: entity_type in x)]
worksat_index = np.array(worksat_head.index, dtype=int)

In [111]:
dict_type = dict(entities_df[["Entity", "type"]].values)

In [112]:
# Scale embeddings only to concerned data
scale_worksat_data = deepcopy(all_arrays)
scale = StandardScaler()
scale_worksat_data[worksat_index] = scale.fit_transform(scale_worksat_data[worksat_index])

In [113]:
top_work = ["university_108286569", "educational_institution_108276342", "organization_108008335"]
SVC_worksat: list[GridSearchCV] = [GridSearchCV(SVC(class_weight='balanced'), param_grid, refit = True, verbose = 1, scoring = make_scorer(cohen_kappa_score), n_jobs=-1)]*len(top_work)

In [114]:
def isWork(work_type, work_ref):
    if work_type in dict_type.keys():
        work: list[str] = dict_type[work_type]
        return work_ref in work
    return False

labels_worksat = [np.array([isWork(triples_worksat[triples_worksat['head'] == person]['tail'].to_list()[0], work) for person in entities_df['Entity'][worksat_index]]) for work in tqdm(top_work)]

100%|██████████| 3/3 [00:02<00:00,  1.47it/s]


In [115]:
decision_worksat = []
for idx, svm in enumerate(SVC_worksat):
    # We split the entities
    X_worksat_train, X_worksat_test, y_worksat_train, y_worksat_test  = train_test_split(scale_worksat_data[worksat_index], labels_worksat[idx], test_size=0.2, random_state=42)
    svm.fit(X_worksat_train, y_worksat_train)
    print("The training accuracy is: {:.2f} %".format(svm.score(X_worksat_train, y_worksat_train)*100))
    print("The testing accuracy is: {:.2f} %".format(svm.score(X_worksat_test, y_worksat_test)*100))
    print("The testing kappa score is {:.2f}".format(cohen_kappa_score(svm.predict(X_worksat_test), y_worksat_test)))
    print("The best parameters are", svm.best_params_)
    decision_worksat.append(svm.best_estimator_.decision_function(scale_worksat_data).reshape(-1, 1))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 59.75 %
The testing accuracy is: 20.57 %
The testing kappa score is 0.21
The best parameters are {'C': 0.1, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 92.82 %
The testing accuracy is: 38.63 %
The testing kappa score is 0.39
The best parameters are {'C': 1, 'kernel': 'rbf'}
Fitting 5 folds for each of 6 candidates, totalling 30 fits
The training accuracy is: 90.76 %
The testing accuracy is: 30.70 %
The testing kappa score is 0.31
The best parameters are {'C': 1, 'kernel': 'rbf'}


### Projecting into the new space

In [116]:
# Project and add bias through decision function
worksat_proj = np.concatenate(decision_worksat, axis=1)

### Evaluation

In [117]:
# Add a new column with all labels
labels_worksat = np.array(labels_worksat).T
worksat_head["labels"] = labels_worksat.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  worksat_head["labels"] = labels_worksat.tolist()


### Neighbourhood similarity

In [119]:
# Global similarity score
k=10
scores_raw = 0
scores_proj = 0
for idx in tqdm(worksat_index):
    # Choose a ref
    ref_proj = worksat_proj[idx,:]
    ref_name = worksat_head['Entity'][idx]
    # Before projection
    ref_entity = scale_worksat_data[idx,:]
    closest_entity_raw = closest_neighbours(ref_entity, scale_worksat_data[worksat_index], worksat_head, k)
    scores_raw += similarity_score(ref_name, closest_entity_raw, worksat_head, mode="all")

    # After projection
    closest_entity = closest_neighbours(ref_proj, worksat_proj[worksat_index], worksat_head, k)
    scores_proj += similarity_score(ref_name, closest_entity, worksat_head, mode="all")

scores_raw /= len(worksat_index)
scores_proj /= len(worksat_index)

100%|██████████| 1376/1376 [00:14<00:00, 91.85it/s] 


In [120]:
print(f"The similarity score before projection of the top{k} entity is: {round(scores_raw,3)}")
print(f"The similarity score after projection of the top{k} entity is: {round(scores_proj,3)}")

The similarity score before projection of the top10 entity is: 0.465
The similarity score after projection of the top10 entity is: 0.807
