In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from itertools import combinations
from math import comb
from time import time
from datetime import date
import warnings
from joblib import Parallel, delayed
import json
import copy
from collections import defaultdict

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

warnings.filterwarnings('ignore')
# pd.set_option('mode.chained_assignment', None)

In [2]:
WORDSIM_DF = '../data/evaluation/wordsim353_with_r3.csv'
WORDSIM_OLD_FINAL_FILE = "../data/evaluation/wordsim_old.csv"
DBPEDIA_MC_30_FINAL_FILE = "../data/evaluation/mc-30_DBpedia.csv"
DBPEDIA_RG_65_FINAL_FILE = "../data/evaluation/rg-65_DBpedia.csv"

CONCEPTNET_FILE = "../data/evaluation/kgtk_conceptnet_final.csv"
WIKI_CS_FILE = '../data/evaluation/wikidata-cs_final.csv'

INPUT_EMB_FOLDER_PATH = '../data/embeddings/'

In [3]:
# basis
P279_CHILD_PAR_DISTILBERT_COSSIM_FILE = "../data/basis/P279_ChildPar.all-distilroberta-v1.csv"
P279_SIBLINGS_DISTILBERT_COSSIM_FILE = "../data/basis/P279_Siblings.all-distilroberta-v1.csv"

P279_CHILD_PAR_CLASSSIM_FILE = "../data/basis/P279_ChildPar.classSim.csv"
P279_SIBLINGS_CLASSSIM_FILE = "../data/basis/P279_Siblings.classSim.csv"

PROBASE_FINAL_FILE = '../data/basis/intermediate_files/probase_WQnodes_subset_and_sim.csv'


In [4]:
WORDSIM_CLASS_SIM_FILE = '../data/embeddings/wordsim_class_sim.csv'
WORDSIM_JC_SIM_FILE = '../data/embeddings/wordsim_jc_sim.csv'
WORDSIM_TOP_SIM_FILE = '../data/embeddings/wordsim_top_sim.csv'

WORDSIM_OLD_CLASS_SIM_FILE = '../data/embeddings/wordsim_old_class_sim.csv'
WORDSIM_OLD_JC_SIM_FILE = '../data/embeddings/wordsim_old_jc_sim.csv'
WORDSIM_OLD_TOP_SIM_FILE = '../data/embeddings/wordsim_old_top_sim.csv'

DBPEDIA_MC_30_CLASS_SIM_FILE = '../data/embeddings/dbpedia_mc_30_class_sim.csv'
DBPEDIA_MC_30_JC_SIM_FILE = '../data/embeddings/dbpedia_mc_30_jc_sim.csv'
DBPEDIA_MC_30_TOP_SIM_FILE = '../data/embeddings/dbpedia_mc_30_top_sim.csv'

DBPEDIA_RG_65_CLASS_SIM_FILE = '../data/embeddings/dbpedia_rg_65_class_sim.csv'
DBPEDIA_RG_65_JC_SIM_FILE = '../data/embeddings/dbpedia_rg_65_jc_sim.csv'
DBPEDIA_RG_65_TOP_SIM_FILE = '../data/embeddings/dbpedia_rg_65_top_sim.csv'

# Retrofitting Pre-Req Class Definitions

## Utils

In [5]:
class Utils:
    """
    This contains all the utility functions needed by any part of retrofitting
    """
    _today = date.today()
    today_date = _today.strftime("%b_%d_%Y")
    LABELS = ['I','M','U']
    
    @classmethod
    def normalize(cls, embed_dict):
        for key, val in embed_dict.items():
            temp = np.array([float(val1) for val1 in val])
            temp2 = temp**2
            embed_dict[key] = temp / np.sqrt((temp2.sum() + 1e-6))
        return embed_dict
    
    @classmethod
    def fetch_embeddings(cls, df):
        embed_dict = {}
        for _, row in df.iterrows():
            embed_dict[row.node] = row.value
        return normalize(embed_dict)
    
    @classmethod
    def fill_coverage(cls, embed_dict, embed_name):
        wordsim_df = pd.read_csv(WORDSIM_DF)
#         wiki_cs_df = pd.read_csv(WIKICS_DF)
#         concept_net_df = pd.read_csv(CONCEPTNET_DF)
        
        compulsory_coverage_set = set(
                        wordsim_df['word1_kg_id'].to_list() 
                        + wordsim_df['word2_kg_id'].to_list()
                        + evalD.dbpedia_mc_30_df['word1_kg_id'].to_list()
                        + evalD.dbpedia_mc_30_df['word2_kg_id'].to_list()
                        + evalD.dbpedia_rg_65_df['word1_kg_id'].to_list()
                        + evalD.dbpedia_rg_65_df['word2_kg_id'].to_list())
#                         + wiki_cs_df['word1_kg_id'].to_list() 
#                         + wiki_cs_df['word2_kg_id'].to_list()
#                         + concept_net_df['word1_kg_id'].to_list()
#                         + concept_net_df['word2_kg_id'].to_list())
        
        embed_size = len(embed_dict[next(iter(embed_dict))])
        count = 0
        for word in compulsory_coverage_set:
            if word not in embed_dict:
                embed_dict[word] = np.zeros((embed_size))
                count += 1
        print(f"Added {count} corrections to {embed_name}")
        return embed_dict
    
    @classmethod
    def check_coverage(cls, embed_dict):
        wordsim_df = pd.read_csv(WORDSIM_DF)
        
        compulsory_coverage_set = set(list(zip(wordsim_df['word1_kg_id'].to_list(), wordsim_df['word2_kg_id'].to_list())))
        embed_size = len(embed_dict[next(iter(embed_dict))])
        count = 0
        for word1, word2 in compulsory_coverage_set:
            if word1 not in embed_dict or word2 not in embed_dict:
                count += 1
        return (len(wordsim_df) - count)
    
    @classmethod
    def check_eval_coverage(cls, embed_dict, eval_df):
#         wordsim_df = pd.read_csv(WORDSIM_DF)
        
        compulsory_coverage_set = set(eval_df['word1_kg_id'].to_list() + eval_df['word2_kg_id'].to_list())
        embed_size = len(embed_dict[next(iter(embed_dict))])
        count = 0
        for word in compulsory_coverage_set:
            if word not in embed_dict:
                count += 1
        return count
    
    @classmethod
    def find_missing_words(cls, embed_dict):
#         wordsim_df = pd.read_csv(WORDSIM_DF)
        
        compulsory_coverage_set = set((
            inp.wordsim_df['word1_kg_id'].to_list()) + (inp.wordsim_df['word2_kg_id'].to_list()
       ))
        embed_size = len(embed_dict[next(iter(embed_dict))])
        missing_words = []
        for word in compulsory_coverage_set:
            if word not in embed_dict or not(embed_dict[word].any()):
                missing_words.append(word)
        return missing_words
    
    @classmethod
    def determine_distances(cls, embed_dict, new_embed_dict):
        dist = []
        for word in embed_dict.keys():
            dist.append(euclidean_distances([embed_dict[word]], [new_embed_dict[word]])[0][0])
        return dist
    
    @classmethod
    def serialize_embedding_dict(cls, embed_dict):
        for key2 in embed_dict.keys():
            embed_dict[key2] = embed_dict[key2].tolist() if type(embed_dict[key2]) != list else embed_dict[key2]
        return embed_dict
    
    @classmethod
    def deserialize_embedding_dict(cls, embed_dict):
        for key2 in embed_dict.keys():
            embed_dict[key2] = np.array(embed_dict[key2])
        return embed_dict
    
    @classmethod
    def label_samples(cls, score):
        return 'I' if score <= 1.75 else 'U' if score >= 3.5 else 'M'
    
    @classmethod
    def alt_label_samples(cls, score, quartiles):
        return ['Q'+str(i+1) for i in range(len(quartiles) - 1) if quartiles[i] <= score < quartiles[i+1]][0]
    
    @classmethod
    def alt2_label_samples(cls, row, quartiles):
        return [i for i, quartile in (quartiles.items()) if (row.word1_kg_id, row.word2_kg_id) in quartile][0]
    
    @classmethod
    def determine_cos_sim(cls, emb1, emb2):
        return cosine_similarity(
                np.array(emb1).reshape(1,-1), 
                np.array(emb2).reshape(1,-1)
            )[0][0]
    
    @classmethod
    def plot_confusion_matrix(cls, conf_matrix, title):
        plt.close()
        sns.heatmap(conf_matrix, xticklabels=Utils.LABELS, yticklabels=Utils.LABELS, annot=True)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title(title+' Confusion Matrix')

In [6]:
# import sys
# def sizeof_fmt(num, suffix='B'):
#     ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
#     for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
#         if abs(num) < 1024.0:
#             return "%3.1f %s%s" % (num, unit, suffix)
#         num /= 1024.0
#     return "%.1f %s%s" % (num, 'Yi', suffix)

#     for name, size in sorted(((name, sys.getsizeof(value)) for name, value in locals().items()),
#                              key= lambda x: -x[1])[:10]:
#         print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

## Inputs

In [7]:
class InputEmbeddings:
    """
    Instance variables:
        - embed_dict_master - holds all qnode to embedding mappings as a dictionary
        - embedding_list - list of all keys of the above dictionary
    """
    def __init__(self, has_embeddings_include: bool = True):
        self.embed_dict_master = {}
        self.emb_list = ['text_7_props', 'complex', 'transe', 'abstract_first_sent', 'labels', 'labels_n_desc']
        if has_embeddings_include: # TODO
            self.emb_list += ['has_h', 'has_s']
        self.embedding_lengths = {}
        
        for emb_key in tqdm(self.emb_list, desc='Input Embeddings', leave=False):
            self.embed_dict_master[emb_key] = self.fetch_embedding(emb_key)
            
        self.fetch_embedding_stats()
        print("Fetched all input embeddings")

    def fetch_embedding(self, emb_key):
        emb = Utils.deserialize_embedding_dict(
                json.load(open(INPUT_EMB_FOLDER_PATH+emb_key+'_orig_embedding_dict.json'))
            )
#         return emb
        print(f"OG Coverage of {emb_key}: {Utils.check_coverage(emb)}")
        return Utils.fill_coverage(
                emb, emb_key
            )

    def fetch_embedding_stats(self):
        for emb_name in self.embed_dict_master.keys():
            self.embedding_lengths[emb_name] = len(next(iter(self.embed_dict_master[emb_name].values())))
            print(f"Embedding: {emb_name}, Size: {len(self.embed_dict_master[emb_name].keys())}, Length: {self.embedding_lengths[emb_name]}")

class ReducedInputEmbeddings:
    def __init__(self, embed_dict_master, final_embed_len):
        self.embed_dict_master = copy.deepcopy(embed_dict_master)
        self.final_embed_len = final_embed_len
        for key in tqdm(self.embed_dict_master.keys()):
#             tsne = TSNE(final_embed_len, verbose=1, method='exact')
            tfmr = PCA(final_embed_len)
            tfmr_proj = tfmr.fit_transform(pd.DataFrame(list(self.embed_dict_master[key].values())))
            tfmr_proj = normalize(tfmr_proj, axis=0)
            for w_key, emb in zip(self.embed_dict_master[key].keys(), tfmr_proj):
                self.embed_dict_master[key][w_key] = emb        
                
    def generate_concatenated_embedding_dict(self, key_comb):
        embedDict = defaultdict(list)
        masterKeySet = set()
        for key in key_comb:
            for qnode in self.embed_dict_master[key]:
                masterKeySet.add(qnode)
        for qnode in masterKeySet:
            for key in key_comb:
                if qnode in self.embed_dict_master[key]:
                    embedDict[qnode] = embedDict[qnode] + (self.embed_dict_master[key][qnode].tolist())
                else:
#                     print("Hit missing elem branch for concatenation")
                    embedDict[qnode] = embedDict[qnode] + [0]*self.final_embed_len
            embedDict[qnode] = np.array(embedDict[qnode])
        return dict(embedDict)

class InputScoreTables:
    def __init__(self, embed_dict_master, exception_cols, eval_file, new_embed_dict_master=None, new_embed_suffix='_bert_child_par_1_weighted'):
        self.input_score_tables = {}
        self.new_embed_suffix = new_embed_suffix
        print(f"Fetching {eval_file} wordsim score tables and eval file")
        if eval_file == 'wordsim_new':
            self.input_score_tables['classSim'] = pd.read_csv(WORDSIM_CLASS_SIM_FILE)
            self.input_score_tables['JC'] = pd.read_csv(WORDSIM_JC_SIM_FILE)
            self.input_score_tables['topSim'] = pd.read_csv(WORDSIM_TOP_SIM_FILE)
            self.wordsim = evalD.wordsim_df.copy()
        elif eval_file == 'wordsim_old':
            self.input_score_tables['classSim'] = pd.read_csv(WORDSIM_OLD_CLASS_SIM_FILE)
            self.input_score_tables['JC'] = pd.read_csv(WORDSIM_OLD_JC_SIM_FILE)
            self.input_score_tables['topSim'] = pd.read_csv(WORDSIM_OLD_TOP_SIM_FILE)
            self.wordsim = evalD.old_wordsim_df.copy()
        elif eval_file == 'dbpedia_mc_30':
            self.input_score_tables['classSim'] = pd.read_csv(DBPEDIA_MC_30_CLASS_SIM_FILE)
            self.input_score_tables['JC'] = pd.read_csv(DBPEDIA_MC_30_JC_SIM_FILE)
            self.input_score_tables['topSim'] = pd.read_csv(DBPEDIA_MC_30_TOP_SIM_FILE)
            self.wordsim = evalD.dbpedia_mc_30_df.copy()
        elif eval_file == 'dbpedia_rg_65':
            self.input_score_tables['classSim'] = pd.read_csv(DBPEDIA_RG_65_CLASS_SIM_FILE)
            self.input_score_tables['JC'] = pd.read_csv(DBPEDIA_RG_65_JC_SIM_FILE)
            self.input_score_tables['topSim'] = pd.read_csv(DBPEDIA_RG_65_TOP_SIM_FILE)
            self.wordsim = evalD.dbpedia_rg_65_df.copy()
            
#         self.input_score_tables['classSim']['embedding_na'] = self.input_score_tables['classSim']['embedding_cos_sim'].isna()
# #         self.input_score_tables['classSim'] = self.input_score_tables['classSim'][self.input_score_tables['classSim'].word1_kg_id == self.input_score_tables['classSim'].word2_kg_id]
#         self.input_score_tables['JC']['embedding_na'] = self.input_score_tables['JC']['embedding_cos_sim'].isna()
# #         self.input_score_tables['JC'] = self.input_score_tables['JC'][self.input_score_tables['JC'].word1_kg_id == self.input_score_tables['JC'].word2_kg_id]
#         self.input_score_tables['topSim']['embedding_na'] = self.input_score_tables['topSim']['embedding_cos_sim'].isna()
#         self.input_score_tables['topSim'] = self.input_score_tables['topSim'][self.input_score_tables['topSim'].word1_kg_id == self.input_score_tables['topSim'].word2_kg_id]
        
        if embed_dict_master is not None:
            for emb in embed_dict_master:
#                 print(f"Emb: {emb}")
                self.input_score_tables[emb] = self.construct_wsim_tab(embed_dict_master[emb])
                if new_embed_dict_master is not None:
                    self.input_score_tables[emb+'_retrofitted'] = self.construct_wsim_tab(new_embed_dict_master[emb + self.new_embed_suffix])
        self.input_score_tables['average'] = self.get_averaged_dict(exception_cols, False)
        if new_embed_dict_master is not None:
            self.input_score_tables['average_retrofitted'] = self.get_averaged_dict(exception_cols, True)
    
    def construct_wsim_tab(self, embed_dict):
        eval_dataset = self.wordsim.copy()

        eval_dataset['embedding_cos_sim'] = eval_dataset.apply(lambda p: Utils.determine_cos_sim(embed_dict[p['word1_kg_id']], embed_dict[p['word2_kg_id']]) 
                                                   if p['word1_kg_id'] in embed_dict and p['word2_kg_id'] in embed_dict and embed_dict[p['word1_kg_id']].sum() != 0 and embed_dict[p['word2_kg_id']].sum() != 0
                                                   else None, axis=1)
        eval_dataset['embedding_na'] = eval_dataset['embedding_cos_sim'].isna()
#         print(f"Coverage: {len(eval_dataset) - eval_dataset['embedding_cos_sim'].isna().sum()}")
        eval_dataset['embedding_cos_sim'].fillna(eval_dataset['embedding_cos_sim'].mean(skipna=True), inplace=True)
        
        # Scale abs value of cosine similarities to 1,4 strictly
        eval_dataset['embedding_cos_sim'] = eval_dataset['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p))
        
        return eval_dataset
        
    def get_pairwise_dict(self, tab_key):
        if tab_key not in self.input_score_tables:
            raise "Key not present in table"
        return {(row['word1_kg_id'], row['word2_kg_id']): row['embedding_cos_sim'] for _, row in self.input_score_tables[tab_key].iterrows()}
    
    def get_averaged_dict(self, exception_cols, new_embed=False):
        final_dict = defaultdict(list)
        col_list = []
        try:
            for tab_key in set(self.input_score_tables.keys()) - exception_cols:
                if new_embed and not(tab_key.endswith('retrofitted')):
                    continue
                elif not(new_embed) and tab_key.endswith('retrofitted'):
                    continue
                if any([tab_key.startswith(col1) for col1 in exception_cols]):
                    continue
                if tab_key.startswith('average'):
                    continue
                col_list.append(tab_key)
                for _, row in self.input_score_tables[tab_key].iterrows():
                    if row['embedding_na'] == False:
                        final_dict[(row['word1_kg_id'], row['word2_kg_id'])].append(row['embedding_cos_sim'])
                    else:
    #                     print('na embedding was present, hence skipped')
                        pass
        except Exception as exc:
            print(tab_key)
            raise exc
        for key in final_dict:
            final_dict[key] = np.mean(np.array(final_dict[key]))
        print(f"Returning averaged scores from {len(col_list)} algorithms - {col_list}")
        eval_dataset = self.wordsim.copy()

        eval_dataset['embedding_cos_sim'] = eval_dataset.apply(lambda p: final_dict[(p['word1_kg_id'], p['word2_kg_id'])], axis=1)
        eval_dataset['embedding_na'] = eval_dataset['embedding_cos_sim'].isna()
        
        return eval_dataset

## NeighborDatasets

In [8]:
class NeighborDatasets:
    """
    Instance variables:
        - neighbors_dict_master - holds all qnode to neighbor qnode mappings as a dictionary
        - basis_list - list of all keys of the above dictionary
    """
    
    def __init__(self, class_datasets_fetch: bool = False, probase_datasets_fetch: bool = True):
        self.neighbors_dict_master = {}
        
        pbar = tqdm(desc='Neighbor Datasets', leave=False, total = 
                    3
                    + (3 if class_datasets_fetch else 0) 
                    + (1 if probase_datasets_fetch else 0) 
                   )
        
        bert_P279_child_par_df = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)
#         bert_P279_child_par_df_cross_enc = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_filtered_cross_enc.csv')
        bert_P279_siblings_df = pd.read_csv(P279_SIBLINGS_DISTILBERT_COSSIM_FILE)
#         bert_P279_siblings_df_cross_enc = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols_with_desc_dups_removed_cross_enc.csv')
        
        self.neighbors_dict_master['bert_child_par'] = self.fetch_neighbours(bert_P279_child_par_df)
        pbar.update(1)
        self.neighbors_dict_master['bert_siblings'] = self.fetch_neighbours(bert_P279_siblings_df)
        pbar.update(1)
        self.neighbors_dict_master['bert_all'] = self.fetch_neighbours(pd.concat([
                bert_P279_child_par_df, bert_P279_siblings_df
            ]))
        pbar.update(1)
        
#         self.neighbors_dict_master['cross_enc_child_par'] = self.fetch_neighbours(bert_P279_child_par_df)
#         pbar.update(1)
#         self.neighbors_dict_master['cross_enc_siblings'] = self.fetch_neighbours(bert_P279_siblings_df)
#         pbar.update(1)
#         self.neighbors_dict_master['cross_enc_all'] = self.fetch_neighbours(pd.concat([
#                 bert_P279_child_par_df, bert_P279_siblings_df
#             ]))
#         pbar.update(1)
            
        if class_datasets_fetch:
            class_P279_child_par_df = pd.read_csv(P279_CHILD_PAR_CLASSSIM_FILE)
            class_P279_child_par_df['similarity_value'] = class_P279_child_par_df['classSim']
            
            class_P279_siblings_df = pd.read_csv(P279_SIBLINGS_CLASSSIM_FILE)
            class_P279_siblings_df['similarity_value'] = class_P279_siblings_df['classSim']
            
            self.neighbors_dict_master['class_child_par'] = self.fetch_neighbours(class_P279_child_par_df)
            pbar.update(1)
            self.neighbors_dict_master['class_siblings'] = self.fetch_neighbours(class_P279_siblings_df)
            pbar.update(1)
            self.neighbors_dict_master['class_all'] = self.fetch_neighbours(pd.concat([
                    class_P279_child_par_df, class_P279_siblings_df
                ]))
            pbar.update(1)

        if probase_datasets_fetch:
            probase_df = self.process_probase(PROBASE_FINAL_FILE)
            
            self.neighbors_dict_master['probase'] = self.fetch_neighbours(probase_df)
            pbar.update(1)
#             self.neighbors_dict_master['probase+bert_all'] = self.fetch_neighbours(pd.concat([
#                     bert_P279_child_par_df, bert_P279_siblings_df, probase_df
#                 ]))
#             pbar.update(1)
        
        self.basis_list = list(self.neighbors_dict_master.keys())
        
        pbar.close()

        print(f"Fetched neighbour datasets: {self.basis_list}")
    
    def process_probase(self, probase_file_path):
        probase_df = pd.read_csv(probase_file_path)
#         probase_df = probase_df.rename(columns={'n1_final_qnode': 'node1', 'n2_final_qnode': 'node2', 'sim': 'similarity_value'})
        probase_df['similarity_value'] = 0.5 + 0.5 * probase_df['similarity_value']
        
        return probase_df
        
    def fetch_neighbours(self, df):
        neighbours_dict = {}
        for _, row in df.iterrows():
            if row.node1 not in neighbours_dict:
                neighbours_dict[row.node1] = []
            neighbours_dict[row.node1].append((row.node2, row.similarity_value))

            if row.node2 not in neighbours_dict:
                neighbours_dict[row.node2] = []
            neighbours_dict[row.node2].append((row.node1, row.similarity_value))
#         print(max([len(neigh) for neigh in neighbours_dict.values()]))
        
        return neighbours_dict

## EvaluationDatasets

In [9]:
class EvaluationDatasets:
    def __init__(self):
        self.wordsim_df = pd.read_csv(WORDSIM_DF)
        self.wordsim_df['category'] = self.wordsim_df.Avg.apply(Utils.label_samples)
        self.fetch_distribution_stats("Wordsim-353", self.wordsim_df)
        
        self.old_wordsim_df = pd.read_csv(WORDSIM_OLD_FINAL_FILE)
        self.wordsim_df['category'] = self.wordsim_df.Avg.apply(Utils.label_samples)
        self.fetch_distribution_stats("Wordsim-353 OLD", self.old_wordsim_df)
        
        self.dbpedia_mc_30_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)
        self.dbpedia_mc_30_df['category'] = self.dbpedia_mc_30_df.Avg.apply(Utils.label_samples)
        self.fetch_distribution_stats("DBPedia MC 30", self.dbpedia_mc_30_df)
        
        self.dbpedia_rg_65_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)
        self.dbpedia_rg_65_df['category'] = self.dbpedia_rg_65_df.Avg.apply(Utils.label_samples)
        self.fetch_distribution_stats("DBPedia RG 65", self.dbpedia_rg_65_df)
        
#         self.wiki_cs_df = pd.read_csv('../data/wikidata-cs_categorized.csv')
#         self.fetch_distribution_stats("Wikidata CS", self.wiki_cs_df)
        
#         self.concept_net_df = pd.read_csv('../data/kgtk_conceptnet_evaluation.csv')
#         self.fetch_distribution_stats("Concept Net", self.concept_net_df)
        
        self.get_coverage_nodes()
        
    def fetch_distribution_stats(self, name, dataset):
        print(f"Dataset: {name}")
        print(dataset.category.value_counts())
    
    def get_coverage_nodes(self):
        self.coverage = set(
                        self.wordsim_df['word1_kg_id'].to_list() 
                        + self.wordsim_df['word2_kg_id'].to_list() 
                        + self.dbpedia_mc_30_df['word1_kg_id'].to_list()
                        + self.dbpedia_mc_30_df['word2_kg_id'].to_list()
                        + self.dbpedia_rg_65_df['word1_kg_id'].to_list()
                        + self.dbpedia_rg_65_df['word2_kg_id'].to_list())
#                         + self.wiki_cs_df['word1_kg_id'].to_list() 
#                         + self.wiki_cs_df['word2_kg_id'].to_list()
#                         + self.concept_net_df['word1_kg_id'].to_list()
#                         + self.concept_net_df['word2_kg_id'].to_list())

## ResultMetrics

In [10]:
class ResultMetrics:
    
    @classmethod
    def compute_classification_results(cls,
            embed_dict, 
            eval_dataset,
            get_output_values: bool = False,
            old_accuracy = None
            ):
        """
        Inputs:
            - embed_dict - dictionary of qnodes with node embeddings as its values
            - eval_dataset - evaluation dataset as pandas dataframe that must have the 
                following columns for this function to work correctly:
                * word1_kg_id - Qnode of node1 in the evaluation pair
                * word2_kg_id - Qnode of node2 in the evaluation pair
                * category - Category of the evaluation pair. One of the labels: I/U/M
        Outputs:
            - response_dict - Returns a dictionary with the following keys:
                * covered_pairs - Indicates the number of pairs of the evaluation dataset that the 
                    embedding dictionary can cover
                
        """
        response_dict = {}
        
        eval_dataset = eval_dataset.copy()

        missing_words_set = set(
            eval_dataset[eval_dataset.word1_kg_id.apply(lambda p: p not in embed_dict)].word1_kg_id.to_list() 
            + eval_dataset[eval_dataset.word2_kg_id.apply(lambda p: p not in embed_dict)].word2_kg_id.to_list()
        )
        
        response_dict['covered_pairs'] = len(eval_dataset)

        eval_dataset['embedding_cos_sim'] = eval_dataset.apply(lambda p: Utils.determine_cos_sim(embed_dict[p['word1_kg_id']], embed_dict[p['word2_kg_id']]) 
                                                   if p['word1_kg_id'] in embed_dict and p['word2_kg_id'] in embed_dict 
                                                   else None, axis=1)
        
        eval_dataset['embedding_cos_sim'].fillna(eval_dataset['embedding_cos_sim'].mean(skipna=True), inplace=True)
        
        # Scale abs value of cosine similarities to 1,4 strictly
        eval_dataset['embedding_cos_sim'] = eval_dataset['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p))

        response_dict['accuracy'] = 100 * accuracy_score(
                eval_dataset['category'],
                eval_dataset['embedding_cos_sim'].apply(Utils.label_samples)
            )
    
        response_dict['classification_report'] = classification_report(
                eval_dataset['category'], 
                eval_dataset['embedding_cos_sim'].apply(Utils.label_samples), 
                output_dict=True
            )

        response_dict['conf_matrix'] = confusion_matrix(
                eval_dataset['category'], 
                eval_dataset['embedding_cos_sim'].apply(Utils.label_samples), 
                labels=Utils.LABELS
            )
        if 'Avg' in eval_dataset.columns:
            response_dict['KT'] = stats.kendalltau(eval_dataset['Avg'], eval_dataset['embedding_cos_sim']).correlation
            response_dict['SR'] = stats.spearmanr(eval_dataset['Avg'], eval_dataset['embedding_cos_sim']).correlation
            response_dict['RMSE'] = mean_squared_error(eval_dataset['Avg'], eval_dataset['embedding_cos_sim'], squared=False)
        else:
            response_dict['KT'] = None
            response_dict['SR'] = None
            response_dict['RMSE'] = None
        
        if old_accuracy is not None:
            response_dict['increase_acc'] = response_dict['accuracy'] - old_accuracy
        else:
            response_dict['increase_acc'] = None
        
        if get_output_values:
            response_dict['preds'] = eval_dataset['embedding_cos_sim'].apply(Utils.label_samples)

        return response_dict, \
                (response_dict['covered_pairs'],  \
                 response_dict['accuracy'], \
                 response_dict['increase_acc'], \
                 
                 response_dict['classification_report']['I']['precision'],  \
                 response_dict['classification_report']['I']['recall'],  \
                 response_dict['classification_report']['I']['f1-score'],   \
                 
                 response_dict['classification_report']['M']['precision'],  \
                 response_dict['classification_report']['M']['recall'],  \
                 response_dict['classification_report']['M']['f1-score'], \
                 
                 response_dict['classification_report']['U']['precision'],  \
                 response_dict['classification_report']['U']['recall'],  \
                 response_dict['classification_report']['U']['f1-score'], \
                
                 response_dict['KT'], \
                 response_dict['SR'],
                 response_dict['RMSE'])
    
    @classmethod
    def fetch_best_result_for_emb(cls, results_df, emb_col, target_col, iter_col, highest: bool = True):
        opt_value = {}
        for _, row in results_df.iterrows():
            if row[emb_col] not in opt_value:
                opt_value[row[emb_col]] = {'opt_metric': float('-inf') if highest else float('inf'),
                                     'opt_row': [], 'old_row': []}
            if row[iter_col] == 0:
                opt_value[row[emb_col]]['old_row'] = row
            else:
                if (highest and row[target_col] > opt_value[row[emb_col]]['opt_metric']) \
                        or (not(highest) and row[target_col] < opt_value[row[emb_col]]['opt_metric']):
                    opt_value[row[emb_col]]['opt_metric'] = row[target_col]
                    opt_value[row[emb_col]]['opt_row'] = row
        best_results = []
        for emb_key in opt_value:
            best_results.append(opt_value[emb_key]['old_row'])
            best_results.append(opt_value[emb_key]['opt_row'])
        return pd.DataFrame(best_results, columns = results_df.columns)
    
    @classmethod
    def compute_classification_n_regression_stats(cls, ist, suffix, standard_labels=True):
#         q_size = len(evalD.wordsim_df) // 4
        if not(standard_labels):
#             print(f"At most {q_size} rows in each quartile")
#             temp_wordsim_df = ist.wordsim.sort_values(by=['Avg', 'word1_kg_id', 'word2_kg_id'])
#             quantile_sets = {'Q'+str(i+1): set(temp_wordsim_df[q_size*i:q_size*(i+1)].apply(lambda p: (p.word1_kg_id, p.word2_kg_id), axis=1).to_list()) 
#                                  for i in range(4) }
#             wordsim_cats = ist.wordsim.apply(Utils.alt2_label_samples, args=(quantile_sets,), axis=1)
            quantiles = evalD.wordsim_df.Avg.quantile([0, 0.25, 0.5, 0.75, 1]).to_list()
            quantiles[0], quantiles[-1] = float('-inf'), float('inf')
            print(f"Quantiles being used by wordsim: {quantiles}")
            wordsim_cats = ist.wordsim.Avg.apply(Utils.alt_label_samples, args=(quantiles,))
        else:
            wordsim_cats = ist.wordsim.category
        
        eval_df = ist.wordsim.copy()
        if not(standard_labels):
#             eval_df['quartile'] = eval_df.apply(Utils.alt2_label_samples, args=(quantile_sets,), axis=1)
            eval_df['quartile'] = eval_df.Avg.apply(Utils.alt_label_samples, args=(quantiles,))
            
        results = []
        for tab_key in ist.input_score_tables:
#             print(tab_key)
            cosSimPreds_df = ist.input_score_tables[tab_key]
#             print(len(cosSimPreds_df))
            response_dict = {}
            if standard_labels:
                preds = cosSimPreds_df['embedding_cos_sim'].apply(Utils.label_samples)
            else:
                temp_wordsim_df = cosSimPreds_df.sort_values(by=['embedding_cos_sim', 'word1_kg_id', 'word2_kg_id'])
#                 quantile_sets = {'Q'+str(i+1): set(temp_wordsim_df[q_size*i:q_size*(i+1)].apply(lambda p: (p.word1_kg_id, p.word2_kg_id), axis=1).to_list()) 
#                                  for i in range(4) }
                quantiles_emb = cosSimPreds_df['embedding_cos_sim'].quantile([0, 0.25, 0.5, 0.75, 1]).to_list()
                quantiles_emb[0], quantiles_emb[-1] = float('-inf'), float('inf')
                print(f"Quantiles being used by {tab_key}: {quantiles_emb}")
                preds = cosSimPreds_df['embedding_cos_sim'].apply(Utils.alt_label_samples, args=(quantiles_emb,))
            eval_df[tab_key] = cosSimPreds_df['embedding_cos_sim']
            eval_df[tab_key+'_cat'] = preds
#             print(len(wordsim_cats))
#             print(len(preds))
            response_dict['accuracy'] = 100 * accuracy_score(
                    wordsim_cats,
                    preds
                )

            response_dict['classification_report'] = classification_report(
                    wordsim_cats,
                    preds, 
                    output_dict=True
                )

            response_dict['KT'] = stats.kendalltau(cosSimPreds_df['Avg'], cosSimPreds_df['embedding_cos_sim']).correlation
            response_dict['SR'] = stats.spearmanr(cosSimPreds_df['Avg'], cosSimPreds_df['embedding_cos_sim']).correlation
            response_dict['RMSE'] = mean_squared_error(cosSimPreds_df['Avg'], cosSimPreds_df['embedding_cos_sim'], squared=False)
            
            # SVR related stats
            temp_dict = {'basis': '', 'emb': tab_key, 'weightedness': True, 
                'iter_num': 0, 'weight_case': None, 'svm_input': 'score'}
            svr_res = SVMProcedures.execute_supervised_scenario(
                ist.wordsim, temp_dict, ist.get_pairwise_dict(tab_key), 
                {},num_of_splits=10,
                comb_mode=False, SVC_or_SVR='SVR', score_table_mode=True
            )
            
            if standard_labels:
                results.append([tab_key, response_dict['accuracy'], response_dict['classification_report']['macro avg']['precision'], \
                            response_dict['classification_report']['macro avg']['recall'], \
                            response_dict['classification_report']['macro avg']['f1-score'], \
                                \
                            response_dict['classification_report']['I']['precision'], \
                            response_dict['classification_report']['I']['recall'], \
                            response_dict['classification_report']['I']['f1-score'], \
                                \
                            response_dict['classification_report']['M']['precision'], \
                            response_dict['classification_report']['M']['recall'], \
                            response_dict['classification_report']['M']['f1-score'], \
                                \
                            response_dict['classification_report']['U']['precision'], \
                            response_dict['classification_report']['U']['recall'], \
                            response_dict['classification_report']['U']['f1-score'], \
                                \
                            response_dict['KT'], response_dict['SR'], response_dict['RMSE'], svr_res[-2], svr_res[-1], svr_res[-3]])
            else:
                results.append([tab_key, response_dict['accuracy'], response_dict['classification_report']['macro avg']['precision'], \
                            response_dict['classification_report']['macro avg']['recall'], \
                            response_dict['classification_report']['macro avg']['f1-score'], \
                                \
                            response_dict['classification_report']['Q1']['precision'], \
                            response_dict['classification_report']['Q1']['recall'], \
                            response_dict['classification_report']['Q1']['f1-score'], \
                                \
                            response_dict['classification_report']['Q2']['precision'], \
                            response_dict['classification_report']['Q2']['recall'], \
                            response_dict['classification_report']['Q2']['f1-score'], \
                                \
                            response_dict['classification_report']['Q3']['precision'], \
                            response_dict['classification_report']['Q3']['recall'], \
                            response_dict['classification_report']['Q3']['f1-score'], \
                                \
                            response_dict['classification_report']['Q4']['precision'], \
                            response_dict['classification_report']['Q4']['recall'], \
                            response_dict['classification_report']['Q4']['f1-score'], \
                                \
                            response_dict['KT'], response_dict['SR'], response_dict['RMSE'], svr_res[-2], svr_res[-1], svr_res[-3]])
        if standard_labels:
            res_df = pd.DataFrame(results, columns=['algorithm', 'accuracy', 'P', 'R', 'F1', 'I P', 'I R', 'I F1', 'M P', 'M R', 'M F1', 'U P', 'U R', 'U F1', 'Kendall Tau', 'Spearman Rank', 'RMSE', 'SVR Kendall Tau', 'SVR Spearman Rank', 'SVR RMSE'])
        else:
            res_df = pd.DataFrame(results, columns=['algorithm', 'accuracy', 'P', 'R', 'F1', 'Q1 P', 'Q1 R', 'Q1 F1', 'Q2 P', 'Q2 R', 'Q2 F1', 'Q3 P', 'Q3 R', 'Q3 F1', 'Q4 P', 'Q4 R', 'Q4 F1', 'Kendall Tau', 'Spearman Rank', 'RMSE', 'SVR Kendall Tau', 'SVR Spearman Rank', 'SVR RMSE'])
        res_df.to_csv('../data/retrofitting/score_table_algorithms_results.' + suffix + '.' + Utils.today_date + '.csv', index=False)
        return res_df, eval_df
    

## RetrofittingProcedures

In [11]:
class RetrofittingProcedures:
    
    np_label_samples = np.vectorize(Utils.label_samples)
    
    @classmethod
    def retrofit(cls,embed_dict, neighbors_dict, weight_case, weight_assignment=False):
        new_embed_dict = {}
        for word in embed_dict.keys():
            if word in neighbors_dict:
                neighbs = neighbors_dict[word]
                neighbs = list(filter(lambda p: p[0] in embed_dict, neighbs))
                if len(neighbs) == 0:
                    new_embed_dict[word] = embed_dict[word]
                    continue
                if weight_assignment:
                    sum_of_sims = sum([neighb[1] for neighb in neighbs])
                    sum_of_embs = sum([embed_dict[neighb[0]] * float(neighb[1]) for neighb in neighbs])
                else:
                    sum_of_sims = sum([1 for neighb in neighbs])
                    sum_of_embs = sum([embed_dict[neighb[0]] for neighb in neighbs])

                if weight_case == 1:
                    new_embed_dict[word] = (embed_dict[word] * (len(neighbs)) + sum_of_embs) / ((len(neighbs)) + sum_of_sims)
                elif weight_case == 2:
                    new_embed_dict[word] = (embed_dict[word] * (len(neighbs))**2 + sum_of_embs) / ((len(neighbs))**2 + sum_of_sims)
                elif weight_case == 0.5:
                    new_embed_dict[word] = (embed_dict[word] * (len(neighbs))**0.5 + sum_of_embs) / ((len(neighbs))**0.5 + sum_of_sims)
                else:
                    raise
            else:
                new_embed_dict[word] = embed_dict[word]
        return new_embed_dict
    
    @classmethod
    def execute_all_unsupervised_scenarios(cls,
                emb_list, basis_list, 
                embed_dict_master, neigh_dict_master,
                eval_dataset,
                scenario_name: str,
                num_of_iterations: int = 2, 
                weightedness_list: list = [True],
                weight_cases_list: list = [1],
                get_output_values: bool = False,
                prev_new_embed_dict_master = None
            ):
        
        new_embed_dict_master = {}
        responses_dict_master = {}
        results = []
        
        for basis in tqdm(basis_list, desc='Basis', leave=False):
            for emb in tqdm(emb_list, desc='Embedding', leave=False):
                for weightedness in weightedness_list:
                    for weight_case in tqdm(weight_cases_list, desc='Weight Case', leave=False):
                        # Base Reference Initializations and Calculations
                        embed_dict = embed_dict_master[emb]
                        responses_dict, result_values = ResultMetrics.compute_classification_results(
                            embed_dict, eval_dataset, get_output_values=get_output_values, old_accuracy=None)
                        results.append([emb, basis, weight_case, weightedness, 0, 'base', *result_values, 0])
                        old_accuracy = responses_dict['accuracy']
                        
                        for iter_num in tqdm(range(1,num_of_iterations+1), desc='Iteration', leave=False):
                            start_time = time()
                            
                            case_name = emb + '_' + basis + '_' + str(weight_case) + ('_weighted' if weightedness else '_unweighted')
                            
                            if prev_new_embed_dict_master is not None and case_name in prev_new_embed_dict_master:
                                new_embed_dict = prev_new_embed_dict_master[case_name]
                            else:
                                new_embed_dict = cls.retrofit(embed_dict, neigh_dict_master[basis], weight_case, weightedness)
                            
                            responses_dict, result_values = ResultMetrics.compute_classification_results(
                                new_embed_dict, eval_dataset, get_output_values=get_output_values, old_accuracy=old_accuracy)
                            
                            results.append([emb, basis, weight_case, weightedness, iter_num, case_name, \
                                                *result_values, \
                                                time() - start_time
                                            ])
                            
                            new_embed_dict_master[case_name] = embed_dict = new_embed_dict
                            responses_dict_master[case_name] = responses_dict

        #                     if iter_num == num_of_iterations and highestOne:
        #                         case_name = gR[0] + '_' + gR[1] + '_' + str(gR[2]) + '_weighted'
        #                         new_embed_dict_master[case_name] = serializeEmbeddingDict(new_embed_dict_master[case_name])
        #                         highestOne = False
        #                         json.dump(new_embed_dict_master[case_name],open('../data/Master_P279_dataset/embeddings/new_embedding_dict_'+case_name+'.json','w'))
        #                         new_embed_dict_master[case_name] = deserializeEmbeddingDict(new_embed_dict_master[case_name])
#         print(results)
#         ['text_7_props', 'bert_child_par', 1, True, 0, 'base', 344, 56.68604651162791, None, 0.30952380952380953, 
#          0.65, 0.41935483870967744, 0.6289752650176679, 0.8054298642533937, 0.7063492063492064, 
#          0.21052631578947367, 0.038834951456310676, 0.06557377049180328, 0.31127513538205615, 0.4132700578622946]
        resultsDF = pd.DataFrame(results, columns=['Embedding', 'Basis', 'Weight Case', 'Weightedness', 
                                                   'Iteration Num', 'Case Name', \
                                                   'No. of Pairs Covered', 'Accuracy', 'Increase in Accuracy', \
                                                   'I Precision', 'I Recall', 'I F1-Score', \
                                                   'M Precision', 'M Recall', 'M F1-Score', \
                                                   'U Precision', 'U Recall', 'U F1-Score',
                                                   'KT Correlation', 'SpearmanR Correlation', 'RMSE', \
                                                   'Time to Retrofit'])
        resultsDF.to_csv('../data/retrofitting/retro_unsup_results.' + scenario_name + '.'+ Utils.today_date +'.csv', index=False)
#         best_results_df = ResultMetrics.fetch_best_result_for_emb(resultsDF, 'Embedding', 'Accuracy', 'Iteration Num', highest=True)
#         best_results_df.to_csv('../data/retrofitting/retro_unsup_results.' + scenario_name + '.'+ Utils.today_date +'.best.csv', index=False)
        
#         cls.save_needed_embeddings(new_embed_dict_master)
        
        return new_embed_dict_master, responses_dict_master
    
    @classmethod
    def save_all_embeddings(cls, new_embed_dict_master):
        for case_name in new_embed_dict_master:
            json.dump(Utils.serialize_embedding_dict(new_embed_dict_master[case_name]), open(INPUT_EMB_FOLDER_PATH + 'new_embeddings/' + case_name + '.' + Utils.today_date + '.json', 'w'))
    
    @classmethod
    def save_needed_embeddings(cls, new_embed_dict_master):
        for case_name in new_embed_dict_master:
            temp = {key: new_embed_dict_master[case_name][key] for key in new_embed_dict_master[case_name] if key in evalD.coverage}
            new_embed_dict_master[case_name] = temp

## SVMProcedures

In [12]:
class SVMProcedures:
    @classmethod
    def execute_supervised_scenario(cls,
                eval_dataset, case, embed_dict_master, new_embed_dict_master, 
                num_of_splits = 10, 
                comb_mode: bool = False, SVC_or_SVR: str = 'SVC', 
                score_table_mode: bool = False
            ):
        
        X = []        
        
        ################
        # 2 Approaches based on argument: `comb_mode`
        ################
        
        if comb_mode: ########## COMBINATION MODE CODE ####################
            case_name = " & ".join(case['emb']) + '_' + case['basis'] + '_' + str(case['weight_case']) + ('_weighted' if case['weightedness'] else '_unweighted')
            
            for _, row in eval_dataset.iterrows():
                if case['svm_input'] == 'emb':
                    tempX = []
                    for individual_emb in case['emb']:
                        ind_case_name = individual_emb + '_' + case['basis'] + '_' + str(case['weight_case']) + ('_weighted' if case['weightedness'] else '_unweighted')
                        if case['iter_num'] != 0 and ind_case_name not in new_embed_dict_master:
                            return case_name, case, None
                        if case['iter_num'] == 0:
                            if score_table_mode:
                                raise "Not yet implemented"
                            tempX += embed_dict_master[individual_emb][row['word1_kg_id']].tolist() + embedDictMaster[individual_emb][row['word2_kg_id']].tolist()
                        else:
                            tempX += new_embed_dict_master[ind_case_name][row['word1_kg_id']].tolist() + newEmbedDictMaster[ind_case_name][row['word2_kg_id']].tolist()
                    X.append(tempX)
                else:
                    tempX = []
                    for individual_emb in case['emb']:
                        ind_case_name = individual_emb + '_' + case['basis'] + '_' + str(case['weight_case']) + ('_weighted' if case['weightedness'] else '_unweighted')
                        if case['iter_num'] != 0 and ind_case_name not in new_embed_dict_master:
                            return case_name, case, None
                        if case['iter_num'] == 0:
                            if score_table_mode:
                                raise "Not yet implemented"
                            tempX.append(abs(Utils.determine_cos_sim(
                                embed_dict_master[individual_emb][row['word1_kg_id']], 
                                embed_dict_master[individual_emb][row['word2_kg_id']]
                            )))
                        else:
                            tempX.append(abs(Utils.determine_cos_sim(
                                new_embed_dict_master[ind_case_name][row['word1_kg_id']],
                                new_embed_dict_master[ind_case_name][row['word2_kg_id']]
                            )))
                    X.append(tempX)

        else: ########## NON-COMBINATION MODE CODE ####################
            try:
                case_name = case['emb'] + '_' + case['basis'] + '_' + str(case['weight_case']) + ('_weighted' if case['weightedness'] else '_unweighted')
                if case['iter_num'] != 0 and case_name not in new_embed_dict_master:
                    return case_name, case, None
                for _, row in eval_dataset.iterrows():
                    if case['svm_input'] == 'emb':
                        if case['iter_num'] == 0:
                            if score_table_mode:
                                raise "Not yet implemented"
                            X.append(embed_dict_master[case['emb']][row['word1_kg_id']].tolist() + embed_dict_master[case['emb']][row['word2_kg_id']].tolist())
                        else:
                            X.append(new_embed_dict_master[case_name][row['word1_kg_id']].tolist() + new_embed_dict_master[case_name][row['word2_kg_id']].tolist())
                    else:
                        if case['iter_num'] == 0:
                            if score_table_mode:
                                X.append(embed_dict_master[(row['word1_kg_id'], row['word2_kg_id'])])
                            else:
                                X.append(abs(Utils.determine_cos_sim(
                                        embed_dict_master[case['emb']][row['word1_kg_id']], 
                                        embed_dict_master[case['emb']][row['word2_kg_id']]
                                    )))
                        else:
                            X.append(abs(Utils.determine_cos_sim(
                                    new_embed_dict_master[case_name][row['word1_kg_id']],
                                    new_embed_dict_master[case_name][row['word2_kg_id']]
                                )))
            except Exception as err:
                print(case_name)
                raise err
                    
        X = pd.DataFrame(X)
        
        ################
        # 2 Approaches based on argument: `SVC_or_SVR`
        ################
        
        # Target split depending on SVC or SVM
        if SVC_or_SVR == 'SVC':
            Y = eval_dataset['category']
        elif SVC_or_SVR == 'SVR':
            if 'Avg' not in eval_dataset.columns:
                raise ValueError("Avg column not present in the provided eval_dataset")
            Y = (eval_dataset['Avg'] - 1) / 3
        else:
            raise ValueError("Invalid SVC_or_SVR provided")
        
        if SVC_or_SVR == 'SVC':
            skf = StratifiedKFold(n_splits=num_of_splits, random_state=19, shuffle=True)
            X_train_splits, X_test_splits, Y_train_splits, Y_test_splits = [], [], [], []
            for train_index, test_index in skf.split(X, Y):
                X_train_splits.append(X.iloc[train_index])
                X_test_splits.append(X.iloc[test_index])
                Y_train_splits.append(Y.iloc[train_index])
                Y_test_splits.append(Y.iloc[test_index])
        elif SVC_or_SVR == 'SVR':
            skf = KFold(n_splits=num_of_splits, random_state=19, shuffle=True)
            X_train_splits, X_test_splits, Y_train_splits, Y_test_splits = [], [], [], []
            for train_index, test_index in skf.split(X, Y):
                X_train_splits.append(X.iloc[train_index])
                X_test_splits.append(X.iloc[test_index])
                Y_train_splits.append(Y.iloc[train_index])
                Y_test_splits.append(Y.iloc[test_index])

        preds = []
        
        # Classifier/Regressor training depending on SVC or SVM
        if SVC_or_SVR == 'SVC':
            for X_train1, Y_train1, X_test1, Y_test1 in zip(X_train_splits, Y_train_splits, X_test_splits, Y_test_splits):
                clf = make_pipeline(StandardScaler(), SVC(gamma='auto', random_state=100, max_iter=100))
                clf.fit(X_train1, Y_train1)
                preds.append(clf.predict(X_test1))
                
            acc, f1_score = 0, 0
            for pred, Y_test1 in zip(preds, Y_test_splits):
                acc += accuracy_score(pred, Y_test1)
                f1_score += classification_report(
                    Y_test1,
                    pred, 
                    output_dict=True
                )['macro avg']['f1-score']

            return case_name, *list(case.values()), acc/num_of_splits, f1_score/num_of_splits
        
        elif SVC_or_SVR == 'SVR':
            for X_train1, Y_train1, X_test1, Y_test1 in zip(X_train_splits, Y_train_splits, X_test_splits, Y_test_splits):
                clf = make_pipeline(StandardScaler(), SVR(gamma='auto', max_iter=100))
                clf.fit(X_train1, Y_train1)
                preds.append(clf.predict(X_test1))
            
            acc = 0
            ktCorr = 0
            spearmanR = 0
            for pred, Y_test1 in zip(preds, Y_test_splits):
                acc += mean_squared_error(pred * 3 + 1, Y_test1 * 3 + 1, squared=False)
                ktCorr += stats.kendalltau(Y_test1 * 3 + 1, pred * 3 + 1).correlation
                spearmanR += stats.spearmanr(Y_test1 * 3 + 1, pred * 3 + 1).correlation
                
            return case_name, *list(case.values()), acc/num_of_splits, ktCorr/num_of_splits, spearmanR/num_of_splits
    
    @classmethod
    def execute_all_supervised_scenarios(cls,
                emb_list, basis_list, embed_dict_master, new_embed_dict_master, 
                eval_dataset, 
                scenario_name: str,
                num_of_splits = 10,
                comb_mode: bool = False, SVC_or_SVR: str = 'SVC', 
                num_of_iterations = 2,
                num_of_jobs = 1
            ):
        
        if not(comb_mode):
            svm_cases_list = []
            for basis in basis_list:
                for emb in emb_list:
                    for weightedness in [True]:
                        for iter_num in range(0,num_of_iterations+1):
                            for weight_case in [1]:
                                for svm_input in ['score']:
                                    temp_dict = {'basis': basis, 'emb': emb, 'weightedness': weightedness, 
                                                'iter_num': iter_num, 'weight_case': weight_case, 'svm_input': svm_input}
                                    svm_cases_list.append(temp_dict) 
        else:
            svm_cases_list = []
            for basis in basis_list:
                for emb in emb_list:
                    for weightedness in [True]:
                        for iter_num in range(0,num_of_iterations+1):
                            for weight_case in [1]:
                                for svm_input in ['score']:
                                    for i in range(1,len(emb_list)+1):
                                        for emb_comb in combinations(emb_list, i):
                                            temp_dict = {'basis': basis, 'emb': emb_comb, 'weightedness': weightedness, 
                                                        'iter_num': iter_num, 'weight_case': weight_case, 'svm_input': svm_input}
                                            svm_cases_list.append(temp_dict) 

        results = Parallel(n_jobs=num_of_jobs)(delayed(cls.execute_supervised_scenario)(
                eval_dataset, caseDict, embed_dict_master, 
                new_embed_dict_master,num_of_splits,
                comb_mode, SVC_or_SVR
            ) for caseDict in tqdm(svm_cases_list))
        
        if SVC_or_SVR == 'SVC':
            results_df = pd.DataFrame(results, columns=['Case Name','Basis','Embedding','Weightedness', 'Iteration Num', 'Weight Case', 'Technique','Accuracy', 'F1'])
#             best_results_df = ResultMetrics.fetch_best_result_for_emb(results_df, 'Embedding', 'Accuracy', 'Iteration Num', highest=True)
#             best_results_df.to_csv('../data/retrofitting/retro_SVC_results.' + scenario_name + '.'+ Utils.today_date +'.best.csv', index=False)
        else:
            results_df = pd.DataFrame(results, columns=['Case Name','Basis','Embedding','Weightedness', 'Iteration Num', 'Weight Case', 'Technique','MSE', 'KT Correlation', 'SR Correlation'])
#             best_results_df = ResultMetrics.fetch_best_result_for_emb(results_df, 'Embedding', 'MSE', 'Iteration Num', highest=False)
#             best_results_df.to_csv('../data/retrofitting/retro_SVR_results.' + scenario_name + '.'+ Utils.today_date +'.best.csv', index=False)
            
        results_df.to_csv('../data/retrofitting/retro_SVM_results.' + scenario_name + '.'+ Utils.today_date +'.csv', index=False)

# Scratch Pad

In [31]:
evalD.dbpedia_rg_65_df.category.value_counts()

M    26
U     6
I     2
Name: category, dtype: int64

In [32]:
len(evalD.dbpedia_rg_65_df), len(set(evalD.dbpedia_rg_65_df.word1_kg_id.to_list() + evalD.dbpedia_rg_65_df.word2_kg_id.to_list()))

(34, 31)

In [33]:
bert_P279_child_par_df = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)
#         bert_P279_child_par_df_cross_enc = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_filtered_cross_enc.csv')
bert_P279_siblings_df = pd.read_csv(P279_SIBLINGS_DISTILBERT_COSSIM_FILE)

In [35]:
len(bert_P279_siblings_df)

785418

In [36]:
probase_df = pd.read_csv(PROBASE_FINAL_PATH)

NameError: name 'PROBASE_FINAL_PATH' is not defined

# The Master Controller

In [14]:
%%time
new_embed_dict_master, responses_dict_master = {}, {}

# # Load all supporting files
evalD = EvaluationDatasets()
inp = InputEmbeddings()
basis = NeighborDatasets()

Dataset: Wordsim-353
M    220
U    103
I     11
Name: category, dtype: int64
Dataset: Wordsim-353 OLD
M    280
U     44
I     25
Name: category, dtype: int64
Dataset: DBPedia MC 30
M    11
U     4
I     1
Name: category, dtype: int64
Dataset: DBPedia RG 65
M    26
U     6
I     2
Name: category, dtype: int64


Input Embeddings:   0%|          | 0/8 [00:00<?, ?it/s]

OG Coverage of text_7_props: 334
Added 0 corrections to text_7_props
OG Coverage of complex: 334
Added 0 corrections to complex
OG Coverage of transe: 334
Added 0 corrections to transe
OG Coverage of abstract_first_sent: 334
Added 0 corrections to abstract_first_sent
OG Coverage of labels: 334
Added 0 corrections to labels
OG Coverage of labels_n_desc: 334
Added 0 corrections to labels_n_desc
OG Coverage of has_h: 327
Added 11 corrections to has_h
OG Coverage of has_s: 226
Added 96 corrections to has_s
Embedding: text_7_props, Size: 241696, Length: 1024
Embedding: complex, Size: 241698, Length: 100
Embedding: transe, Size: 241698, Length: 100
Embedding: abstract_first_sent, Size: 241698, Length: 768
Embedding: labels, Size: 241698, Length: 768
Embedding: labels_n_desc, Size: 241698, Length: 768
Embedding: has_h, Size: 166212, Length: 200
Embedding: has_s, Size: 117089, Length: 200
Fetched all input embeddings


Neighbor Datasets:   0%|          | 0/4 [00:00<?, ?it/s]

Fetched neighbour datasets: ['bert_child_par', 'bert_siblings', 'bert_all', 'probase']
CPU times: user 10min 14s, sys: 47.4 s, total: 11min 1s
Wall time: 11min 28s


In [100]:
inp_tsne = ReducedInputEmbeddings(inp.embed_dict_master, 100)
conc_emb_dict = inp_tsne.generate_concatenated_embedding_dict(list(set(inp.embed_dict_master.keys()) - set(['labels', 'labels_n_desc'])))

  0%|          | 0/8 [00:00<?, ?it/s]

In [101]:
json.dump(Utils.serialize_embedding_dict(conc_emb_dict), open(INPUT_EMB_FOLDER_PATH + 'concatenated_orig_embedding_dict.json', 'w'))

In [102]:
responses_dict, _ = ResultMetrics.compute_classification_results(
                            conc_emb_dict, evalD.old_wordsim_df, get_output_values=False, old_accuracy=None)
responses_dict

{'covered_pairs': 349,
 'accuracy': 42.693409742120345,
 'classification_report': {'I': {'precision': 0.8888888888888888,
   'recall': 0.32,
   'f1-score': 0.47058823529411764,
   'support': 25},
  'M': {'precision': 0.8347107438016529,
   'recall': 0.3607142857142857,
   'f1-score': 0.5037406483790524,
   'support': 280},
  'U': {'precision': 0.182648401826484,
   'recall': 0.9090909090909091,
   'f1-score': 0.30418250950570336,
   'support': 44},
  'accuracy': 0.4269340974212034,
  'macro avg': {'precision': 0.6354160115056753,
   'recall': 0.5299350649350649,
   'f1-score': 0.4261704643929578,
   'support': 349},
  'weighted avg': {'precision': 0.756383266954299,
   'recall': 0.4269340974212034,
   'f1-score': 0.47620664139466634,
   'support': 349}},
 'conf_matrix': array([[  8,  16,   1],
        [  1, 101, 178],
        [  0,   4,  40]]),
 'KT': 0.4387071263112683,
 'SR': 0.6180061306656681,
 'RMSE': 0.8573774932552949,
 'increase_acc': None}

In [103]:
responses_dict, _ = ResultMetrics.compute_classification_results(
                            conc_emb_dict, evalD.wordsim_df, get_output_values=False, old_accuracy=None)
responses_dict

{'covered_pairs': 334,
 'accuracy': 53.293413173652695,
 'classification_report': {'I': {'precision': 0.0,
   'recall': 0.0,
   'f1-score': 0.0,
   'support': 11},
  'M': {'precision': 0.782608695652174,
   'recall': 0.4090909090909091,
   'f1-score': 0.537313432835821,
   'support': 220},
  'U': {'precision': 0.4018264840182648,
   'recall': 0.8543689320388349,
   'f1-score': 0.5465838509316769,
   'support': 103},
  'accuracy': 0.5329341317365269,
  'macro avg': {'precision': 0.3948117265568129,
   'recall': 0.42115328037658134,
   'f1-score': 0.3612990945891659,
   'support': 334},
  'weighted avg': {'precision': 0.6394073080759268,
   'recall': 0.5329341317365269,
   'f1-score': 0.5224763229636028,
   'support': 334}},
 'conf_matrix': array([[  0,  10,   1],
        [  0,  90, 130],
        [  0,  15,  88]]),
 'KT': 0.43650705554014996,
 'SR': 0.5871457253915232,
 'RMSE': 0.706830860671417,
 'increase_acc': None}

In [125]:
responses_dict, _ = ResultMetrics.compute_classification_results(
                            conc_emb_dict, evalD.dbpedia_mc_30_df, get_output_values=False, old_accuracy=None)
responses_dict

{'covered_pairs': 16,
 'accuracy': 31.25,
 'classification_report': {'I': {'precision': 0.0,
   'recall': 0.0,
   'f1-score': 0.0,
   'support': 1},
  'M': {'precision': 0.5,
   'recall': 0.2727272727272727,
   'f1-score': 0.3529411764705882,
   'support': 11},
  'U': {'precision': 0.2,
   'recall': 0.5,
   'f1-score': 0.28571428571428575,
   'support': 4},
  'accuracy': 0.3125,
  'macro avg': {'precision': 0.2333333333333333,
   'recall': 0.25757575757575757,
   'f1-score': 0.21288515406162464,
   'support': 16},
  'weighted avg': {'precision': 0.39375,
   'recall': 0.3125,
   'f1-score': 0.31407563025210083,
   'support': 16}},
 'conf_matrix': array([[0, 1, 0],
        [0, 3, 8],
        [0, 2, 2]]),
 'KT': 0.25,
 'SR': 0.4088235294117647,
 'RMSE': 0.9704384065378439,
 'increase_acc': None}

In [126]:
responses_dict, _ = ResultMetrics.compute_classification_results(
                            conc_emb_dict, evalD.dbpedia_rg_65_df, get_output_values=False, old_accuracy=None)
responses_dict

{'covered_pairs': 34,
 'accuracy': 44.11764705882353,
 'classification_report': {'I': {'precision': 0.0,
   'recall': 0.0,
   'f1-score': 0.0,
   'support': 2},
  'M': {'precision': 0.7692307692307693,
   'recall': 0.38461538461538464,
   'f1-score': 0.5128205128205128,
   'support': 26},
  'U': {'precision': 0.23809523809523808,
   'recall': 0.8333333333333334,
   'f1-score': 0.37037037037037035,
   'support': 6},
  'accuracy': 0.4411764705882353,
  'macro avg': {'precision': 0.33577533577533575,
   'recall': 0.40598290598290604,
   'f1-score': 0.2943969610636277,
   'support': 34},
  'weighted avg': {'precision': 0.6302521008403361,
   'recall': 0.4411764705882353,
   'f1-score': 0.457516339869281,
   'support': 34}},
 'conf_matrix': array([[ 0,  2,  0],
        [ 0, 10, 16],
        [ 0,  1,  5]]),
 'KT': 0.3038438086632597,
 'SR': 0.43203179862117314,
 'RMSE': 0.8817551071220816,
 'increase_acc': None}

In [16]:
subset_keys = ['abstract_first_sent', 'has_h']
temp_dict = {}
for key in subset_keys:
    temp_dict[key] = copy.deepcopy(inp.embed_dict_master[key])
inp_tsne = ReducedInputEmbeddings(temp_dict, 100)
conc_emb_dict = inp_tsne.generate_concatenated_embedding_dict(subset_keys)

  0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
responses_dict, _ = ResultMetrics.compute_classification_results(
                            conc_emb_dict, evalD.old_wordsim_df, get_output_values=False, old_accuracy=None)
responses_dict

{'covered_pairs': 349,
 'accuracy': 60.458452722063036,
 'classification_report': {'I': {'precision': 0.7272727272727273,
   'recall': 0.32,
   'f1-score': 0.4444444444444444,
   'support': 25},
  'M': {'precision': 0.8858695652173914,
   'recall': 0.5821428571428572,
   'f1-score': 0.7025862068965518,
   'support': 280},
  'U': {'precision': 0.2597402597402597,
   'recall': 0.9090909090909091,
   'f1-score': 0.40404040404040403,
   'support': 44},
  'accuracy': 0.6045845272206304,
  'macro avg': {'precision': 0.6242941840767928,
   'recall': 0.6037445887445888,
   'f1-score': 0.5170236851271334,
   'support': 349},
  'weighted avg': {'precision': 0.7955698219806854,
   'recall': 0.6045845272206304,
   'f1-score': 0.6464556642404682,
   'support': 349}},
 'conf_matrix': array([[  8,  17,   0],
        [  3, 163, 114],
        [  0,   4,  40]]),
 'KT': 0.449925164929108,
 'SR': 0.6314889524026878,
 'RMSE': 0.7098851741322992,
 'increase_acc': None}

In [18]:
responses_dict, _ = ResultMetrics.compute_classification_results(
                            conc_emb_dict, evalD.wordsim_df, get_output_values=False, old_accuracy=None)
responses_dict

{'covered_pairs': 334,
 'accuracy': 67.96407185628742,
 'classification_report': {'I': {'precision': 0.0,
   'recall': 0.0,
   'f1-score': 0.0,
   'support': 11},
  'M': {'precision': 0.8202247191011236,
   'recall': 0.6636363636363637,
   'f1-score': 0.7336683417085428,
   'support': 220},
  'U': {'precision': 0.525974025974026,
   'recall': 0.7864077669902912,
   'f1-score': 0.6303501945525293,
   'support': 103},
  'accuracy': 0.6796407185628742,
  'macro avg': {'precision': 0.44873291502504986,
   'recall': 0.48334804354221833,
   'f1-score': 0.4546728454203574,
   'support': 334},
  'weighted avg': {'precision': 0.7024693499328499,
   'recall': 0.6796407185628742,
   'f1-score': 0.6776440275891913,
   'support': 334}},
 'conf_matrix': array([[  0,  10,   1],
        [  2, 146,  72],
        [  0,  22,  81]]),
 'KT': 0.48777318852355533,
 'SR': 0.6543303283611083,
 'RMSE': 0.5724958150747688,
 'increase_acc': None}

In [19]:
responses_dict, _ = ResultMetrics.compute_classification_results(
                            conc_emb_dict, evalD.dbpedia_mc_30_df, get_output_values=False, old_accuracy=None)
responses_dict

{'covered_pairs': 16,
 'accuracy': 62.5,
 'classification_report': {'I': {'precision': 0.0,
   'recall': 0.0,
   'f1-score': 0.0,
   'support': 1},
  'M': {'precision': 0.7272727272727273,
   'recall': 0.7272727272727273,
   'f1-score': 0.7272727272727273,
   'support': 11},
  'U': {'precision': 0.4,
   'recall': 0.5,
   'f1-score': 0.4444444444444445,
   'support': 4},
  'accuracy': 0.625,
  'macro avg': {'precision': 0.3757575757575758,
   'recall': 0.4090909090909091,
   'f1-score': 0.39057239057239057,
   'support': 16},
  'weighted avg': {'precision': 0.6,
   'recall': 0.625,
   'f1-score': 0.6111111111111112,
   'support': 16}},
 'conf_matrix': array([[0, 1, 0],
        [0, 8, 3],
        [0, 2, 2]]),
 'KT': 0.45,
 'SR': 0.5852941176470587,
 'RMSE': 0.7296263062335091,
 'increase_acc': None}

In [20]:
responses_dict, _ = ResultMetrics.compute_classification_results(
                            conc_emb_dict, evalD.dbpedia_rg_65_df, get_output_values=False, old_accuracy=None)
responses_dict

{'covered_pairs': 34,
 'accuracy': 64.70588235294117,
 'classification_report': {'I': {'precision': 0.0,
   'recall': 0.0,
   'f1-score': 0.0,
   'support': 2},
  'M': {'precision': 0.8181818181818182,
   'recall': 0.6923076923076923,
   'f1-score': 0.7500000000000001,
   'support': 26},
  'U': {'precision': 0.3333333333333333,
   'recall': 0.6666666666666666,
   'f1-score': 0.4444444444444444,
   'support': 6},
  'accuracy': 0.6470588235294118,
  'macro avg': {'precision': 0.38383838383838387,
   'recall': 0.452991452991453,
   'f1-score': 0.3981481481481482,
   'support': 34},
  'weighted avg': {'precision': 0.6844919786096257,
   'recall': 0.6470588235294118,
   'f1-score': 0.6519607843137256,
   'support': 34}},
 'conf_matrix': array([[ 0,  2,  0],
        [ 0, 18,  8],
        [ 0,  2,  4]]),
 'KT': 0.40750816691307773,
 'SR': 0.5159318543137886,
 'RMSE': 0.7176324851002853,
 'increase_acc': None}

In [63]:
# res_df[res_df.abstract_first_sent >= 3.9]

In [154]:
%%time
start_master_time = time()

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_new')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_quantiles', standard_labels=False)
res_df.to_csv('../data/retrofitting/wordsim_quantile_analysis.'+ Utils.today_date +'.csv', index=False)

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_new')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_orig', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.'+ Utils.today_date +'.csv', index=False)

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_old')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_old', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.wsim_old.'+ Utils.today_date +'.csv', index=False)

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='dbpedia_mc_30')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'dbpedia_mc_30', standard_labels=True)
res_df.to_csv('../data/retrofitting/dbpedia_mc_30_all_algo_scores.'+ Utils.today_date +'.csv', index=False)

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='dbpedia_rg_65')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'dbpedia_rg_65', standard_labels=True)
res_df.to_csv('../data/retrofitting/dbpedia_rg_65_all_algo_scores.'+ Utils.today_date +'.csv', index=False)


# Wordsim executions
new_embed_dict_master['wordsim'], responses_dict_master['wordsim'] = RetrofittingProcedures.execute_all_unsupervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, basis.neighbors_dict_master, 
                                                                            evalD.wordsim_df, "wordsim_ind")

print("Analysed wordsim_ind")

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_new', new_embed_dict_master=new_embed_dict_master['wordsim'])
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_orig_retro', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.wsim_orig_retro.'+ Utils.today_date +'.csv', index=False)

SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, 
                                            new_embed_dict_master['wordsim'], 
                                            evalD.wordsim_df, "SVC_Wordsim",
                                            comb_mode = False, SVC_or_SVR = 'SVC')
print("Analysed SVC_Wordsim")

SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, 
                                            new_embed_dict_master['wordsim'], 
                                            evalD.wordsim_df, "SVR_Wordsim",
                                            comb_mode = False, SVC_or_SVR = 'SVR')
print("Analysed SVR_Wordsim")

# RetrofittingProcedures.save_all_embeddings(new_embed_dict_master['wordsim'])

# SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, 
#                                             new_embed_dict_master['wordsim'], 
#                                             evalD.wordsim_df, "SVR_Wordsim",
#                                             comb_mode = True, SVC_or_SVR = 'SVR')
# print("Analysed SVR_Wordsim combinatrics")

# new_embed_dict_master, responses_dict_master = {}, {}



new_embed_dict_master['dbpedia_mc_30'], responses_dict_master['dbpedia_mc_30'] = RetrofittingProcedures.execute_all_unsupervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, basis.neighbors_dict_master, 
                                                                            evalD.dbpedia_mc_30_df, "dbpedia_mc_30_ind", prev_new_embed_dict_master=new_embed_dict_master['wordsim'])
print("Analysed dbpedia_mc_30_ind")
SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, 
                                            new_embed_dict_master['dbpedia_mc_30'], 
                                            evalD.dbpedia_mc_30_df, "SVR_dbpedia_mc_30",
                                            comb_mode = False, SVC_or_SVR = 'SVR')
print("Analysed SVR_dbpedia_mc_30")

new_embed_dict_master['dbpedia_rg_65'], responses_dict_master['dbpedia_rg_65'] = RetrofittingProcedures.execute_all_unsupervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, basis.neighbors_dict_master, 
                                                                            evalD.dbpedia_rg_65_df, "dbpedia_rg_65", prev_new_embed_dict_master=new_embed_dict_master['wordsim'])
print("Analysed dbpedia_rg_65_ind")
SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, 
                                            new_embed_dict_master['dbpedia_rg_65'], 
                                            evalD.dbpedia_rg_65_df, "SVR_dbpedia_rg_65",
                                            comb_mode = False, SVC_or_SVR = 'SVR')
print("Analysed SVR_dbpedia_rg_65")
                                                                                                                               
                                                                                                                               
# # Wiki CS executions
# new_embed_dict_master['wiki_cs'], responses_dict_master['wiki_cs'] = RetrofittingProcedures.execute_all_unsupervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, basis.neighbors_dict_master, 
#                                                           evalD.wiki_cs_df, "wiki_cs_ind")
# print("Analysed wiki_cs_ind")

# SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master,
#                                             new_embed_dict_master['wiki_cs'],
#                                             evalD.wordsim_df, "SVC_Wiki_CS",
#                                             comb_mode = False, SVC_or_SVR = 'SVC')
# print("Analysed SVC_Wiki_CS")

                                                                                                                               
                                                                                                                               
                                                                                                                               
# # Conceptnet executions
# new_embed_dict_master['conceptnet'], responses_dict_master['conceptnet'] = RetrofittingProcedures.execute_all_unsupervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, basis.neighbors_dict_master, 
#                                                           evalD.concept_net_df, "concept_net_ind")
# print("Analysed concept_net_ind")

# SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master,
#                                             new_embed_dict_master['conceptnet'],
#                                             evalD.wordsim_df, "SVC_Conceptnet",
#                                             comb_mode = False, SVC_or_SVR = 'SVC')
# print("Analysed SVC_Conceptnet")

# print(f"Time taken for end-to-end execution: {time() - start_master_time}s")

Fetching wordsim_new wordsim score tables and eval file
Returning averaged scores from 9 algorithms - ['has_h', 'complex', 'classSim', 'topSim', 'transe', 'has_s', 'JC', 'text_7_props', 'abstract_first_sent']
Quantiles being used by wordsim: [-inf, 2.8, 3.0, 3.6, inf]
Quantiles being used by classSim: [-inf, 3.3613822255703725, 3.809400978521772, 3.957763635353713, inf]
Quantiles being used by JC: [-inf, 2.8351504372328686, 3.503981920452832, 3.7887034134721307, inf]
Quantiles being used by topSim: [-inf, 2.446182786605355, 2.783429730161016, 2.9719271938105174, inf]
Quantiles being used by text_7_props: [-inf, 1.9650170825691686, 2.2566934546159736, 2.559627663334526, inf]
Quantiles being used by complex: [-inf, 2.3927659996609845, 2.599180941250367, 2.8733883637473148, inf]
Quantiles being used by transe: [-inf, 2.617148987716268, 3.003437976104153, 3.330058848001383, inf]
Quantiles being used by abstract_first_sent: [-inf, 2.947749962401879, 3.3795752703868414, 3.68130819302516, inf

Basis:   0%|          | 0/4 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Analysed wordsim_ind
Fetching wordsim_old wordsim score tables and eval file
Returning averaged scores from 9 algorithms - ['has_h', 'classSim', 'topSim', 'text_7_props', 'complex', 'transe', 'has_s', 'JC', 'abstract_first_sent']
Returning averaged scores from 6 algorithms - ['complex_retrofitted', 'text_7_props_retrofitted', 'has_h_retrofitted', 'transe_retrofitted', 'abstract_first_sent_retrofitted', 'has_s_retrofitted']


  0%|          | 0/96 [00:00<?, ?it/s]

Analysed SVC_Wordsim


  0%|          | 0/96 [00:00<?, ?it/s]

Analysed SVR_Wordsim


Basis:   0%|          | 0/4 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Analysed dbpedia_mc_30_ind


  0%|          | 0/96 [00:00<?, ?it/s]

Analysed SVR_dbpedia_mc_30


Basis:   0%|          | 0/4 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Embedding:   0%|          | 0/8 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Weight Case:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

Analysed dbpedia_rg_65_ind


  0%|          | 0/96 [00:00<?, ?it/s]

Analysed SVR_dbpedia_rg_65
CPU times: user 7min, sys: 36 s, total: 7min 36s
Wall time: 7min 35s


In [156]:
ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_new', new_embed_dict_master=new_embed_dict_master['wordsim'])
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_orig_retro', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.wsim_orig_retro.'+ Utils.today_date +'.csv', index=False)

Fetching wordsim_new wordsim score tables and eval file
Returning averaged scores from 9 algorithms - ['has_h', 'classSim', 'topSim', 'text_7_props', 'complex', 'transe', 'has_s', 'JC', 'abstract_first_sent']
Returning averaged scores from 6 algorithms - ['complex_retrofitted', 'text_7_props_retrofitted', 'has_h_retrofitted', 'transe_retrofitted', 'abstract_first_sent_retrofitted', 'has_s_retrofitted']


In [159]:
new_embed_dict_master['wordsim'].keys()

dict_keys(['text_7_props_bert_child_par_1_weighted', 'complex_bert_child_par_1_weighted', 'transe_bert_child_par_1_weighted', 'abstract_first_sent_bert_child_par_1_weighted', 'labels_bert_child_par_1_weighted', 'labels_n_desc_bert_child_par_1_weighted', 'has_h_bert_child_par_1_weighted', 'has_s_bert_child_par_1_weighted', 'text_7_props_bert_siblings_1_weighted', 'complex_bert_siblings_1_weighted', 'transe_bert_siblings_1_weighted', 'abstract_first_sent_bert_siblings_1_weighted', 'labels_bert_siblings_1_weighted', 'labels_n_desc_bert_siblings_1_weighted', 'has_h_bert_siblings_1_weighted', 'has_s_bert_siblings_1_weighted', 'text_7_props_bert_all_1_weighted', 'complex_bert_all_1_weighted', 'transe_bert_all_1_weighted', 'abstract_first_sent_bert_all_1_weighted', 'labels_bert_all_1_weighted', 'labels_n_desc_bert_all_1_weighted', 'has_h_bert_all_1_weighted', 'has_s_bert_all_1_weighted', 'text_7_props_probase_1_weighted', 'complex_probase_1_weighted', 'transe_probase_1_weighted', 'abstract_fi

In [160]:
ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_new', new_embed_dict_master=new_embed_dict_master['wordsim'], new_embed_suffix='_bert_all_1_weighted')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_orig_retro.bert_all', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.wsim_orig_retro.bert_all.'+ Utils.today_date +'.csv', index=False)

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_new', new_embed_dict_master=new_embed_dict_master['wordsim'], new_embed_suffix='_bert_siblings_1_weighted')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_orig_retro.bert_siblings', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.wsim_orig_retro.bert_siblings.'+ Utils.today_date +'.csv', index=False)

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_new', new_embed_dict_master=new_embed_dict_master['wordsim'], new_embed_suffix='_bert_child_par_1_weighted')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_orig_retro.bert_child_par', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.wsim_orig_retro.bert_child_par.'+ Utils.today_date +'.csv', index=False)

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_new', new_embed_dict_master=new_embed_dict_master['wordsim'], new_embed_suffix='_probase_1_weighted')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_orig_retro.probase', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.wsim_orig_retro.probase.'+ Utils.today_date +'.csv', index=False)

Fetching wordsim_new wordsim score tables and eval file
Returning averaged scores from 9 algorithms - ['has_h', 'classSim', 'topSim', 'text_7_props', 'complex', 'transe', 'has_s', 'JC', 'abstract_first_sent']
Returning averaged scores from 6 algorithms - ['complex_retrofitted', 'text_7_props_retrofitted', 'has_h_retrofitted', 'transe_retrofitted', 'abstract_first_sent_retrofitted', 'has_s_retrofitted']
Fetching wordsim_new wordsim score tables and eval file
Returning averaged scores from 9 algorithms - ['has_h', 'classSim', 'topSim', 'text_7_props', 'complex', 'transe', 'has_s', 'JC', 'abstract_first_sent']
Returning averaged scores from 6 algorithms - ['complex_retrofitted', 'text_7_props_retrofitted', 'has_h_retrofitted', 'transe_retrofitted', 'abstract_first_sent_retrofitted', 'has_s_retrofitted']
Fetching wordsim_new wordsim score tables and eval file
Returning averaged scores from 9 algorithms - ['has_h', 'classSim', 'topSim', 'text_7_props', 'complex', 'transe', 'has_s', 'JC', 'a

In [157]:
ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='dbpedia_rg_65', new_embed_dict_master=new_embed_dict_master['wordsim'])
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'dbpedia_rg_65_retro', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.dbpedia_rg_65_retro.'+ Utils.today_date +'.csv', index=False)

ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='dbpedia_mc_30', new_embed_dict_master=new_embed_dict_master['wordsim'])
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'dbpedia_mc_30_retro', standard_labels=True)
res_df.to_csv('../data/retrofitting/wordsim_all_algo_scores.dbpedia_mc_30_retro.'+ Utils.today_date +'.csv', index=False)

Fetching dbpedia_rg_65 wordsim score tables and eval file
Returning averaged scores from 9 algorithms - ['has_h', 'classSim', 'topSim', 'text_7_props', 'complex', 'transe', 'has_s', 'JC', 'abstract_first_sent']
Returning averaged scores from 6 algorithms - ['complex_retrofitted', 'text_7_props_retrofitted', 'has_h_retrofitted', 'transe_retrofitted', 'abstract_first_sent_retrofitted', 'has_s_retrofitted']
Fetching dbpedia_mc_30 wordsim score tables and eval file
Returning averaged scores from 9 algorithms - ['has_h', 'classSim', 'topSim', 'text_7_props', 'complex', 'transe', 'has_s', 'JC', 'abstract_first_sent']
Returning averaged scores from 6 algorithms - ['complex_retrofitted', 'text_7_props_retrofitted', 'has_h_retrofitted', 'transe_retrofitted', 'abstract_first_sent_retrofitted', 'has_s_retrofitted']


In [161]:
ist = InputScoreTables(inp.embed_dict_master, set(['labels', 'labels_n_desc']), eval_file='wordsim_new', new_embed_dict_master=new_embed_dict_master['wordsim'], new_embed_suffix='_bert_child_par_1_weighted')
_,res_df = ResultMetrics.compute_classification_n_regression_stats(ist, 'wsim_quantiles.retrofit', standard_labels=False)
res_df.to_csv('../data/retrofitting/wordsim_quantile_analysis.retrofit.'+ Utils.today_date +'.csv', index=False)

Fetching wordsim_new wordsim score tables and eval file
Returning averaged scores from 9 algorithms - ['has_h', 'classSim', 'topSim', 'text_7_props', 'complex', 'transe', 'has_s', 'JC', 'abstract_first_sent']
Returning averaged scores from 6 algorithms - ['complex_retrofitted', 'text_7_props_retrofitted', 'has_h_retrofitted', 'transe_retrofitted', 'abstract_first_sent_retrofitted', 'has_s_retrofitted']
Quantiles being used by wordsim: [-inf, 2.8, 3.0, 3.6, inf]
Quantiles being used by classSim: [-inf, 3.3613822255703725, 3.809400978521772, 3.957763635353713, inf]
Quantiles being used by JC: [-inf, 2.8351504372328686, 3.503981920452832, 3.7887034134721307, inf]
Quantiles being used by topSim: [-inf, 2.446182786605355, 2.783429730161016, 2.9719271938105174, inf]
Quantiles being used by text_7_props: [-inf, 1.9650170825691686, 2.2566934546159736, 2.559627663334526, inf]
Quantiles being used by text_7_props_retrofitted: [-inf, 1.762183866514523, 2.0080365311430546, 2.3180250624316017, inf]

In [155]:
SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, 
                                            new_embed_dict_master['wordsim'], 
                                            evalD.wordsim_df, "SVC_Wordsim",
                                            comb_mode = False, SVC_or_SVR = 'SVC')
print("Analysed SVC_Wordsim")

SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, 
                                            new_embed_dict_master['dbpedia_mc_30'], 
                                            evalD.dbpedia_mc_30_df, "SVC_dbpedia_mc_30",
                                            comb_mode = False, SVC_or_SVR = 'SVC')
print("Analysed SVC_dbpedia_mc_30")

SVMProcedures.execute_all_supervised_scenarios(inp.emb_list, basis.basis_list, inp.embed_dict_master, 
                                            new_embed_dict_master['dbpedia_rg_65'], 
                                            evalD.dbpedia_rg_65_df, "SVC_dbpedia_rg_65",
                                            comb_mode = False, SVC_or_SVR = 'SVC')
print("Analysed SVC_dbpedia_rg_65")

  0%|          | 0/96 [00:00<?, ?it/s]

Analysed SVC_Wordsim


  0%|          | 0/96 [00:00<?, ?it/s]

Analysed SVC_dbpedia_mc_30


  0%|          | 0/96 [00:00<?, ?it/s]

Analysed SVC_dbpedia_rg_65


# Evaluation Datasets

In [None]:
bioDF = pd.read_csv('../data/pedersen2007measures_table1.csv')

In [None]:
bioDF.head()

In [None]:
bioDFNodesSet = set(bioDF.Term1_kg_id.to_list() + bioDF.Term2_kg_id.to_list())

In [None]:
P279childParNodesSet = set(p279WordSimSeededDF_wabs_text.node1.to_list() + p279WordSimSeededDF_wabs_text.node2.to_list())
P279siblingsNodesSet = set(p279Seeded_SiblingsDF3_wabs_text.node1.to_list() + p279Seeded_SiblingsDF3_wabs_text.node2.to_list())

In [None]:
sum(bioDF.Term1_kg_id.apply(lambda p: p in P279childParNodesSet or p in P279siblingsNodesSet)), \
sum(bioDF.Term2_kg_id.apply(lambda p: p in P279childParNodesSet or p in P279siblingsNodesSet))

In [None]:
probaseNodesSet = set(probDF_Qnodes_DF_WQnodes1_subset.node1.to_list() + probDF_Qnodes_DF_WQnodes1_subset.node2.to_list())

sum(bioDF.Term1_kg_id.apply(lambda p: p in probaseNodesSet)), \
sum(bioDF.Term2_kg_id.apply(lambda p: p in probaseNodesSet))