In [1]:
import numpy
import math
import pandas

In [2]:
# Load the training-split table of paths per relation. Later cells read its
# 'id', 'relation' and 'path' columns.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is where this exact file exists.
ud_train_csv_path = r'C:\Users\jyellin\re_1\Code\path_to_re\train-ud_paths_per_relation.csv'
ud_train = pandas.read_csv(ud_train_csv_path)

In [3]:
# Count of non-null 'id' values for every (relation, path) pair, stored in a
# 'total' column. Rows are ordered by relation (ascending) and, within each
# relation, by total (descending); the index is renumbered from 0.
ud_train_stats_by_relation_and_path = (
    ud_train
    .groupby(['relation', 'path'])['id']
    .count()
    .reset_index()
    .rename(columns={'id': 'total'})
    .sort_values(['relation', 'total'], ascending=[True, False])
    .reset_index(drop=True)
)

# Count of non-null 'id' values per relation, stored in a 'total' column and
# indexed by relation name (ascending).
ud_train_stats_relation = (
    ud_train
    .groupby(['relation'])['id']
    .count()
    .reset_index()
    .rename(columns={'id': 'total'})
    .sort_values(['relation'], ascending=[True])
    .set_index('relation')
)

In [4]:
# Distinct relation and path labels, in order of first appearance in the
# per-(relation, path) statistics table.
relations = ud_train_stats_by_relation_and_path['relation'].unique().tolist()
paths = ud_train_stats_by_relation_and_path['path'].unique().tolist()

# Label -> position lookups, used as row/column coordinates of the count matrix.
relation_to_index = dict(zip(relations, range(len(relations))))
path_to_index = dict(zip(paths, range(len(paths))))

# Per-relation totals and their arithmetic mean (used later as the
# normalisation target and for the minimum-size filter).
totals = ud_train_stats_relation['total'].tolist()
mean_total = numpy.mean(totals)

In [5]:
# Dense relation-by-path count table: one row per relation, one column per path.
relation_to_count_matrix = numpy.zeros(
    (len(relations), len(paths)),
    dtype=numpy.intc,
)

for record in ud_train_stats_by_relation_and_path.itertuples(index=False):
    # A count of exactly 1 is stored as 0, so a path seen only once for a
    # relation looks the same as an unseen one.
    # NOTE(review): presumably this suppresses one-off noise -- confirm intent.
    count = 0 if record.total == 1 else record.total

    row_position = relation_to_index[record.relation]
    column_position = path_to_index[record.path]
    relation_to_count_matrix[row_position, column_position] = count

# Same table with relation names as the index and path strings as the columns.
relation_to_count_df = pandas.DataFrame(
    relation_to_count_matrix,
    index=relations,
    columns=paths,
)

In [6]:
# Create a single data frame from the per-relation totals (column 'total') and
# the relation-to-count data frame, so both can be filtered together.
temp = pandas.concat([ud_train_stats_relation, relation_to_count_df], axis=1)

# Keep only relations whose total is at least 25% of the mean total over all
# relations in the training set.
temp = temp[temp['total'] >= mean_total / 4]

# Normalize counts: divide each row by its own total and rescale by the mean
# total, so every remaining relation is expressed on the same scale. This is a
# vectorised row-wise division (axis=0 aligns the 'total' Series with the row
# index) instead of a Python-level apply over rows. The helper 'total' column
# is dropped afterwards.
temp = temp.div(temp['total'], axis=0).mul(mean_total).drop(columns=['total'])

# Round to the nearest integer (DataFrame.round -- this is not a ceiling) and
# store the result as integers.
relation_to_count_normalized = temp.round(0).astype(int)

In [7]:
# Row labels, column labels and raw values of the normalised count table.
relations = relation_to_count_normalized.index.to_list()
paths = relation_to_count_normalized.columns.to_list()
work = relation_to_count_normalized.to_numpy()

# Weighted Jaccard similarity between every pair of relations a, b:
#     J(a, b) = sum_k min(a[k], b[k]) / sum_k max(a[k], b[k])
# The matrix is pre-filled with NaN; an entry stays NaN when its denominator
# is zero (both rows all zero, or there are no path columns at all), instead
# of raising ZeroDivisionError or emitting a divide-by-zero RuntimeWarning.
jaccard_weighted_matrix = numpy.full((len(relations), len(relations)), numpy.nan)

for i in range(len(relations)):
    # Broadcast row i against the whole table: element-wise min/max with every
    # row at once, then sum over the path axis to get one value per relation.
    numerators = numpy.minimum(work[i], work).sum(axis=1)
    denominators = numpy.maximum(work[i], work).sum(axis=1)

    # Write the ratios straight into row i of the result; positions with a
    # zero denominator are skipped and keep their NaN placeholder.
    numpy.divide(
        numerators,
        denominators,
        out=jaccard_weighted_matrix[i],
        where=denominators != 0,
    )


In [8]:
# Label the similarity matrix with relation names on both axes and write it to
# a CSV file in the current working directory.
df = pandas.DataFrame(
    data=jaccard_weighted_matrix,
    index=relations,
    columns=relations,
)
df.to_csv('./ud-jaccard_weighted_matrix.csv')

In [21]:
# Unpivot the square matrix to long format: the former row labels stay in the
# 'index' column and the former column labels go to a 'relations' column.
single = df.reset_index().melt(id_vars='index', var_name='relations')

In [15]:
# Values of the 'org:alternate_names' column for every relation, largest first.
(
    df['org:alternate_names']
    .sort_values(ascending=False)
)

org:alternate_names                    1.000000
per:title                              0.143508
per:alternate_names                    0.111111
per:stateorprovinces_of_residence      0.068966
org:website                            0.051351
org:member_of                          0.046823
per:employee_of                        0.042857
org:members                            0.038147
org:top_members/employees              0.037267
per:siblings                           0.035813
per:spouse                             0.034314
org:stateorprovince_of_headquarters    0.028571
per:age                                0.027542
org:subsidiaries                       0.027363
per:other_family                       0.025788
per:origin                             0.021845
org:parents                            0.020619
org:country_of_headquarters            0.018970
org:founded_by                         0.016990
org:city_of_headquarters               0.014052
per:cities_of_residence                0

In [20]:
# Values of the 'org:alternate_names' column for every relation, largest first.
# NOTE(review): this cell repeats the identical expression from the previous
# display cell.
(
    df['org:alternate_names']
    .sort_values(ascending=False)
)

org:alternate_names                    1.000000
per:title                              0.143508
per:alternate_names                    0.111111
per:stateorprovinces_of_residence      0.068966
org:website                            0.051351
org:member_of                          0.046823
per:employee_of                        0.042857
org:members                            0.038147
org:top_members/employees              0.037267
per:siblings                           0.035813
per:spouse                             0.034314
org:stateorprovince_of_headquarters    0.028571
per:age                                0.027542
org:subsidiaries                       0.027363
per:other_family                       0.025788
per:origin                             0.021845
org:parents                            0.020619
org:country_of_headquarters            0.018970
org:founded_by                         0.016990
org:city_of_headquarters               0.014052
per:cities_of_residence                0