In [1]:
import numpy
import math
import pandas
# Allow up to 100 rows to be rendered before pandas truncates a displayed frame.
pandas.set_option('display.max_rows', 100)

In [2]:
# Training data, one row per record; the statistics computed from it use the
# 'id', 'relation' and 'path' columns.
# NOTE(review): absolute, machine-specific path - the notebook only runs on a
# machine with this exact directory layout; consider a configurable data dir.
train_paths_csv = r'C:\Users\jyellin\re_1\Code\path_to_re\train-ud_paths_per_relation.csv'
ud_train = pandas.read_csv(train_paths_csv)

In [3]:
# Number of training rows for every (relation, path) combination, listed per
# relation with the most frequent paths first.
ud_train_stats_by_relation_and_path = (
    ud_train
    .groupby(['relation', 'path'])['id']
    .count()
    .reset_index(name='total')
    .sort_values(['relation', 'total'], ascending=[True, False])
    .reset_index(drop=True)
)

# Number of training rows per relation, indexed by relation name.
ud_train_stats_relation = (
    ud_train
    .groupby(['relation'])['id']
    .count()
    .reset_index(name='total')
    .sort_values(['relation'], ascending=[True])
    .set_index('relation')
)

In [4]:
# Distinct labels, in the order they first appear in the statistics table.
relations = ud_train_stats_by_relation_and_path['relation'].unique().tolist()
paths = ud_train_stats_by_relation_and_path['path'].unique().tolist()

# Label -> position lookups (row position for relations, column position for paths).
relation_to_index = dict(zip(relations, range(len(relations))))
path_to_index = dict(zip(paths, range(len(paths))))

# Per-relation row totals and their arithmetic mean.
totals = ud_train_stats_relation['total'].to_list()
mean_total = numpy.mean(totals)

In [5]:
# Dense relation x path matrix of occurrence counts, initialised to zero.
relation_to_count_matrix = numpy.zeros((len(relations), len(paths)), dtype=numpy.intc)

for relation_name, path_name, occurrences in zip(
    ud_train_stats_by_relation_and_path['relation'],
    ud_train_stats_by_relation_and_path['path'],
    ud_train_stats_by_relation_and_path['total'],
):
    # A path seen at most once for a relation is stored as 0.
    relation_to_count_matrix[
        relation_to_index[relation_name], path_to_index[path_name]
    ] = occurrences if occurrences > 1 else 0

# Same counts as a labelled frame: one row per relation, one column per path.
relation_to_count_df = pandas.DataFrame(relation_to_count_matrix, index=relations, columns=paths)
#relation_to_count_matrix = numpy.concatenate((numpy.reshape(totals, (-1,1)),relation_to_count_matrix), axis=1)

In [6]:
# Relations whose number of training rows is below this fraction of the mean
# per-relation total are excluded from the comparison.
MIN_FRACTION_OF_MEAN_TOTAL = 0.25

# Put the per-relation totals and the relation-to-count matrix side by side
# (both frames are indexed by relation name, so the rows line up).
counts_with_totals = pandas.concat([ud_train_stats_relation, relation_to_count_df], axis=1)

# Keep only the relations that have enough training rows.
frequent_relations = counts_with_totals[
    counts_with_totals['total'] >= mean_total * MIN_FRACTION_OF_MEAN_TOTAL
]

# Normalise: rescale every row as if its relation had exactly `mean_total`
# training rows, i.e. count / relation_total * mean_total. Done with vectorised
# column arithmetic rather than a Python-level row-by-row apply.
# `temp` is kept as the name of the un-rounded normalised frame.
temp = (
    frequent_relations
    .drop(columns=['total'])
    .div(frequent_relations['total'], axis=0)
    * mean_total
)

# Round to the nearest whole count (not "up") and store as integers.
relation_to_count_normalized = temp.round(0).astype(int)

In [7]:
#single = df.reset_index().melt('index', var_name='pair2').rename(columns={"index": "pair1"}).set_index(['pair1', 'pair2'])
#single = single[single['value']<1]
#single['value'].sort_values(ascending=False).head(20)

In [8]:
from operator import itemgetter


def _weighted_jaccard(first_counts, second_counts):
    """Return the weighted Jaccard similarity of two count vectors.

    The score is ``sum(min(a_k, b_k)) / sum(max(a_k, b_k))`` over all
    positions ``k``.

    Parameters
    ----------
    first_counts, second_counts : numpy.ndarray
        Equal-length one-dimensional arrays of counts.

    Returns
    -------
    float
        The ratio described above. When the denominator is zero (both vectors
        are all zeros, or they are empty) the score is defined as 0.0 instead
        of producing NaN or raising ZeroDivisionError; a NaN score would make
        the ranking produced by sorting unreliable.
    """
    denominator = numpy.maximum(first_counts, second_counts).sum()
    if denominator == 0:
        return 0.0
    return float(numpy.minimum(first_counts, second_counts).sum() / denominator)


# NOTE: `relations` is rebound here to the rows of the normalised frame, which
# holds only the relations that passed the minimum-total filter.
relations = relation_to_count_normalized.index.to_list()
paths = relation_to_count_normalized.columns.to_list()
work = relation_to_count_normalized.to_numpy()

# Score every unordered pair of distinct relations exactly once.
jaccard_weighted_scores = []
for i, i_relation in enumerate(relations):
    for j in range(i + 1, len(relations)):
        jaccard_weighted_scores.append(
            (i_relation, relations[j], _weighted_jaccard(work[i], work[j]))
        )

# Most similar pairs first; the sort is stable, so ties keep pair order.
jaccard_weighted_scores = sorted(jaccard_weighted_scores, key=itemgetter(2), reverse=True)
jaccard_weighted_scores_df = pandas.DataFrame.from_records(jaccard_weighted_scores, columns=['rel1', 'rel2', 'jaccard'])

In [9]:
# The frame is already ordered by descending Jaccard score, so this displays
# the (up to) 100 most similar relation pairs.
jaccard_weighted_scores_df.head(100)

Unnamed: 0,rel1,rel2,jaccard
0,org:political/religious_affiliation,per:origin,0.472906
1,org:city_of_headquarters,org:stateorprovince_of_headquarters,0.33209
2,per:siblings,per:spouse,0.318182
3,per:parents,per:spouse,0.290909
4,org:country_of_headquarters,per:countries_of_residence,0.270408
5,org:political/religious_affiliation,per:age,0.264808
6,per:countries_of_residence,per:employee_of,0.244
7,per:other_family,per:siblings,0.234973
8,per:age,per:origin,0.232198
9,per:other_family,per:spouse,0.225225
