In [None]:
import numpy as np
import pandas as pd

import codex

In [None]:
# Language codes passed as ``codex.Codex(code=...)`` in the cells below.
codes = [
    'ar',
    'de',
    'en',
    'es',
    'ru',
    'zh',
]

# Dataset size identifiers passed as ``codex.Codex(size=...)`` in the cells below.
sizes = [
    's',
    'm',
    'l',
]

# Data exploration
Load our data and do basic exploratory analysis.

In [None]:
# Print entity/relation counts and split sizes for every dataset size.
for size in sizes:
    Codex = codex.Codex(size=size)

    # Load the three splits one by one, in train / valid / test order.
    train = Codex.split('train')
    valid = Codex.split('valid')
    test = Codex.split('test')
    triples = Codex.triples()

    print(Codex.name())

    print('\t {} entities / {} relations'.format(
        len(Codex.entities()), len(Codex.relations())))

    print('\t {} train / {} validation / {} test'.format(
        len(train), len(valid), len(test)))

    print('\t {} total triples'.format(len(triples)))

Get multilingual coverage.

In [None]:
# For every dataset size and language code, report the percentage of text
# lookups (labels, descriptions, extracts) that return a truthy value.
for size in sizes:
    print(codex.Codex(size=size).name())

    for code in codes:
        Codex = codex.Codex(code=code, size=size)
        entities = Codex.entities()
        relations = Codex.relations()

        # One boolean per lookup: True when the lookup returned something truthy.
        retrieved = []

        for eid in entities:
            # Text attached to the entity itself.
            retrieved += [
                bool(Codex.entity_label(eid)),
                bool(Codex.entity_description(eid)),
                bool(Codex.entity_extract(eid)),
            ]

            # Text attached to each of the entity's types.
            for type_id in Codex.entity_types(eid):
                retrieved += [
                    bool(Codex.entity_type_label(type_id)),
                    bool(Codex.entity_type_description(type_id)),
                    bool(Codex.entity_type_extract(type_id)),
                ]

        # Text attached to each relation (label and description only).
        for rid in relations:
            retrieved += [
                bool(Codex.relation_label(rid)),
                bool(Codex.relation_description(rid)),
            ]

        coverage = np.mean(retrieved) * 100
        print('\t{} coverage: {:.2f}%'.format(code, coverage))

Inspect a single entity.

In [None]:
# Entity ID inspected in this section; later cells read this `eid` global.
eid = 'Q18'

# Print the label of `eid` once per language code in `codes`.
for code in codes:
    Codex = codex.Codex(code=code)
    print(Codex.entity_label(eid))

In [None]:
# Rebind the global `Codex` to an instance built with code='en', then show the
# result of entity_extract for `eid` (defined in an earlier cell) as cell output.
Codex = codex.Codex(code='en')
Codex.entity_extract(eid)

In [None]:
# Same lookup as the previous cell, but with an instance built with code='es'.
# `eid` comes from an earlier cell.
Codex = codex.Codex(code='es')
Codex.entity_extract(eid)

Explore entity types.

In [None]:
# Rebind `Codex` to an instance built with code='en'; the next cells reuse it.
Codex = codex.Codex(code='en')
types = Codex.entity_types(eid)

# One line per type of `eid`: "<entity label> is of type <type label>".
for etype in types:
    print('{} is of type {}'.format(
        Codex.entity_label(eid), Codex.entity_type_label(etype)))

In [None]:
# Type ID to inspect; shown via the `Codex` instance left over from the previous cell.
type_id = 'Q5'
Codex.entity_type_extract(type_id)

Inspect a single relation.

In [None]:
# Relation ID to inspect; uses the `Codex` instance left over from an earlier cell.
rid = 'P530'
# Prints "<label>: <description>"; the `+` requires relation_label to return a str.
print(Codex.relation_label(rid) + ':', Codex.relation_description(rid))

Make sure all entities and relations in valid/test are seen in train.

In [None]:
# For every dataset size, assert that each head, relation and tail appearing
# in the validation or test split also appears in the training split.
for size in sizes:
    Codex = codex.Codex(size=size)
    train = Codex.split('train')
    valid = Codex.split('valid')
    test = Codex.split('test')

    train_entities = set(pd.concat((train['head'], train['tail'])))
    train_relations = set(train['relation'])

    for df in (valid, test):
        # Subset checks replace the per-triple membership assertions.
        assert set(df['head']) <= train_entities
        assert set(df['relation']) <= train_relations
        assert set(df['tail']) <= train_entities

print('Passed all checks successfully')

Explore some of the negative triples.

In [None]:
# Rebind `Codex` to the size='s' instance; the following cells reuse it
# together with `valid_neg` and `test_neg`.
Codex = codex.Codex(size='s')

valid_neg = Codex.negative_split('valid')
test_neg = Codex.negative_split('test')

print('{} validation negatives | {} test negatives'.format(
    len(valid_neg), len(test_neg)))

In [None]:
# Draw up to five negative test triples. The fixed `random_state` makes a
# "Restart & Run All" print the same examples every time, and capping `n`
# avoids the ValueError `sample` raises when asked for more rows than exist.
sample = test_neg.sample(n=min(5, len(test_neg)), random_state=0)

# Print each sampled triple as (head label, relation label, tail label),
# using the `Codex` instance created in the previous cell.
for h, r, t in zip(sample['head'], sample['relation'], sample['tail']):
    print('({}, {}, {})'.format(
        Codex.entity_label(h), Codex.relation_label(r), Codex.entity_label(t)))

Make sure none of the negative triples are actually positives in our dataset, and that every negative uses only entities and relations seen in the training split of the `m` and `l` CoDEx datasets.

In [None]:
def triples_to_set(triples):
    """
    :param triples: pandas DataFrame [head, relation, tail]
    :return: set of distinct (head, relation, tail) tuples
    """
    columns = (triples[name] for name in ('head', 'relation', 'tail'))
    return set(zip(*columns))

# All negative triples loaded in the earlier cell (validation + test), deduplicated.
negs = triples_to_set(pd.concat((valid_neg, test_neg)))

for size in ('m', 'l'):
    Codex = codex.Codex(size=size)
    train = Codex.split('train')
    entities = set(pd.concat((train['head'], train['tail'])))
    relations = set(train['relation'])
    triples = triples_to_set(Codex.triples())

    # No negative may appear among this dataset's triples.
    assert negs.isdisjoint(triples)

    # Every negative must use only entities and relations seen in train.
    assert all(
        h in entities and r in relations and t in entities
        for h, r, t in negs
    )

# Plotting
Plot various properties like degree distribution and top-k entities/relations.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Styles from https://scentellegher.github.io/visualization/2018/10/10/beautiful-bar-plots-matplotlib.html

# Single dark grey used for text, axes edges, axis labels and tick marks.
dark_grey = '#333F4B'

# rcParams entries that all share the dark grey colour.
keys = [
    'axes.edgecolor',
    'axes.labelcolor',
    'xtick.color',
    'ytick.color'
]

plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': 'Helvetica',
    'axes.linewidth': 0.8,
    'legend.frameon': False,
    'text.color': dark_grey,
})
plt.rcParams.update(dict.fromkeys(keys, dark_grey))

# Ten-colour 'RdBu' palette used by the plots below; palplot previews it.
palette = sns.color_palette('RdBu', 10)
sns.palplot(palette)

Plot the degree distribution of CoDEx datasets.

In [None]:
def count_entities(triples):
    """
    Count how often each entity occurs as a head or a tail.

    :param triples: pandas DataFrame [head, relation, tail]
    :return counts: pandas Dataframe [entity, count]
    """
    occurrences = pd.concat((triples['head'], triples['tail']))
    return (
        occurrences.value_counts()
        .rename_axis('entity')
        .reset_index(name='count')
    )
    
    
def count_relations(triples):
    """
    Count how often each relation occurs.

    :param triples: pandas DataFrame [head, relation, tail]
    :return counts: pandas Dataframe [relation, count]
    """
    return (
        triples['relation']
        .value_counts()
        .rename_axis('relation')
        .reset_index(name='count')
    )

In [None]:
def plot_degree_distribution(
        triples, step=5, ax=None, 
        figsize=(5,3), **kwargs):
    """
    Scatter-plot, on symlog axes, how many entities have a degree above
    each threshold 0, step, 2 * step, ...

    :param triples: pandas DataFrame [head, relation, tail]
    :param step: spacing between consecutive degree thresholds on the x-axis
    :param ax: matplotlib Axes to draw on; a new figure is created when None
    :param figsize: size of the new figure, only used when ``ax`` is None
    :param kwargs: extra keyword arguments forwarded to ``ax.scatter``
    :return: the Axes the distribution was drawn on
    """
    counts = count_entities(triples)['count']

    # Degree thresholds, strictly below the maximum degree.
    x = np.arange(0, max(counts), step)

    # Number of entities whose degree is strictly greater than each threshold.
    # Sorting once and binary-searching gives the same values as filtering the
    # whole series for every threshold.
    sorted_counts = np.sort(np.asarray(counts))
    y = len(sorted_counts) - np.searchsorted(sorted_counts, x, side='right')
    y = np.array(y, dtype=np.float32)

    # Compare against None explicitly rather than relying on Axes truthiness.
    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)

    ax.scatter(x, y, alpha=0.5, **kwargs)
    ax.set_xscale('symlog')
    ax.set_yscale('symlog')

    ax.set_xlabel('Entity degree', fontsize=14)
    ax.set_ylabel(r'Cumulative count', fontsize=12)

    ax.tick_params(
        which='both',
        labelsize=12
    )

    ax.spines['top'].set_color('none')
    ax.spines['right'].set_color('none')

    for side in ('left', 'bottom'):
        spine = ax.spines[side]
        # Spine.set_smart_bounds was deprecated in Matplotlib 3.2 and removed
        # in 3.4; only call it where it still exists so newer versions do not
        # fail with AttributeError.
        if hasattr(spine, 'set_smart_bounds'):
            spine.set_smart_bounds(True)

    return ax

In [None]:
# Draw one degree-distribution figure per dataset size.
for size in sizes:
    Codex = codex.Codex(size=size)
    triples = Codex.triples()

    ax = plot_degree_distribution(
        triples, 
        color=palette[-1],
        step=5,
        figsize=(4, 2),
        label=Codex.name(),
    )

    legend = ax.legend(
        frameon=False, 
        fontsize=14,
        handletextpad=0.001,
        labelspacing=0.2,
        loc=3
    )

    for text in legend.get_texts():
        text.set_color(dark_grey)

    # Legend.legendHandles was renamed to Legend.legend_handles in
    # Matplotlib 3.7 and the old name was later removed; prefer the new
    # attribute and fall back to the old one on earlier versions.
    handles = getattr(legend, 'legend_handles', None)
    if handles is None:
        handles = legend.legendHandles

    for lh in handles: 
        lh.set_alpha(0.7)

    # Start the x-axis at 3 while keeping the automatically chosen upper limit.
    xlim = ax.get_xlim()
    ax.set_xlim(3, max(xlim))

    plt.show()

Plot the top-k entities and relations in a CoDEx dataset.

In [None]:
def plot_top_k(count_df, k=30, 
               label_col='label',
               count_col='count',
               figsize=(5,8.5),
               color='#007acc',
               ax=None,
               **kwargs):
    """
    Draw a horizontal lollipop chart of the ``k`` rows with the largest counts.

    :param count_df: pandas DataFrame holding at least ``label_col`` and ``count_col``
    :param k: maximum number of rows to plot; fewer are drawn when
        ``count_df`` has fewer than ``k`` rows
    :param label_col: column providing the y-axis tick labels
    :param count_col: column providing the plotted values
    :param figsize: size of the new figure, only used when ``ax`` is None
    :param color: colour of both the lines and the markers
    :param ax: matplotlib Axes to draw on; a new figure is created when None
    :param kwargs: extra keyword arguments forwarded to ``ax.hlines``
    :return: the Axes the chart was drawn on
    """
    count_df = count_df.sort_values(count_col, ascending=False).head(k)
    top_labels, top_counts = count_df[label_col], count_df[count_col]

    # head(k) returns fewer than k rows for a short frame; size the y positions
    # from the rows actually kept so they always match the plotted values.
    n_rows = len(count_df)

    if ax is None:
        fig, ax = plt.subplots(figsize=figsize)

    # Reverse so that the largest count ends up at the top of the chart.
    x = np.arange(n_rows)
    y = top_counts[::-1]

    ax.hlines(
        y=x,
        xmin=0, 
        xmax=y, 
        color=color, 
        alpha=0.2, 
        linewidth=5,
        **kwargs
    )

    ax.plot(
        y, x, "o", 
        markersize=7, 
        color=color, 
        alpha=0.6)

    ax.spines['top'].set_color('none')
    ax.spines['right'].set_color('none')

    # Clamp at 0 so an empty frame does not produce an inverted (0, -1) bound.
    ax.spines['left'].set_bounds(0, max(n_rows - 1, 0))
    ax.spines['bottom'].set_bounds(0, max(ax.get_xticks()))

    ax.set_xlabel('Count', fontsize=14)

    ax.set_yticks(x)
    ax.set_yticklabels(top_labels[::-1])

    return ax

In [None]:
# Plot the 30 highest-degree entities of the size='l' dataset.
# `Codex` and `triples` set here are reused by the next cell.
size = 'l'
Codex = codex.Codex(size=size)

triples = Codex.triples()
count_df = count_entities(triples)

# Attach a label to every counted entity.
count_df['label'] = list(map(Codex.entity_label, count_df['entity']))

k = 30

ax = plot_top_k(count_df, k=k, color=palette[-1], linewidths=6)

ax.set_xscale('linear')
ax.set_xlabel('Entity degree', fontsize=14)
ax.set_title(Codex.name(), fontsize=16)
ax.tick_params('x', labelsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Plot the 15 most frequent relations, using the `Codex` and `triples`
# globals left by the previous cell.
count_df = count_relations(triples)

# Attach a label to every counted relation.
count_df['label'] = list(map(Codex.relation_label, count_df['relation']))

k = 15

ax = plot_top_k(
    count_df, k=k, color=palette[-1], linewidths=6, figsize=(5, 4))

ax.set_xscale('linear')
ax.set_xlabel('Mention count', fontsize=14)
ax.set_title(Codex.name(), fontsize=16)
ax.tick_params('x', labelsize=12)

plt.tight_layout()
plt.show()

Investigate symmetry.

In [None]:
def entity_pairs(triples, reverse=False):
    """
    :param triples: pandas DataFrame [head, relation, tail]
    :param reverse: when True, return (tail, head) pairs instead of (head, tail)
    :return: set of distinct entity pairs
    """
    first, second = ('tail', 'head') if reverse else ('head', 'tail')
    return set(zip(triples[first], triples[second]))

In [None]:
# A relation is reported when more than this fraction of its reversed
# validation/test entity pairs also occur as training pairs.
threshold = 0.7

for size in sizes: 
    Codex = codex.Codex(size=size)
    print('----------' + Codex.name() + '----------')
    relations = Codex.relations()

    train = Codex.split('train')
    valid = Codex.split('valid')
    test = Codex.split('test')

    # Running total, in percent, of validation/test triples that belong to
    # the reported relations.
    tot = 0

    for relation in relations:
        train_r = train[train['relation'] == relation]
        valid_r = valid[valid['relation'] == relation]
        test_r = test[test['relation'] == relation]

        train_pairs = entity_pairs(train_r)
        # Reversed (tail, head) pairs of the held-out triples for this relation.
        test_pairs = entity_pairs(
            pd.concat((valid_r, test_r)), reverse=True)

        # Relations absent from validation/test are skipped entirely.
        if not len(test_pairs):
            continue

        n_match = len(train_pairs & test_pairs)
        symmetry = n_match / len(test_pairs)
        percent_valid_test = (len(valid_r) + len(test_r)) / (len(valid) + len(test))

        if symmetry > threshold:
            print(
                '{} | {:.2f}% symmetry | {:.3f}% of valid/test'.format(
                    Codex.relation_label(relation), 
                    symmetry * 100,
                    percent_valid_test * 100,
                )
            )

            tot += (percent_valid_test * 100)

    print('Symmetric relations > {:.1f} make up {:.3f}% of validation/test in {}'.format(
        threshold, tot, Codex.name()
    ))

Investigate compositionality.

In [None]:
import os
import random

In [None]:
def rule_relations(rule):
    """
    :param rule: rule string whose items are separated by single spaces
    :return: list of the items starting with 'P', in their original order
    """
    relation_ids = []
    for token in rule.split(' '):
        if token.startswith('P'):
            relation_ids.append(token)
    return relation_ids

In [None]:
# Directory holding one rule file per dataset size ('codex-<size>.tsv').
amie_dir = 'paths/'

# Dedicated, seeded generator so the printed example rules are the same on
# every "Restart & Run All", without touching the global `random` state.
example_rng = random.Random(0)

for size in sizes:
    Codex = codex.Codex(size=size)
    name = Codex.name()
    print('----------' + name + '----------')
    
    rule_file = 'codex-{}.tsv'.format(size)

    rule_df = pd.read_csv(
        os.path.join(amie_dir, rule_file), sep='\t'
    )

    # Map rule length -> {rule string: PCA confidence as float}.
    rules_of_length = {}
    for length in (2, 3):
        df = rule_df[rule_df['Length'] == length]
        rules = {rule: float(conf) for rule, conf in
                 zip(df['Rule'], df['PCA Confidence'])}

        # np.mean([]) emits a RuntimeWarning; report nan directly when no
        # rule of this length exists (the printed text is 'nan' either way).
        mean_conf = np.mean(list(rules.values())) if rules else float('nan')
        print('{} rules of length {}: {:.3f} average confidence'.format(
            len(rules), length, mean_conf
        ))

        rules_of_length[length] = rules
        
    # Count number of paths in this core
    print(
        sum([len(values) for values in rules_of_length.values()]),
        'paths in', name
    )

    # Get the average confidence across all rules
    avg_conf = []
    for rules in rules_of_length.values():
        avg_conf.extend(list(rules.values()))
    print('Average confidence: {:.3f}'.format(
        np.mean(avg_conf) if avg_conf else float('nan')))
    
    # Count number of valid/test triples containing relations in paths
    compositional_relations = set()
    
    for length in (2, 3):
        for rule in rules_of_length[length]:
            # The last 'P...' item of a rule is treated as the entailed relation.
            last_relation = rule_relations(rule)[-1]
            compositional_relations.add(last_relation)
            
    test = pd.concat((Codex.split('valid'), Codex.split('test')))
    percent_comp = len(test[test['relation'].isin(compositional_relations)]) / len(test)
            
    print(
        '{:.2f}% of validation/test triples contain entailed relations'.format(
            percent_comp * 100)
    )
    
    # Print a few examples
    print('Examples:')
    for length in (2, 3):
        rules = rules_of_length[length]
        if not rules:
            # Choosing from an empty sequence raises IndexError; there is
            # simply no example to show for this length.
            continue
        rule = example_rng.choice(list(rules.keys()))
        
        relations = rule_relations(rule)
        print(
            '\t',
            '/'.join([Codex.relation_label(relation) for relation in relations[:-1]]), 
            '-->', 
            Codex.relation_label(relations[-1]),
            '(confidence {:.3f})'.format(rules[rule])
        )