# Prediction

In [1]:
SAMPLE = 2  # pick sample from the dev set

# match entity label to the ctc table
import os
import sympy

import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset
import soundfile as sf

CTC_DEPTH = 3  # size of the ctc matrix considered for search
NPATHS = 10 # number of longest paths on the bigram graph

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
tokenizer = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")


# path = "../data/dev/"
path = "../data/gtts/dev/"


file = str(SAMPLE) + '.wav'
speech, samplerate = sf.read(path+file)
i = int(file.split('.')[0]) - 1

input_values = tokenizer(speech, return_tensors="pt", padding="longest").input_values
logits = model(input_values).logits

# find where s_tokens appear in the table
ctc_table = torch.topk(logits, k=CTC_DEPTH, dim=-1)
predicted_ids = ctc_table.indices[0]
# predicted_ids = torch.argmax(logits, dim=-1).indices

# print(predicted_ids)
print(predicted_ids.shape)

# greedy decoding
transcription = tokenizer.batch_decode(torch.argmax(logits, dim=-1))[0].lower()
print(transcription)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
It is strongly recommended to pass the ``sampling_rate`` argument to this function.Failing to do so can result in silent errors that might be hard to debug.


torch.Size([318, 3])
what is a film directed by weepcavon carolls feld


In [2]:
import numpy as np

predictions = np.transpose(np.array(predicted_ids))
indices = predictions.flatten()
# 
logits = ctc_table.values[0]
predictions_logits = np.transpose(logits.detach().numpy())
indices_logits = predictions_logits.flatten()

# print(logits)
# probs[logits < 0] = 0  # drop negative logits
probs = torch.nn.functional.softmax(logits, dim = 1)
# probs[probs < 2.e-07] = 0  # drop negative logits
# print(probs)
prediction_probs = np.transpose(probs.detach().numpy())
indices_probs = prediction_probs.flatten()


# generate adjacencies
def connect(predictions, t, k, n):
    edges = []
    for j in range(predictions.shape[0]):  # offset
        if predictions[j][k] != 0:
            if predictions_logits[j][k] > 0:
                edges.append([n*predictions.shape[1]+t, j*predictions.shape[1]+k, predictions_logits[j][k]])
        else:
            # skip to next if exists
            if k < predictions.shape[1]-1:
                edges.extend(connect(predictions, t, k+1, n))
    return edges

edges = []
for t in range(predictions.shape[1]-1):  # columns
    for n in range(predictions.shape[0]):  # rows
        if predictions[n][t] != 0:
            edges.extend(connect(predictions, t, t+1, n))

print(len(edges))
# 6107

8708


In [3]:
# load entities
import json

path = '../data/'

with open(path+'entities.json', 'r') as fin:
    entities = json.load(fin)
print(len(entities), 'entity labels')

28497 entity labels


In [4]:
# load relations
from predicates import properties

relations = {}
for p in properties['results']['bindings']:
    label = p['propertyLabel']['value']

    _id = p['property']['value'].split('/')[-1]
    relations[_id] = label

    _id = 'R' + p['property']['value'].split('/')[-1][1:]
    relations[_id] = label

# all unique predicate labels
all_predicate_labels = list(relations.values())
print(all_predicate_labels[:2])

['head of government', 'head of government']


In [5]:
# load original question
import re
from unidecode import unidecode
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

path = '../data/'
with open(path+'annotated_wd_data_valid_answerable.txt') as fin:
    lines = fin.readlines()
    l = lines[SAMPLE-1]
#         subject [tab] property [tab] object [tab] question
    s, p, o, q = l.strip('\n').split('\t')
    print(p, o)
    
    q = re.sub(chars_to_ignore_regex, '', q).lower()
    q = unidecode(q)
    q = ''.join([j for i, j in enumerate(q) if j != q[i-1]])  # remove repeated letters
    print(q)

    s_label = entities[s]
    s_label = re.sub(chars_to_ignore_regex, '', s_label).lower()
    s_label = unidecode(s_label)
    s_label = ''.join([j for i, j in enumerate(s_label) if j != s_label[i-1]])  # remove repeated letters
    print(s_label)
    
    p_label = relations[p]
    p_label = re.sub(chars_to_ignore_regex, '', p_label).lower()
    p_label = unidecode(p_label)
    p_label = ''.join([j for i, j in enumerate(p_label) if j != p_label[i-1]])  # remove repeated letters
    print(p_label)

R57 Q14949730
what is a film directed by wiebke von carolsfeld
wiebke carolsfeld
director


In [6]:
# # get all bigrams
# bigrams = []
# for e in edges:
#     i1 = indices[e[0]]
#     i2 = indices[e[1]]
#     # skip repeated predictions of the same letters (not a bigram)
#     if i1 != i2 and i1 not in special_tokens and i2 not in special_tokens:
#         bigrams.append(''.join(vocabulary([i1, i2])).lower())  # normalise to lower case
# # count all bigrams frequency in ctc matrix
# bigrams = Counter(bigrams)
# bottom_bigrams = [k.lower() for k, v in sorted(bigrams.items(), key=lambda x: x[1], reverse=True)][:20]

In [7]:
# load indices
import json

path = '../data/'

with open(path+'entities_bigrams_index.json', 'r') as fin:
    entity_bigrams = json.load(fin)
print(len(entity_bigrams), 'entity bigrams')

with open(path+'relations_bigrams_index.json', 'r') as fin:
    relation_bigrams = json.load(fin)
print(len(relation_bigrams), 'relation bigrams')

1000 entity bigrams
866 relation bigrams


In [8]:
# check bottom up correct entity label is retrievable
from collections import Counter

# bottom up from greedy decoding
t = ''.join([j for i, j in enumerate(transcription) if j != transcription[i-1]]).lower()  # remove repeated letters
# t = "vipcafon caronsfelt"
print(t)
bottom_bigrams = set([word[i:i + 2] for word in t.split() for i in range(0, len(word)-1, 1)])
print(len(bottom_bigrams))

# match to the bigrams from the greedy decoded transcription
e_labels = Counter()
for b in bottom_bigrams:
    matched_e_labels = entity_bigrams[b]
    for e_label in matched_e_labels:
        # how many bigrams did the label match normalised by the length of the label
        e_labels[e_label] += 1#/len(e_label)
# print(len(e_labels))

# normalise by label length
for l, c in e_labels.items():
    e_labels[l] = c/len(l)
    
TOPN = 500
top_e_labels = [l for l, c in e_labels.most_common(TOPN)]
print(top_e_labels[:3])
# print(len(top_e_labels))

print(s_label in top_e_labels)
print(top_e_labels.index(s_label))
# print(e_labels[s_label])

what is a film directed by wepcavon carols feld
29
['film', 'edison', 'charon']
True
63


In [9]:
# check bottom up correct property label is retrievable
p_labels = Counter()
for b in bottom_bigrams:
    matched_p_labels = relation_bigrams[b]
    for label in matched_p_labels:
        # how many bigrams did the label match normalised by the length of the label
        p_labels[label] += 1#/len(e_label)
# print(len(e_labels))

# normalise by label length
for l, c in p_labels.items():
    p_labels[l] = c/len(l)
    
TOPN = 500
top_p_labels = [l for l, c in p_labels.most_common(TOPN)]
print(top_p_labels[:3])
# print(len(top_e_labels))

print(p_label)
matched = p_label in top_p_labels
print(matched)
if matched:
    print(top_p_labels.index(p_label))

['direction', 'director', 'directions']
director
True
1


In [10]:
# # load KG
# from hdt import HDTDocument, TripleComponentRole

# hdt_path = "/ivi/ilps/personal/svakule/"
# hdt_file = 'wikidata2018_09_11.hdt'
# # hdt_file = 'wikidata20200309.hdt'

# PREFIX_E = 'http://www.wikidata.org/entity/'
# PREFIX_P = 'http://www.wikidata.org/prop/direct/P'

# kg = HDTDocument(hdt_path+hdt_file)

# # Display some metadata about the HDT document itself
# print("nb triples: %i" % kg.total_triples)
# print("nb subjects: %i" % kg.nb_subjects)
# print("nb predicates: %i" % kg.nb_predicates)

In [11]:
# # check all possible triples using entity and relation candidates by retrieving the subgraph
# path = '../data/'
# max_triples = 50000
# offset = 0

# with open(path+'entities_labels2ids.json', 'r') as fin:
#     entities = json.load(fin)
# print(len(entities), 'entity labels in total')

# with open(path+'relations_labels2ids.json', 'r') as fin:
#     relations = json.load(fin)
# print(len(relations), 'relation labels in total')

# # look up entity ids in the KG
# matched_entity_ids = []
# print(len(top_e_labels))
# for entity_label in top_e_labels:
#     # look up entites by label
#     for e_id in entities[entity_label]:
#         entity = PREFIX_E + e_id
#         matched_entity_ids.append(kg.string_to_global_id(entity, TripleComponentRole.OBJECT))
# print(len(matched_entity_ids), 'entities matched')

# # look up predicate ids in the KG
# matched_relation_ids = []
# print(len(top_p_labels))
# for p_label in top_p_labels:
#     # look up entites by label
#     for p_id in relations[p_label]:
#         predicate = PREFIX_P+p_id[1:]
#         matched_relation_ids.append(kg.string_to_global_id(predicate, TripleComponentRole.PREDICATE))
# print(len(matched_relation_ids), 'relations matched')

# kg.configure_hops(1, matched_relation_ids, 'predef-wikidata2018-09-all', True, False)
# subgraph = kg.compute_hops(matched_entity_ids, max_triples, offset)
# entity_ids, predicate_ids, adjacencies = subgraph

# n_entities = len(entity_ids)
# n_relations = len(predicate_ids)
# print("Subgraph with %d entities and %d relation types"%(n_entities, n_relations))

In [12]:
# # load adjacencies
# def generate_adj_sp(adjacencies, n_entities, include_inverse):
#     '''
#     Build adjacency matrix
#     '''
#     adj_shape = (n_entities, n_entities)
    
#     # colect all predicate matrices separately into a list
#     sp_adjacencies = []
#     for edges in adjacencies:
#         # split subject (row) and object (col) node URIs
#         n_edges = len(edges)
#         row, col = np.transpose(edges)
        
#         # duplicate edges in the opposite direction
#         if include_inverse:
#             _row = np.hstack([row, col])
#             col = np.hstack([col, row])
#             row = _row
#             n_edges *= 2
        
#         # create adjacency matrix for this predicate
#         data = np.ones(n_edges)
#         adj = sp.csr_matrix((data, (row, col)), shape=adj_shape)
#         sp_adjacencies.append(adj)
    
#     return np.asarray(sp_adjacencies)


# # seed activation
# x = np.zeros(n_entities)
# for e in matched_entity_ids:
#     idx = entity_ids.index(e)
#     x[idx] = 1

# A = generate_adj_sp(adjacencies, n_entities, include_inverse=True)
# for i, _A in enumerate(A):
#     # MP
#     _y = x @ _A

In [13]:
# ? represent speech as bigrams with probs
from collections import defaultdict

special_tokens = [4]
vocabulary = tokenizer.tokenizer.convert_ids_to_tokens

bigrams = defaultdict(int)
for e in edges:
    i1 = indices[e[0]]
    i2 = indices[e[1]]
    # skip repeated predictions of the same letters (not a bigram)
    if i1 != i2 and i1 not in special_tokens and i2 not in special_tokens:
        bigrams[''.join(vocabulary([i1, i2]))] += indices_logits[e[0]] + indices_logits[e[1]]
print(len(bigrams))

print([k for k, v in sorted(bigrams.items(), key=lambda x: x[1], reverse=True)][:10])

378
['HE', 'AE', 'LE', 'TE', 'RE', 'HA', 'TH', 'ED', 'ET', 'HT']


# Bottom up

In [14]:
# #
# e_labels = Counter()
# for b, c in bigrams.most_common(10):
#     e_ids = entity_bigrams[b.lower()]
#     for e_id in e_ids:
#         e_label = list(entities.values())[e_id]
#         e_labels[e_label] += 1
# print(len(e_labels))

# top_labels = [l for l, c in e_labels.items() if c > 1]
# print(len(top_labels))

In [15]:
# check entity matches top bigrams
# top_bigrams = [k.lower() for k, v in sorted(bigrams.items(), key=lambda x: x[1], reverse=True)][:20]
# [b for b, c in bigrams.most_common(10)]
# print(top_bigrams)

for query in s_label.lower().split():
    print(query)
    for i in range(0, len(query)-1, 1):
        bigram = query[i:i + 2]
        if bigram in bottom_bigrams:
            print(bigram)

wiebke
carolsfeld
ca
ar
ro
ol
ls
fe
el
ld


# Top down

In [16]:
import networkx as nx
import itertools
import difflib


def get_overlap(s1, s2):
    s = difflib.SequenceMatcher(None, s1, s2)
    pos_a, pos_b, size = s.find_longest_match(0, len(s1), 0, len(s2)) 
    return pos_a, pos_b, size


def match(edges, indices, query_str, tokenizer, n_paths=NPATHS):
    query = tokenizer.tokenizer(query_str)['input_ids']
    query = [query[i:i + 2] for i in range(0, len(query)-1, 1)]
    
    # filter bigrams
    bigrams = []
    for e in edges:
        bigram = [indices[e[0]], indices[e[1]]]
        if bigram in query:
            bigrams.append(e)

    # build graph
    DG = nx.DiGraph()
    DG.add_weighted_edges_from(bigrams)
    
    # find all paths
#     all_paths = []
#     for (x, y) in itertools.combinations(DG.nodes, 2):
#         for path in nx.all_simple_paths(DG, x, y):
#             all_paths.append(path)
#     # sort all paths
#     all_paths.sort(key=len, reverse=True)
    
    all_paths = [nx.dag_longest_path(DG)]

    # lookup maximum overlap between strings
    for path in all_paths[:n_paths]:
        word = ''.join(tokenizer.tokenizer.convert_ids_to_tokens([indices[i] for i in path]))
#         print(word)
        pos_a, pos_b, size = get_overlap(query_str, word)
        overlap = query_str[pos_a:pos_a+size]
#         print(overlap)
        overlap_indices = path[pos_a:pos_a+size]
#         print(indices_probs[overlap_indices])
#         print(np.prod(indices_probs[overlap_indices]))
        return len(overlap) / len(query_str), np.sum(indices_logits[overlap_indices]) #sum(indices_logits[overlap_indices])
    return 0

In [17]:
# encode entity label
query_str = s_label
# query_str = 'carol red'
print(query_str)

q_words = [w for w in query_str.split() if len(w) > 1]
print(q_words)

acc_matches, acc_logits = 0, 0
for word in q_words:
    matches, p = match(edges, indices, word.upper(), tokenizer)
    acc_matches += matches
    acc_logits += p
#     print(logits)
    
print('%.2f words matched with accumulated logits over bigrams %.2f' % (acc_matches/len(q_words), acc_logits))
# print('matched with score %.2f' % (acc_matches/len(q_words) * acc_logits/len(q_words) * (e_labels[query_str]/len(query_str))))#     break

wiebke carolsfeld
['wiebke', 'carolsfeld']
0.92 words matched with accumulated logits over bigrams 147.73


In [18]:
# search through all pre-selected entity labels
recognised_e_labels = []
for query_str in top_e_labels:
    
    q_words = [w for w in query_str.split() if len(w) > 1]
    
    acc_matches, acc_logits = 0, 0
    for word in q_words:
        matches, logits = match(edges, indices, word.upper(), tokenizer)
        acc_matches += matches
        acc_logits += logits
    if acc_matches/len(q_words) > 0.9 and acc_logits > 100:
#     score = acc_matches/len(q_words) * acc_logits/len(q_words) * (e_labels[query_str]/len(query_str))
#     if score > 25:
        print(query_str)
        recognised_e_labels.append(query_str)
        print(q_words)
        print('%.2f words matched with accumulated logits %.8f' % (acc_matches/len(q_words), acc_logits))#     break
#         print('matched with score %.2f' % score)#     break
        print(e_labels[query_str])

film director
['film', 'director']
0.94 words matched with accumulated logits 117.82028198
0.6153846153846154
wiebke carolsfeld
['wiebke', 'carolsfeld']
0.92 words matched with accumulated logits 147.73165321
0.47058823529411764
harold harefot
['harold', 'harefot']
1.00 words matched with accumulated logits 100.10855865
0.42857142857142855


In [19]:
# encode entity label
query_str = p_label
# query_str = 'carol red'
print(query_str)

q_words = [w for w in query_str.split() if len(w) > 1]
print(q_words)

acc_matches, acc_logits = 0, 0
for word in q_words:
    matches, p = match(edges, indices, word.upper(), tokenizer)
    acc_matches += matches
    acc_logits += p
#     print(logits)
    
print('%.2f words matched with accumulated logits over bigrams %.2f' % (acc_matches/len(q_words), acc_logits/len(query_str)))
# print('matched with score %.2f' % (acc_matches/len(q_words) * acc_logits/len(q_words) * (e_labels[query_str]/len(query_str))))#     break

# what is a film directed by wepcavon carols feld

director
['director']
0.88 words matched with accumulated logits over bigrams 8.10


In [20]:
# search through all pre-selected relation labels
recognised_p_labels = []

for query_str in top_p_labels:
    
    q_words = [w for w in query_str.split() if len(w) > 1]
    
    acc_matches, acc_logits = 0, 0
    for word in q_words:
        matches, logits = match(edges, indices, word.upper(), tokenizer)
        acc_matches += matches
        acc_logits += logits
    if acc_matches/len(q_words) > 0.8 and acc_logits/len(query_str) > 8:
#     score = acc_matches/len(q_words) * acc_logits/len(q_words) * (e_labels[query_str]/len(query_str))
#     if score > 25:
        print(query_str)
        recognised_p_labels.append(query_str)
        print(q_words)
        print('%.2f words matched with accumulated logits %.8f' % (acc_matches/len(q_words), acc_logits/len(query_str)))#     break
#         print('matched with score %.2f' % score)#     break
        print(e_labels[query_str])

director
['director']
0.88 words matched with accumulated logits 8.10374546
0
fe
['fe']
1.00 words matched with accumulated logits 12.69648266
0
edb film id
['edb', 'film', 'id']
0.89 words matched with accumulated logits 8.77005144
0
efis film id
['efis', 'film', 'id']
0.92 words matched with accumulated logits 8.37313096
0
sed
['sed']
1.00 words matched with accumulated logits 9.83647156
0
caries
['caries']
1.00 words matched with accumulated logits 8.87071737
0
par
['par']
1.00 words matched with accumulated logits 9.18140539
0
nf film id
['nf', 'film', 'id']
1.00 words matched with accumulated logits 8.71086884
0
elfilm film id
['elfilm', 'film', 'id']
0.94 words matched with accumulated logits 8.72986725
0
iafd film id
['iafd', 'film', 'id']
0.92 words matched with accumulated logits 8.10089032
0
sped
['sped']
1.00 words matched with accumulated logits 9.51506042
0


In [21]:
# LTR features weight scores 1) retrieval 2) recognition

# KGQA

In [22]:
# load KG
from hdt import HDTDocument, TripleComponentRole

hdt_path = "/ivi/ilps/personal/svakule/"
hdt_file = 'wikidata2018_09_11.hdt'
# hdt_file = 'wikidata20200309.hdt'

PREFIX_E = 'http://www.wikidata.org/entity/'
PREFIX_P = 'http://www.wikidata.org/prop/direct/P'

kg = HDTDocument(hdt_path+hdt_file)

# Display some metadata about the HDT document itself
print("nb triples: %i" % kg.total_triples)
print("nb subjects: %i" % kg.nb_subjects)
print("nb predicates: %i" % kg.nb_predicates)

nb triples: 2935160017
nb subjects: 760717318
nb predicates: 23387


In [23]:
# find answers in KG
# check all possible triples using entity and relation candidates by retrieving the subgraph
path = '../data/'
max_triples = 50000
offset = 0

with open(path+'entities_labels2ids.json', 'r') as fin:
    entities = json.load(fin)
print(len(entities), 'entity labels in total')

with open(path+'relations_labels2ids.json', 'r') as fin:
    relations = json.load(fin)
print(len(relations), 'relation labels in total')

# look up entity ids in the KG
matched_entity_ids = []
print(len(top_e_labels))
for entity_label in recognised_e_labels:
    # look up entites by label
    for e_id in entities[entity_label]:
        entity = PREFIX_E + e_id
        matched_entity_ids.append(kg.string_to_global_id(entity, TripleComponentRole.OBJECT))
print(len(matched_entity_ids), 'entities matched')

# look up predicate ids in the KG
matched_relation_ids = []
print(len(top_p_labels))
for p_label in recognised_p_labels:
    # look up entites by label
    for p_id in relations[p_label]:
        predicate = PREFIX_P+p_id[1:]
        matched_relation_ids.append(kg.string_to_global_id(predicate, TripleComponentRole.PREDICATE))
print(len(matched_relation_ids), 'relations matched')

namespace = 'predef-wikidata2018-09-all' # 'predef-wikidata2020-03-all'  # 
kg.configure_hops(1, matched_relation_ids, namespace, True, False)
subgraph = kg.compute_hops(matched_entity_ids, max_triples, offset)
entity_ids, predicate_ids, adjacencies = subgraph

n_entities = len(entity_ids)
n_relations = len(predicate_ids)
print("Subgraph with %d entities and %d relation types"%(n_entities, n_relations))

27994 entity labels in total
7335 relation labels in total
500
3 entities matched
500
22 relations matched
Subgraph with 6 entities and 1 relation types


In [24]:
print(recognised_e_labels)
print(matched_entity_ids)
print(entity_ids)
print(predicate_ids)
print(recognised_p_labels)
print(matched_relation_ids)
kg.global_id_to_string(predicate_ids[0], TripleComponentRole.PREDICATE)

['film director', 'wiebke carolsfeld', 'harold harefot']
[15588676, 16030421, 8833144]
[4956901, 16030421, 6161635, 15588676, 12869390, 23347336]
[9780]
['director', 'fe', 'edb film id', 'efis film id', 'sed', 'caries', 'par', 'nf film id', 'elfilm film id', 'iafd film id', 'sped']
[9780, 9780, 6658, 6658, 7221, 7221, 0, 0, 0, 0, 6612, 6612, 7592, 7592, 5925, 5925, 7223, 7223, 9195, 9195, 6163, 6163]


'http://www.wikidata.org/prop/direct/P57'

In [25]:
import sympy

import scipy.sparse as sp


def generate_adj_sp(adjacencies, n_entities, include_inverse):
    '''
    Build adjacency matrix
    '''
    adj_shape = (n_entities, n_entities)
    
    # colect all predicate matrices separately into a list
    sp_adjacencies = []
    for edges in adjacencies:
        # split subject (row) and object (col) node URIs
        n_edges = len(edges)
        row, col = np.transpose(edges)
        
        # duplicate edges in the opposite direction
        if include_inverse:
            _row = np.hstack([row, col])
            col = np.hstack([col, row])
            row = _row
            n_edges *= 2
        
        # create adjacency matrix for this predicate
        data = np.ones(n_edges)
        adj = sp.csr_matrix((data, (row, col)), shape=adj_shape)
        sp_adjacencies.append(adj)
    
    return np.asarray(sp_adjacencies)


# seed activation with primes
x = np.zeros(n_entities)
weights = []
for i, e in enumerate(matched_entity_ids):
    if e in entity_ids:
        idx = entity_ids.index(e)
        prime = sympy.prime(i+1)
        x[idx] = prime 
        weights.append(prime)

# load adjacencies
A = generate_adj_sp(adjacencies, n_entities, include_inverse=True)
for i, _A in enumerate(A):
    # MP
    _y = x @ _A
    print(_y)

[3. 0. 2. 0. 3. 3.]


In [26]:
print(matched_entity_ids[weights.index(2)])
print(matched_entity_ids[weights.index(3)])

15588676
16030421


In [27]:
kg.global_id_to_string(entity_ids[0], TripleComponentRole.OBJECT).split('/')[-1] == o

True

In [28]:
kg.global_id_to_string(entity_ids[2], TripleComponentRole.OBJECT)

'http://www.wikidata.org/entity/Q16148301'