In [17]:
import json
import argparse
from collections import defaultdict
import re
from datetime import datetime

import utils

In [18]:
# Input files, relative to the working directory.
# objects_path, literals_path and infobox_path are read by load_subj_obj_lit_triples();
# categories_path is read by load_cat_triples().
objects_path = 'data/dbpedia-2021-09-kewer/graph/mappingbased-objects_lang=en.ttl'
literals_path = 'data/dbpedia-2021-09-kewer/graph/mappingbased-literals_lang=en.ttl'
infobox_path = 'data/dbpedia-2021-09-kewer/graph/infobox-properties_lang=en.ttl'
categories_path = 'data/dbpedia-2021-09-kewer/categories_lang=en_articles.ttl'

In [19]:
# Options are declared through argparse but parsed from an explicit empty
# argument list, so sys.argv is never consulted and only the defaults apply.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--outfile',
    default='data/dbpedia-triples.json',
)
args = parser.parse_args([])

In [20]:
# Redirect mapping used by the loaders below: a subject/object found as a key
# is replaced by the mapped value (`redirects[subj]`, `redirects[obj]`).
redirects = utils.dbpedia_redirects()

In [22]:
# Sanity check: number of redirect entries loaded.
len(redirects)

9831571

In [23]:
# Entities for which triples are collected: the loaders below only record a
# triple under an entity that is a member of this collection.
neighbor_entities = utils.load_neighbor_entities()

In [25]:
# Sanity check: number of neighbor entities loaded.
len(neighbor_entities)

453745

In [6]:
def _new_triple_groups():
    """Create the inner mapping for one entity: group name -> set of triples."""
    return defaultdict(set)


# Two-level accumulator: entity -> group name -> set. Both levels are created
# on first access, so the loaders can call `.add(...)` without any setup.
neighbor_triples = defaultdict(_new_triple_groups)

In [7]:
# Only `kewer.wv` is used in this notebook, as a membership test for entity IRIs
# (`subj in kewer.wv`). Presumably a trained embedding model — TODO confirm in utils.
kewer = utils.load_kewer()

In [8]:
def load_subj_obj_lit_triples(ttl_paths=None):
    """Collect object and literal triples for the neighbor entities.

    Streams the given triple files line by line and fills the module-level
    ``neighbor_triples`` mapping in place:

    * ``neighbor_triples[subj]['subj']`` receives ``(pred, obj)`` and
      ``neighbor_triples[obj]['obj']`` receives ``(pred, subj)`` when the
      object is an IRI (starts with ``<``). The triple is only considered if
      both ends are in ``kewer.wv``; each side is recorded only if that entity
      is in ``neighbor_entities``.
    * ``neighbor_triples[subj]['lit']`` receives ``(pred, text)`` when the
      object is a quoted literal and the subject is in both
      ``neighbor_entities`` and ``kewer.wv``. ``text`` is the space-joined
      result of ``utils.literal_tokens``; literals yielding no tokens are dropped.

    Subjects and IRI objects are replaced by their ``redirects`` target first.
    Lines starting with ``#`` and blank lines are skipped, as are predicates
    listed in ``utils.pred_blacklist`` (compared lower-cased) or matching the
    noisy-property pattern below.

    ``neighbor_triples`` must still be the nested ``defaultdict`` of sets when
    this runs; it will fail once that name has been rebound to plain dicts/lists.

    Args:
        ttl_paths: Iterable of file paths to read. Defaults to
            ``[objects_path, literals_path, infobox_path]``.

    Returns:
        None. Results are accumulated in ``neighbor_triples``.

    Raises:
        ValueError: If a non-blank, non-comment line has fewer than three
            whitespace-separated fields.
    """
    if ttl_paths is None:
        ttl_paths = [objects_path, literals_path, infobox_path]

    # Compiled once rather than looked up for every line. Rejects
    # dbpedia.org/property/ predicates whose local name starts with a digit or
    # with "footnote", or is only one or two characters long.
    noisy_property = re.compile(
        r'<http://dbpedia.org/property/([0-9]|footnote|.{1,2}>$)')
    i = 0

    for ttl_path in ttl_paths:
        # N-Triples/Turtle files are UTF-8; do not rely on the platform default.
        with open(ttl_path, encoding='utf-8') as input_file:
            for line in input_file:
                # Blank lines would otherwise break the three-way unpacking below.
                if line.startswith('#') or line.isspace():
                    continue
                subj, pred, obj = line.split(maxsplit=2)
                if pred.lower() in utils.pred_blacklist or noisy_property.match(pred):
                    continue
                # Drop the statement terminator: keep everything before the last '.'.
                obj = obj[:obj.rfind('.')].strip()
                if subj in redirects:
                    subj = redirects[subj]
                if obj.startswith('<'):
                    if obj in redirects:
                        obj = redirects[obj]

                    if subj in kewer.wv and obj in kewer.wv:
                        if subj in neighbor_entities:
                            neighbor_triples[subj]['subj'].add((pred, obj))
                        if obj in neighbor_entities:
                            neighbor_triples[obj]['obj'].add((pred, subj))
                elif obj.startswith('"'):
                    if subj in neighbor_entities and subj in kewer.wv:
                        # Text between the first and the last double quote; any
                        # language tag or datatype suffix after it is discarded.
                        text = obj[obj.find('"') + 1:obj.rfind('"')]
                        tokens = utils.literal_tokens(text)
                        if len(tokens):
                            neighbor_triples[subj]['lit'].add((pred, ' '.join(tokens)))
                # Counts only lines that passed the comment/blank/predicate filters.
                i += 1
                if i % 1000000 == 0:
                    print(f'Processed {i} items. Current time: {datetime.now().strftime("%H:%M:%S")}.')

In [9]:
# Fills neighbor_triples in place with the 'subj', 'obj' and 'lit' groups.
# A progress line is printed after every 1,000,000 processed lines.
load_subj_obj_lit_triples()
print('Done with the non-category triples.')

Processed 1000000 items. Current time: 00:20:19.
Processed 2000000 items. Current time: 00:20:26.
Processed 3000000 items. Current time: 00:20:33.
Processed 4000000 items. Current time: 00:20:41.
Processed 5000000 items. Current time: 00:20:47.
Processed 6000000 items. Current time: 00:20:55.
Processed 7000000 items. Current time: 00:21:03.
Processed 8000000 items. Current time: 00:21:09.
Processed 9000000 items. Current time: 00:21:18.
Processed 10000000 items. Current time: 00:21:24.
Processed 11000000 items. Current time: 00:21:31.
Processed 12000000 items. Current time: 00:21:38.
Processed 13000000 items. Current time: 00:21:44.
Processed 14000000 items. Current time: 00:21:51.
Processed 15000000 items. Current time: 00:21:59.
Processed 16000000 items. Current time: 00:22:06.
Processed 17000000 items. Current time: 00:22:13.
Processed 18000000 items. Current time: 00:22:20.
Processed 19000000 items. Current time: 00:22:26.
Processed 20000000 items. Current time: 00:22:33.
Processed

In [10]:
def load_cat_triples(path=None):
    """Collect category triples for the neighbor entities.

    Reads the category file line by line and adds ``(pred, obj)`` to
    ``neighbor_triples[subj]['cat']`` (module-level mapping, updated in place)
    for every subject that, after applying ``redirects``, is in both
    ``neighbor_entities`` and ``kewer.wv``. Objects are stored as read; they
    are not redirected. Lines starting with ``#`` and blank lines are skipped.

    ``neighbor_triples`` must still be the nested ``defaultdict`` of sets when
    this runs.

    Args:
        path: File to read. Defaults to ``categories_path``.

    Returns:
        None. Results are accumulated in ``neighbor_triples``.

    Raises:
        ValueError: If a non-blank, non-comment line does not consist of
            exactly four whitespace-separated fields.
    """
    if path is None:
        path = categories_path
    i = 0

    # N-Triples/Turtle files are UTF-8; do not rely on the platform default.
    with open(path, encoding='utf-8') as input_file:
        for line in input_file:
            # Blank lines would otherwise break the four-way unpacking below.
            if line.startswith('#') or line.isspace():
                continue
            # Fourth field is the statement terminator and is ignored.
            subj, pred, obj, _ = line.split()
            if subj in redirects:
                subj = redirects[subj]

            if subj in neighbor_entities and subj in kewer.wv:
                neighbor_triples[subj]['cat'].add((pred, obj))
            i += 1
            if i % 1000000 == 0:
                print(f'Processed {i} items. Current time: {datetime.now().strftime("%H:%M:%S")}.')

In [11]:
# Adds the 'cat' group to neighbor_triples in place.
# A progress line is printed after every 1,000,000 processed lines.
load_cat_triples()
print('Done with the categories.')

Processed 1000000 items. Current time: 00:35:01.
Processed 2000000 items. Current time: 00:35:02.
Processed 3000000 items. Current time: 00:35:04.
Processed 4000000 items. Current time: 00:35:05.
Processed 5000000 items. Current time: 00:35:07.
Processed 6000000 items. Current time: 00:35:08.
Processed 7000000 items. Current time: 00:35:09.
Processed 8000000 items. Current time: 00:35:11.
Processed 9000000 items. Current time: 00:35:12.
Processed 10000000 items. Current time: 00:35:13.
Processed 11000000 items. Current time: 00:35:15.
Processed 12000000 items. Current time: 00:35:16.
Processed 13000000 items. Current time: 00:35:18.
Processed 14000000 items. Current time: 00:35:19.
Processed 15000000 items. Current time: 00:35:20.
Processed 16000000 items. Current time: 00:35:22.
Processed 17000000 items. Current time: 00:35:23.
Processed 18000000 items. Current time: 00:35:24.
Processed 19000000 items. Current time: 00:35:26.
Processed 20000000 items. Current time: 00:35:27.
Processed

In [12]:
def _with_sorted_triples(triples_by_entity):
    """Return a plain-dict copy in which every group of triples is a sorted list."""
    converted = {}
    for entity, triple_types_dict in triples_by_entity.items():
        per_type = {}
        for triple_type, triples in triple_types_dict.items():
            per_type[triple_type] = sorted(triples)
        converted[entity] = per_type
    return converted


# Sets cannot be written by json.dump; sorted lists can, and they make the output
# order deterministic. After this cell the groups are lists, so the loader
# functions (which call `.add`) must not be re-run against this object.
neighbor_triples = _with_sorted_triples(neighbor_triples)

In [26]:
# Number of entities that have an entry in neighbor_triples.
len(neighbor_triples)

453745

In [9]:
# Show the path the JSON is written to and read back from.
args.outfile

'data/dbpedia-triples.json'

In [21]:
# Persist the triples as indented JSON with object keys in sorted order.
with open(args.outfile, 'w') as out_file:
    json.dump(
        neighbor_triples,
        out_file,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )

In [3]:
def load_neighbor_triples(path=None):
    """Read the triples JSON back from disk.

    Args:
        path: JSON file to read. Defaults to ``args.outfile``, which is looked
            up only when no path is given, so the function can be used without
            the notebook's ``args`` object.

    Returns:
        Whatever ``json.load`` decodes from the file. JSON has no tuple or set
        type, so triples come back as lists.
    """
    if path is None:
        path = args.outfile
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(path, encoding='utf-8') as ntf:
        return json.load(ntf)

In [10]:
# Replace the in-memory structure with the copy read back from args.outfile.
# Triples are now lists (JSON has no tuple/set type).
neighbor_triples = load_neighbor_triples()

In [11]:
# Sanity check: number of entities in the reloaded file.
len(neighbor_triples)

453745

In [16]:
# Spot-check the stored triples of a single entity.
neighbor_triples['<http://dbpedia.org/resource/!Arriba!_La_Pachanga>']

{'cat': [['<http://purl.org/dc/terms/subject>',
   '<http://dbpedia.org/resource/Category:1961_albums>'],
  ['<http://purl.org/dc/terms/subject>',
   '<http://dbpedia.org/resource/Category:Latin_jazz_albums_by_Cuban_artists>']],
 'lit': [['<http://dbpedia.org/property/cover>',
   'file mongo santamaria arriba la pachanga album cover jpg'],
  ['<http://dbpedia.org/property/headline>', 'side a'],
  ['<http://dbpedia.org/property/headline>', 'side b'],
  ['<http://dbpedia.org/property/label>', 'fantasy records'],
  ['<http://dbpedia.org/property/length>', '156 0'],
  ['<http://dbpedia.org/property/length>', '161 0'],
  ['<http://dbpedia.org/property/length>', '170 0'],
  ['<http://dbpedia.org/property/length>', '178 0'],
  ['<http://dbpedia.org/property/length>', '188 0'],
  ['<http://dbpedia.org/property/length>', '190 0'],
  ['<http://dbpedia.org/property/length>', '201 0'],
  ['<http://dbpedia.org/property/length>', '214 0'],
  ['<http://dbpedia.org/property/length>', '216 0'],
  ['<ht

In [27]:
# NOTE(review): the structure of feature_inputs is not visible in this notebook;
# it is only measured with len() below — confirm its contents in utils.
feature_inputs = utils.load_feature_inputs()

In [29]:
# Sanity check: number of feature-input entries loaded.
len(feature_inputs)

453745