# DF analysis

In [10]:
import os
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import rltk

In [12]:
repo_name = 'jchen-test-ta1'
input_df_path = 'dryrun-3-GAIA_2_v1-ta1_entity_trans_all_001.h5'
version = '001'
output_path = 'clusters-{}-{}.jl'.format(repo_name, version)
kg_tab_dir_path = '/nas/gaia/corpora/LDC2019E43_AIDA_Phase_1_Evaluation_Reference_Knowledge_Base/data/'

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('max_colwidth', 800)
pd.set_option('display.width', 1000)

In [4]:
df_entity = pd.read_hdf(input_df_path)
df_entity.head()

FileNotFoundError: File store_data/dryrun-3-GAIA_2_v1-ta1_entity_trans_all_001.h5 does not exist

In [442]:
df_entity.shape

(60388, 23)

In [443]:
df_entity.columns

Index(['e', 'type', 'name', 'source', 'targets', 'target_scores', 'fbid', 'fbid_score_avg', 'fbid_score_max', 'wikidata', 'wiki_label_en', 'wiki_label_ru', 'wiki_label_uk', 'wiki_alias_en', 'wiki_alias_ru', 'wiki_alias_uk', 'lang', 'label', 'transl_name', 'transl_label_ru', 'transl_label_uk', 'transl_alias_ru', 'transl_alias_uk'], dtype='object')

In [444]:
df_entity.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60388 entries, 1 to 253587
Data columns (total 23 columns):
e                  60388 non-null object
type               60388 non-null object
name               44491 non-null object
source             60388 non-null object
targets            15226 non-null object
target_scores      15226 non-null object
fbid               24997 non-null object
fbid_score_avg     24997 non-null object
fbid_score_max     24997 non-null object
wikidata           24997 non-null object
wiki_label_en      24997 non-null object
wiki_label_ru      24997 non-null object
wiki_label_uk      24997 non-null object
wiki_alias_en      24997 non-null object
wiki_alias_ru      24997 non-null object
wiki_alias_uk      24997 non-null object
lang               59371 non-null object
label              19480 non-null object
transl_name        30920 non-null object
transl_label_ru    24997 non-null object
transl_label_uk    24997 non-null object
transl_alias_ru    24997 non-

In [445]:
all_types = set(df_entity['type'])
all_types

{'ldcOnt:COM.Document.Map',
 'ldcOnt:CRM.BehaviorCrime',
 'ldcOnt:CRM.FinancialCrime',
 'ldcOnt:CRM.FinancialCrime.Extortion',
 'ldcOnt:CRM.ViolentCrime',
 'ldcOnt:CRM.ViolentCrime.Terrorism',
 'ldcOnt:FAC',
 'ldcOnt:FAC.Building',
 'ldcOnt:FAC.Building.GovernmentBuilding',
 'ldcOnt:FAC.Building.House',
 'ldcOnt:FAC.Building.OfficeBuilding',
 'ldcOnt:FAC.Building.School',
 'ldcOnt:FAC.GeographicalArea',
 'ldcOnt:FAC.GeographicalArea.Border',
 'ldcOnt:FAC.Installation',
 'ldcOnt:FAC.Installation.Airport',
 'ldcOnt:FAC.Installation.MilitaryInstallation',
 'ldcOnt:FAC.Installation.TrainStation',
 'ldcOnt:FAC.Structure',
 'ldcOnt:FAC.Structure.Barricade',
 'ldcOnt:FAC.Way',
 'ldcOnt:FAC.Way.Street',
 'ldcOnt:GPE',
 'ldcOnt:GPE.Country.Country',
 'ldcOnt:GPE.OrganizationOfCountries.OrganizationOfCountries',
 'ldcOnt:GPE.ProvinceState.ProvinceState',
 'ldcOnt:GPE.UrbanArea',
 'ldcOnt:GPE.UrbanArea.City',
 'ldcOnt:GPE.UrbanArea.Village',
 'ldcOnt:LOC',
 'ldcOnt:LOC.GeographicPoint',
 'ldcOnt:

In [446]:
# df_entity.groupby('type').count().reset_index()

In [447]:
# df_entity_general_types = df_entity[['e', 'type']]
# df_entity_general_types['general_type'] = df_entity_general_types['type'].str.split('.', 1).str.get(0)
# df_entity_general_types.groupby('general_type').count()

In [448]:
selected_types = filter(lambda x: x.startswith(('ldcOnt:GPE', 'ldcOnt:LOC', 'ldcOnt:ORG', 'ldcOnt:PER')), all_types)
selected_types = list(selected_types)
selected_types

['ldcOnt:PER.ProfessionalPosition.Scientist',
 'ldcOnt:PER.Combatant.Sniper',
 'ldcOnt:ORG.Association',
 'ldcOnt:ORG.MilitaryOrganization.GovernmentArmedForces',
 'ldcOnt:ORG.Government.Railway',
 'ldcOnt:ORG.Government.LegislativeBody',
 'ldcOnt:PER.MilitaryPersonnel.MilitaryOfficer',
 'ldcOnt:PER.ProfessionalPosition.Ambassador',
 'ldcOnt:GPE.Country.Country',
 'ldcOnt:PER.ProfessionalPosition.Spokesperson',
 'ldcOnt:ORG.PoliticalOrganization.Party',
 'ldcOnt:PER.Combatant.Mercenary',
 'ldcOnt:LOC.Position.AirSpace',
 'ldcOnt:PER.MilitaryPersonnel',
 'ldcOnt:ORG.CommercialOrganization.BroadcastingCompany',
 'ldcOnt:LOC.Position',
 'ldcOnt:GPE.OrganizationOfCountries.OrganizationOfCountries',
 'ldcOnt:GPE.ProvinceState.ProvinceState',
 'ldcOnt:GPE.UrbanArea.City',
 'ldcOnt:ORG.Government.LawEnforcementAgency',
 'ldcOnt:PER.Politician.Governor',
 'ldcOnt:PER.ProfessionalPosition',
 'ldcOnt:LOC.Position.Field',
 'ldcOnt:ORG.Association.Team',
 'ldcOnt:ORG.International',
 'ldcOnt:ORG.M

In [449]:
df_entity = df_entity.loc[df_entity['type'].isin(selected_types)]
df_entity.head()

Unnamed: 0,e,type,name,source,targets,target_scores,fbid,fbid_score_avg,fbid_score_max,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk,lang,label,transl_name,transl_label_ru,transl_label_uk,transl_alias_ru,transl_alias_uk
1,http://www.isi.edu/gaia/entities/20ed5fc6-502c-4a2b-85dc-6e4e5fbca1ab,ldcOnt:GPE.UrbanArea.City,"(Хабаровск, Хабаровск, Хабаровск)",IC0015JFY,"(LDC2019E43:2022890, LDC2019E43:2022888, LDC2019E43:6300934)","(0.9701, 0.024, 0.003)","(m.010f1kyh,)","(0.5627072627,)","(0.7758610249,)","(None,)","(None,)","(None,)","(None,)","(None,)","(None,)","(None,)",ru,"((Khabarovsk ,),)","[None, None, None]",[None],[None],"(None,)","(None,)"
5,http://www.isi.edu/gaia/entities/4547c11a-b636-4732-94aa-40aec264a13b,ldcOnt:GPE.ProvinceState.ProvinceState,"(Москву, Москву, Москву, Москвы, Москвы, Москвы)",IC0015JFY,"(LDC2019E43:524894, LDC2019E43:524901, LDC2019E43:5601538)","(0.4948, 0.4948, 0.007644459)","(m.04swd,)","(0.7611546613,)","(1.0,)","(http://www.wikidata.org/entity/Q649,)","(Moscow,)","(Москва,)","(Москва,)","((Moskva, City of Moscow, Moscow, Russia, Moscow, Russian Federation, Moscow, Russian SFSR, Moscow, Soviet Union, Moscow, USSR, Moskva Federal City, Russia, Moskva, Russia, Москва),)","((Москва (город), Москва (Россия), Москва Златоглавая, Москва, Россия, Первопрестольная, Порт пяти морей, Третий Рим),)","((Москва (місто), Москва (Росія), Москва, Росія, Москва, РФ),)",ru,"((Moscow , Moscow, ), (Moscow , Moscow ))","[moscow, moscow, moscow, moscow, moscow, moscow]",[moscow],[moscow],"([moscow (city), Moscow, Russia), moscow gold-domed, Moscow, Russia, mother of god, port of the five seas, third rome],)","([moscow city, moscow (russia), Moscow, Russia, Moscow, Rf],)"
11,http://www.isi.edu/gaia/entities/2e8e9dbe-07b5-44ae-92bd-82f36d673dab,ldcOnt:GPE.UrbanArea.City,"(Севастополь, Севастополь, Севастополь)",IC0015JFY,"(LDC2019E43:4926267, LDC2019E43:694422, LDC2019E43:694423)","(0.002142857, 0.4954, 0.4993)","(m.0dlvj, m.02q172s)","(0.6444659938, 0.2478182763)","(0.927274704, 0.2478182763)","(http://www.wikidata.org/entity/Q7525, http://www.wikidata.org/entity/Q177373)","(Sevastopol, FC Sevastopol)","(Севастополь, Севастополь)","(Севастополь, Севастополь)","((Aqyar, Sebastopol, Sevastopol', Sewastupolis, Sewastúpolis, Σεβαστούπολις, Ахтіар, Севастополь), ())","((Ахтиар,), ())","((), (ФК «Севасто́поль»,))",ru,"((Sevastopol , Sevastopol ),)","[sevastopol, sevastopol, sevastopol]","[sevastopol, sevastopol]","[sevastopol, sevastopol]","([ahtiar], None)","(None, [None])"
15,http://www.isi.edu/gaia/entities/90c6f6ec-b605-438f-a5cb-89ea35f628b9,ldcOnt:PER.Politician.HeadOfGovernment,"(Путина,)",IC0015JFY,,,,,,,,,,,,,ru,"((Putin. Now Putin ,),)",[putin's],,,,
16,http://www.isi.edu/gaia/entities/b7291b40-d96b-4835-98eb-fba70e5d6251,ldcOnt:PER.Politician.HeadOfGovernment,"(Виктором Федоровичем Януковичем,)",IC0015JFY,,,,,,,,,,,,,ru,,[None],,,,


## Dumping data for wikifier

In [450]:
# # export all people names
# df_entity_per = df_entity[df_entity['type'].str.startswith('ldcOnt:PER')]
# df_entity_per.head()

In [451]:
# df_entity_per.to_hdf('store_data/all_entity_per.h5', 'entity', mode='w', format='fixed')
# df_entity_per.to_csv('store_data/all_entity_per.csv')

In [452]:
# pd.read_hdf('store_data/all_entity_per.h5').head()

In [453]:
# # export all org names
# df_entity_org = df_entity[df_entity['type'].str.startswith('ldcOnt:ORG')]
# df_entity_org.to_hdf('store_data/all_entity_org.h5', 'entity', mode='w', format='fixed')
# df_entity_org.to_csv('store_data/all_entity_org.csv')

In [454]:
# df_entity_org.head()

## continue

In [455]:
# remove non-name entities
df_entity = df_entity[df_entity['name'].notnull()]
df_entity.head()

Unnamed: 0,e,type,name,source,targets,target_scores,fbid,fbid_score_avg,fbid_score_max,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk,lang,label,transl_name,transl_label_ru,transl_label_uk,transl_alias_ru,transl_alias_uk
1,http://www.isi.edu/gaia/entities/20ed5fc6-502c-4a2b-85dc-6e4e5fbca1ab,ldcOnt:GPE.UrbanArea.City,"(Хабаровск, Хабаровск, Хабаровск)",IC0015JFY,"(LDC2019E43:2022890, LDC2019E43:2022888, LDC2019E43:6300934)","(0.9701, 0.024, 0.003)","(m.010f1kyh,)","(0.5627072627,)","(0.7758610249,)","(None,)","(None,)","(None,)","(None,)","(None,)","(None,)","(None,)",ru,"((Khabarovsk ,),)","[None, None, None]",[None],[None],"(None,)","(None,)"
5,http://www.isi.edu/gaia/entities/4547c11a-b636-4732-94aa-40aec264a13b,ldcOnt:GPE.ProvinceState.ProvinceState,"(Москву, Москву, Москву, Москвы, Москвы, Москвы)",IC0015JFY,"(LDC2019E43:524894, LDC2019E43:524901, LDC2019E43:5601538)","(0.4948, 0.4948, 0.007644459)","(m.04swd,)","(0.7611546613,)","(1.0,)","(http://www.wikidata.org/entity/Q649,)","(Moscow,)","(Москва,)","(Москва,)","((Moskva, City of Moscow, Moscow, Russia, Moscow, Russian Federation, Moscow, Russian SFSR, Moscow, Soviet Union, Moscow, USSR, Moskva Federal City, Russia, Moskva, Russia, Москва),)","((Москва (город), Москва (Россия), Москва Златоглавая, Москва, Россия, Первопрестольная, Порт пяти морей, Третий Рим),)","((Москва (місто), Москва (Росія), Москва, Росія, Москва, РФ),)",ru,"((Moscow , Moscow, ), (Moscow , Moscow ))","[moscow, moscow, moscow, moscow, moscow, moscow]",[moscow],[moscow],"([moscow (city), Moscow, Russia), moscow gold-domed, Moscow, Russia, mother of god, port of the five seas, third rome],)","([moscow city, moscow (russia), Moscow, Russia, Moscow, Rf],)"
11,http://www.isi.edu/gaia/entities/2e8e9dbe-07b5-44ae-92bd-82f36d673dab,ldcOnt:GPE.UrbanArea.City,"(Севастополь, Севастополь, Севастополь)",IC0015JFY,"(LDC2019E43:4926267, LDC2019E43:694422, LDC2019E43:694423)","(0.002142857, 0.4954, 0.4993)","(m.0dlvj, m.02q172s)","(0.6444659938, 0.2478182763)","(0.927274704, 0.2478182763)","(http://www.wikidata.org/entity/Q7525, http://www.wikidata.org/entity/Q177373)","(Sevastopol, FC Sevastopol)","(Севастополь, Севастополь)","(Севастополь, Севастополь)","((Aqyar, Sebastopol, Sevastopol', Sewastupolis, Sewastúpolis, Σεβαστούπολις, Ахтіар, Севастополь), ())","((Ахтиар,), ())","((), (ФК «Севасто́поль»,))",ru,"((Sevastopol , Sevastopol ),)","[sevastopol, sevastopol, sevastopol]","[sevastopol, sevastopol]","[sevastopol, sevastopol]","([ahtiar], None)","(None, [None])"
15,http://www.isi.edu/gaia/entities/90c6f6ec-b605-438f-a5cb-89ea35f628b9,ldcOnt:PER.Politician.HeadOfGovernment,"(Путина,)",IC0015JFY,,,,,,,,,,,,,ru,"((Putin. Now Putin ,),)",[putin's],,,,
16,http://www.isi.edu/gaia/entities/b7291b40-d96b-4835-98eb-fba70e5d6251,ldcOnt:PER.Politician.HeadOfGovernment,"(Виктором Федоровичем Януковичем,)",IC0015JFY,,,,,,,,,,,,,ru,,[None],,,,


In [456]:
df_entity.shape

(40637, 23)

In [457]:
# validate related df columns have same dimension
for r in df_entity.iterrows():
    r = r[1]
    if r['targets'] and len(r['targets']) != len(r['target_scores']):
        print('Invalid target_scores', r['e'])
        break
        
    if r['fbid']:
        if len(r['fbid']) != len(r['fbid_score_avg']) or len(r['fbid']) != len(r['fbid_score_max']):
            print('Invalid fbid_score_avg or fbid_score_max', r['e'])
            break
        if r['wikidata'] and len(r['fbid']) != len(r['wikidata']):
            print('Invalid wikidata', r['e'])
            break
        if r['wiki_label_en'] and len(r['fbid_score_avg']) != len(r['wiki_label_en']):
            print('Invalid wiki_label_en', r['e'])
            break
        if r['wiki_label_ru'] and len(r['fbid_score_avg']) != len(r['wiki_label_ru']):
            print('Invalid wiki_label_ru', r['e'])
            break
        if r['wiki_label_uk'] and len(r['fbid_score_avg']) != len(r['wiki_label_uk']):
            print('Invalid wiki_label_uk', r['e'])
            break
        if r['transl_label_ru'] and len(r['fbid_score_avg']) != len(r['transl_label_ru']):
            print('Invalid transl_label_ru', r['e'])
            break
        if r['transl_label_uk'] and len(r['fbid_score_avg']) != len(r['transl_label_uk']):
            print('Invalid transl_label_uk', r['e'])
            break
else:
    print('Valid')

Valid


In [13]:
from collections import defaultdict
kb_names = defaultdict(lambda: {'type': None, 'names': []})

# entities
with open(os.path.join(kg_tab_dir_path, 'entities.tab')) as f:
    for idx, line in enumerate(f):
        if idx == 0:
            continue
        line = line.strip().split('\t')
        type_, id_, name1 = line[1], line[2], line[3]
        kb_names[id_]['type'] = type_
        kb_names[id_]['names'].append(name1)
        if len(line) >= 5:
            name2 = line[4]
            kb_names[id_]['names'].append(name2)
    
# alternative names
with open(os.path.join(kg_tab_dir_path, 'alternate_names.tab')) as f:
    for idx, line in enumerate(f):
        if idx == 0:
            continue
        line = line.strip().split('\t')
        id_, name_ = line[0], line[1]
        kb_names[id_]['names'].append(name_)

In [459]:
len(kb_names)

10215675

In [460]:
# samples of kb names
for i, (k, v) in enumerate(kb_names.items()):
    if i >= 10:
        break
    print(k, v)

2986043 {'type': 'LOC', 'names': ['Pic de Font Blanca', 'Pic de Font Blanca', 'Pic de Font Blanca', 'Pic du Port']}
2993838 {'type': 'LOC', 'names': ['Pic de Mil-Menut', 'Pic de Mil-Menut', 'Pic de Mil-Menut']}
2994701 {'type': 'LOC', 'names': ['Roc Mélé', 'Roc Mele', 'Roc Mele', 'Roc Meler', 'Roc Mélé']}
3007683 {'type': 'LOC', 'names': ['Pic des Langounelles', 'Pic des Langounelles', 'Pic des Langounelles']}
3017832 {'type': 'LOC', 'names': ['Pic de les Abelletes', 'Pic de les Abelletes', 'Pic de la Font-Negre', 'Pic de la Font-Nègre', 'Pic de les Abelletes']}
3017833 {'type': 'LOC', 'names': ['Estany de les Abelletes', 'Estany de les Abelletes', 'Estany de les Abelletes', 'Etang de Font-Negre', 'Étang de Font-Nègre']}
3023203 {'type': 'LOC', 'names': ['Port Vieux de la Coume d’Ose', "Port Vieux de la Coume d'Ose", "Port Vieux de Coume d'Ose", 'Port Vieux de Coume d’Ose', "Port Vieux de la Coume d'Ose", 'Port Vieux de la Coume d’Ose']}
3029315 {'type': 'LOC', 'names': ['Port de la Ca

In [461]:
# detect if all target ids appear in KB
wild_ids = set([])
for _, targets in df_entity['targets'].iteritems():
    if targets:
        for t in targets:
            t_ = t.split(':')[1]
            if t_ not in kb_names:
                wild_ids.add(t)
for i, id_ in enumerate(wild_ids):
    if i >= 10:
        break
    print(id_)

# Create RLTK components

In [462]:
# df_entity.head()

In [463]:
def flatten(list_, remove_none=True):
    if isinstance(list_, (list, tuple)):
        ret = []
        for l in list_:
            for n in flatten(l):
                ret.append(n)
        return ret
    else:
        if remove_none and not list_:
            return []
        return [list_]

# flatten([1,2,3,4, [2,2,4],[13,[2,3,2]]])

In [464]:
from operator import itemgetter

def top_score_indices(list_, num):
    return sorted(range(len(list_)), key=lambda i: list_[i])[-num:]

@rltk.set_id('e')
class GaiaRecord(rltk.AutoGeneratedRecord):
    
    @rltk.cached_property
    def augmented_names(self):
        ret = []
        targets = self.raw_object['targets']
        if targets:
            for t in targets:
                kb_prefix, t = t.split(':')
                if kb_prefix != 'LDC2019E43':
                    ret.append([])
                    continue
                    
                names = kb_names.get(t)
                if names:
                    ret.append(names['names'])
                else:
                    ret.append([])
            return ret
        
    @rltk.cached_property
    def selected_wikidata(self):
        if self.fbid_score_avg:
            selected_indice = top_score_indices(self.fbid_score_avg, 1)
            return flatten(itemgetter(*selected_indice)(self.wikidata))
        
    @rltk.cached_property
    def selected_fbid(self):
        if self.fbid_score_avg:
            selected_indice = top_score_indices(self.fbid_score_avg, 1)
            return flatten(itemgetter(*selected_indice)(self.fbid))
    
    @rltk.cached_property
    def selected_targets(self):
        if self.target_scores:
            selected_indice = top_score_indices(self.target_scores, 1)
            return flatten(itemgetter(*selected_indice)(self.targets))
        
    
    @rltk.cached_property
    def selected_wikidata_label_en(self):
        if self.fbid_score_avg:
            selected_indice = top_score_indices(self.fbid_score_avg, 1)
        
            # wikidata labels and translation (based on freebase)
            if self.wiki_label_en:
                return flatten(itemgetter(*selected_indice)(self.wiki_label_en))
    
#     @property
    @rltk.cached_property
    def concatenated_labels(self):
        ret = []
        
        # name and translation
        if self.name:
            ret += self.name
        if self.transl_name:
            ret += self.transl_name
        
        # target labels
        if self.target_scores:
            selected_indice = top_score_indices(self.target_scores, 1)
            ret += flatten(itemgetter(*selected_indice)(self.augmented_names))
        
        if self.fbid_score_avg:
            selected_indice = top_score_indices(self.fbid_score_avg, 1)
        
            # wikidata labels and translation (based on freebase)
            if self.wiki_label_en:
                ret += flatten(itemgetter(*selected_indice)(self.wiki_label_en))
            if self.wiki_label_ru:
                ret += flatten(itemgetter(*selected_indice)(self.wiki_label_ru))
            if self.wiki_label_uk:
                ret += flatten(itemgetter(*selected_indice)(self.wiki_label_uk))
            if self.transl_label_ru:
                ret += flatten(itemgetter(*selected_indice)(self.transl_label_ru))
            if self.transl_label_uk:
                ret += flatten(itemgetter(*selected_indice)(self.transl_label_uk))

            # wikidata alias and translation
            if self.wiki_label_en:
                ret += flatten(itemgetter(*selected_indice)(self.wiki_label_en))
            if self.wiki_alias_ru:
                ret += flatten(itemgetter(*selected_indice)(self.wiki_alias_ru))
            if self.wiki_alias_uk:
                ret += flatten(itemgetter(*selected_indice)(self.wiki_alias_uk))
            if self.transl_alias_ru:
                ret += flatten(itemgetter(*selected_indice)(self.transl_alias_ru))
            if self.transl_alias_uk:
                ret += flatten(itemgetter(*selected_indice)(self.transl_alias_uk))
        
        return set(ret)
        
ds = rltk.Dataset(reader=rltk.DataFrameReader(df_entity), record_class=GaiaRecord)

In [465]:
for r in ds.head():
    print(r.id, r.name, r.targets, r.fbid, r.selected_wikidata, r.wiki_label_en, r.wiki_alias_en, r.selected_fbid, r.selected_targets, r.concatenated_labels)

http://www.isi.edu/gaia/entities/20ed5fc6-502c-4a2b-85dc-6e4e5fbca1ab ('Хабаровск', 'Хабаровск', 'Хабаровск') ('LDC2019E43:2022890', 'LDC2019E43:2022888', 'LDC2019E43:6300934') ('m.010f1kyh',) [] (None,) (None,) ['m.010f1kyh'] ['LDC2019E43:2022890'] {'KHV', 'Khabàrovsk', 'Chabarofsk', 'Champarofsk', "Khabarovs'k", 'Горад Хабараўск', 'Khbarovsk', 'habarofusuku', '伯力', 'ہابروسک', 'خابارووسک', 'Chabarovsk', 'ハバロフスク', 'khabarovska', 'חברובסק', 'خاباروفسك', 'Χαμπάροφσκ', 'hbrwbsq', 'खबारोव्स्क', 'Habarovsk', 'Khabarovskaj', None, 'خاباروفسک', 'Chabarowsk', 'ख़ाबारोव्स्क', 'khabarwwsk', 'bo li', 'Khabarovsk', 'Habarovska', 'Хабаровськ', 'Chabarovskas', 'Ĥabarovsk', 'Khabarovka', 'khabarwfsk', 'habalobseukeu', '하바롭스크', 'Хабаровск', 'Хабаровскай', 'Jabárovsk', 'Խաբարովսկ', 'Jabarovsk', 'Xabarovsk', 'Habarovszk', 'Gorad Khabarausk'}
http://www.isi.edu/gaia/entities/4547c11a-b636-4732-94aa-40aec264a13b ('Москву', 'Москву', 'Москву', 'Москвы', 'Москвы', 'Москвы') ('LDC2019E43:524894', 'LDC2019E43

In [466]:
def distribution_of_block(blocks):
    num_in_block = []
    for b, data in blocks.key_set_adapter:
        num_in_block.append(len(data))

    from collections import Counter
    return dict(sorted(Counter(num_in_block).items()))

In [467]:
bg_kb = rltk.TokenBlockGenerator()
blocks_kb = bg_kb.block(ds, function_=lambda r: list(r.selected_targets) if r.selected_targets else ['None'])

In [468]:
sum(1 for _ in blocks_kb.key_set_adapter)

2128

In [469]:
distinct_entity_ids = set([])
for bid, data in blocks_kb.key_set_adapter:
    if bid == 'None':
        continue
    for _, rid in data:
        distinct_entity_ids.add(rid)
len(distinct_entity_ids)

14503

In [470]:
# distribution_of_block(blocks_kb)

In [471]:
bg_fb = rltk.TokenBlockGenerator()
blocks_fb = bg_fb.block(ds, function_=lambda r: r.selected_fbid if r.selected_fbid else ['None'])

In [472]:
sum(1 for _ in blocks_fb.key_set_adapter)

5641

In [473]:
# distribution_of_block(blocks_fb)

# Cluster baseline

In [474]:
# MAX_DISTANCE = 999999
class Cluster(object):
    def __init__(self, ds):
        self.attractive_records = set([])  # contribute to clustering
        self.all_records = set([])
        self.ds = ds
        self.type = None
        self.wd_id = set([])
        self.kb_id = set([])
        self.fb_id = set([])
    
    @staticmethod
    def record_score(r1, r2):
        score = rltk.jaccard_index_similarity(set(r1.concatenated_labels), set(r2.concatenated_labels))
        return score

    def similarity(self, r):
#         if r.type != self.type:
#             return MAX_DISTANCE
        
        score =  max([self.record_score(r, self.ds.get_record(rr)) for rr in self.attractive_records])
#         if score == 0:
#             return MAX_DISTANCE
        return score

    def add(self, r, contribute=True):
        if isinstance(r, rltk.Record):
            r = r.id
        if contribute:
            self.attractive_records.add(r)
        self.all_records.add(r)

In [475]:
# build cluster based on type
all_clusters = []
for bid, data in blocks_kb.key_set_adapter:
    if bid == 'None':
        continue
    
    c = Cluster(ds)
    for _, r_id in data:
        r = ds.get_record(r_id)
        for id_ in r.selected_targets:
            c.kb_id.add(id_)
        if r.fbid:
            for id_ in r.selected_fbid:
                c.fb_id.add(id_)
        if r.wikidata:
            for id_ in r.selected_wikidata:
                c.wd_id.add(id_)
        c.add(r)
    all_clusters.append(c)

In [476]:
for idx, c in enumerate(all_clusters):
    if idx == 20:
        break
    print(c.kb_id, c.fb_id, c.wd_id)

{'LDC2019E43:2022890'} {'m.010f1kyh'} set()
{'LDC2019E43:524901'} {'m.04swd', 'm.06pr6'} {'http://www.wikidata.org/entity/Q656', 'http://www.wikidata.org/entity/Q649'}
{'LDC2019E43:694423'} {'m.0dlvj'} {'http://www.wikidata.org/entity/Q7525'}
{'LDC2019E43:1004464'} {'m.032yv'} {'http://www.wikidata.org/entity/Q42225'}
{'LDC2019E43:518255'} {'m.04fx9m'} {'http://www.wikidata.org/entity/Q15760'}
{'LDC2019E43:703884'} {'m.0840w', 'm.0ndwhc3', 'm.02h83h', 'm.0b26yk'} {'http://www.wikidata.org/entity/Q1020077', 'http://www.wikidata.org/entity/Q170672', 'http://www.wikidata.org/entity/Q7835', 'http://www.wikidata.org/entity/Q7817'}
{'LDC2019E43:491422'} {'m.0m465'} {'http://www.wikidata.org/entity/Q39420'}
{'LDC2019E43:630673'} {'m.015h7'} {'http://www.wikidata.org/entity/Q166'}
{'LDC2019E43:688533'} {'m.01jbk6'} {'http://www.wikidata.org/entity/Q128499'}
{'LDC2019E43:703448'} {'m.021g_z', 'm.02sn34', 'm.09c7w0'} {'http://www.wikidata.org/entity/Q40855', 'http://www.wikidata.org/entity/Q30',

In [477]:
# # build cluster based on ids
# all_clusters = []

# for bid, data in blocks_kb.key_set_adapter:
#     if bid == 'None':
#         continue
    
#     c = Cluster(ds)
#     for _, r_id in data:
#         r = ds.get_record(r_id)
#         for id_ in r.selected_targets:
#             c.kb_id.add(id_)
#         c.add(r)
#     all_clusters.append(c)

# for bid, data in blocks_fb.key_set_adapter:
#     if bid == 'None':
#         continue
    
#     c = Cluster(ds)
#     for _, r_id in data:
#         r = ds.get_record(r_id)
#         for id_ in r.selected_fbid:
#             c.fb_id.add(id_)
#         c.add(r)
#     all_clusters.append(c)

In [478]:
for idx, c in enumerate(all_clusters):
    if idx == 5:
        break
    print(c.kb_id, c.fb_id, c.wd_id)

{'LDC2019E43:2022890'} {'m.010f1kyh'} set()
{'LDC2019E43:524901'} {'m.04swd', 'm.06pr6'} {'http://www.wikidata.org/entity/Q656', 'http://www.wikidata.org/entity/Q649'}
{'LDC2019E43:694423'} {'m.0dlvj'} {'http://www.wikidata.org/entity/Q7525'}
{'LDC2019E43:1004464'} {'m.032yv'} {'http://www.wikidata.org/entity/Q42225'}
{'LDC2019E43:518255'} {'m.04fx9m'} {'http://www.wikidata.org/entity/Q15760'}


In [479]:
# label_to_cluster_index = defaultdict(list)
# for c in all_clusters:
#     for rid in c.attractive_records:
#         for l in ds.get_record(rid).concatenated_labels:
#             label_to_cluster_index[l].append(c)

In [480]:
count_of_freebase_in_cluster = set([])
for c in all_clusters:
    count_of_freebase_in_cluster.add(len(c.fb_id))
count_of_freebase_in_cluster

{0, 1, 2, 3, 4, 5, 6, 7, 14}

In [481]:
# old approach
# # merge clusters
# merged_blocks = set([])
# for c in all_clusters:
#     for fb_id in c.fb_id:
#         for _, r_id in blocks_fb.get(fb_id):
#             c.add(r_id)
#         merged_blocks.add(fb_id)

# # only has fb id
# for bid, data in blocks_fb.key_set_adapter:
#     if bid == 'None' or bid in merged_blocks:
#         continue
    
#     c = Cluster(ds)
#     for _, r_id in data:
#         r = ds.get_record(r_id)
#         c.fb_id.add(r.freebase)
#         c.add(r)
#     all_clusters.append(c)

# only has freebase id
fb_only_clusters = {}
for bid, data in blocks_fb.key_set_adapter:
    if bid == 'None':
        continue
        
    fb_only_clusters[bid] = set()
    for _, r_id in data:
        r = ds.get_record(r_id)
        if r.selected_targets:
            continue
        fb_only_clusters[bid].add(r_id)
    if len(fb_only_clusters[bid]) == 0:
        del fb_only_clusters[bid]

for bid, cluster in fb_only_clusters.items():
    c = Cluster(ds)
    for r_id in cluster:
        c.add(r_id)
        r = ds.get_record(r_id)
        if r.fbid:
            for id_ in r.selected_fbid:
                c.fb_id.add(id_)
        if r.wikidata:
            for id_ in r.selected_wikidata:
                c.wd_id.add(id_)
    all_clusters.append(c)

In [482]:
len(fb_only_clusters)

3891

In [483]:
len_all_clusters = len(all_clusters)
len_all_clusters

6018

In [484]:
for idx, c in enumerate(all_clusters):
    if len(c.kb_id) > 1:
        print('mulitple kb_ids in cluster', c.kb_id)
        break
        
    kb_ids = set()
    for r_id in c.all_records:
        r = ds.get_record(r_id)
        if r.selected_targets:
            for id_ in r.selected_targets:
                kb_ids.add(id_)
    if len(kb_ids) > 1:
        print('mulitple kb_ids in cluster', kb_ids, c.kb_id)
        break
else:
    print('No multi-targets detected')

No multi-targets detected


In [485]:
# type normalization
# normalize_types = {
#     'ldcOnt:GPE': 'GeoLoc',
#     'ldcOnt:GPE.UrbanArea.City': 'GeoLoc',
#     'ldcOnt:LOC': 'GeoLoc',
#     'ldcOnt:LOC.Land': 'GeoLoc',
#     'ldcOnt:LOC.Land.Continent': 'GeoLoc',
#     'ldcOnt:LOC.Position.Region': 'GeoLoc',
#     'ldcOnt:ORG': 'Organization',
#     'ldcOnt:ORG.Association': 'Organization',
#     'ldcOnt:ORG.Association.Team': 'Organization',
#     'ldcOnt:ORG.CommercialOrganization': 'Organization',
#     'ldcOnt:ORG.CommercialOrganization.BroadcastingCompany': 'Organization',
#     'ldcOnt:ORG.CommercialOrganization.NewsAgency': 'Organization',
#     'ldcOnt:ORG.Government.Agency': 'Organization',
#     'ldcOnt:ORG.Government.Council': 'Organization',
#     'ldcOnt:ORG.Government.LawEnforcementAgency': 'Organization',
#     'ldcOnt:ORG.Government.LegislativeBody': 'Organization',
#     'ldcOnt:ORG.International': 'Organization',
#     'ldcOnt:ORG.MilitaryOrganization': 'Organization',
#     'ldcOnt:ORG.MilitaryOrganization.GovernmentArmedForces': 'Organization',
#     'ldcOnt:ORG.MilitaryOrganization.NonGovernmentMilitia': 'Organization',
#     'ldcOnt:ORG.PoliticalOrganization.Party': 'Organization',
#     'ldcOnt:PER': 'Person',
#     'ldcOnt:PER.Combatant': 'Person',
#     'ldcOnt:PER.MilitaryPersonnel': 'Person',
#     'ldcOnt:PER.MilitaryPersonnel.MilitaryOfficer': 'Person',
#     'ldcOnt:PER.Politician': 'Person',
#     'ldcOnt:PER.Politician.HeadOfGovernment': 'Person',
#     'ldcOnt:PER.ProfessionalPosition.Ambassador': 'Person',
#     'ldcOnt:PER.ProfessionalPosition.Minister': 'Person',
#     'ldcOnt:PER.ProfessionalPosition.Scientist': 'Person',
#     'ldcOnt:PER.ProfessionalPosition.Spokesperson': 'Person',
# }

def normalize_type(t):
    type_prefix = t.split('.')[0][len('ldcOnt:'):]
    if type_prefix in ('GPE', 'LOC'):
        return 'GeoLoc'
    return type_prefix

for t in selected_types:
    print(t, normalize_type(t))

ldcOnt:PER.ProfessionalPosition.Scientist PER
ldcOnt:PER.Combatant.Sniper PER
ldcOnt:ORG.Association ORG
ldcOnt:ORG.MilitaryOrganization.GovernmentArmedForces ORG
ldcOnt:ORG.Government.Railway ORG
ldcOnt:ORG.Government.LegislativeBody ORG
ldcOnt:PER.MilitaryPersonnel.MilitaryOfficer PER
ldcOnt:PER.ProfessionalPosition.Ambassador PER
ldcOnt:GPE.Country.Country GeoLoc
ldcOnt:PER.ProfessionalPosition.Spokesperson PER
ldcOnt:ORG.PoliticalOrganization.Party ORG
ldcOnt:PER.Combatant.Mercenary PER
ldcOnt:LOC.Position.AirSpace GeoLoc
ldcOnt:PER.MilitaryPersonnel PER
ldcOnt:ORG.CommercialOrganization.BroadcastingCompany ORG
ldcOnt:LOC.Position GeoLoc
ldcOnt:GPE.OrganizationOfCountries.OrganizationOfCountries GeoLoc
ldcOnt:GPE.ProvinceState.ProvinceState GeoLoc
ldcOnt:GPE.UrbanArea.City GeoLoc
ldcOnt:ORG.Government.LawEnforcementAgency ORG
ldcOnt:PER.Politician.Governor PER
ldcOnt:PER.ProfessionalPosition PER
ldcOnt:LOC.Position.Field GeoLoc
ldcOnt:ORG.Association.Team ORG
ldcOnt:ORG.Internation

In [486]:
# split based on types
all_clusters_splitted = []
for c in all_clusters:
    types = {}
    for r_id in c.all_records:
        r = ds.get_record(r_id)
        type_ = normalize_type(r.type)
        if type_ not in types:
            cc = Cluster(ds)
            cc.type = type_
            types[type_] = cc
            
        cc = types[type_]
        cc.add(r_id)
        if r.selected_targets:
            for id_ in r.selected_targets:
                cc.kb_id.add(id_)
        if r.selected_fbid:
            for id_ in r.selected_fbid:
                cc.fb_id.add(id_)
        if r.selected_wikidata:
            for id_ in r.selected_wikidata:
                cc.wd_id.add(id_)
    for cc in types.values():
        all_clusters_splitted.append(cc)

In [487]:
len_all_clusters_splitted = len(all_clusters_splitted)
len_all_clusters_splitted

6124

In [488]:
# # generate most common label
# most_common_labels = []
# for c in all_clusters_splitted:
#     name_count = defaultdict(int)
#     for rid in c.all_records:
#         for n in ds.get_record(rid).name:
#             name_count[n] += 1
# #     sorted_names = sorted(name_count.items(), key=lambda x: x[1], reverse=True)
# #     if sorted_names[0][0] == 'Житомирі вночі':
# #         print(sorted_names)
#     most_common_labels.append(sorted(name_count.items(), key=lambda x: x[1], reverse=True)[0][0])

# # for idx, l in enumerate(most_common_labels):
# #     if l == 'Житомирі вночі':
# #         print(idx)

In [489]:
# # merge
# deduped_all_clusters_splitted = {}
# merged_from = {}
# for idx, c in enumerate(all_clusters_splitted):
#     key = most_common_labels[idx] + ' ' + c.type
#     if key not in deduped_all_clusters_splitted:
#         deduped_all_clusters_splitted[key] = Cluster(ds)
#         deduped_all_clusters_splitted[key].main_label = most_common_labels[idx]
#         deduped_all_clusters_splitted[key].type = c.type
#         merged_from[key] = []
        
#     deduped_all_clusters_splitted[key].all_records |= c.all_records
#     deduped_all_clusters_splitted[key].attractive_records |= c.attractive_records
#     deduped_all_clusters_splitted[key].wd_id |= c.wd_id
#     deduped_all_clusters_splitted[key].kb_id |= c.kb_id
#     deduped_all_clusters_splitted[key].fb_id |= c.fb_id
#     merged_from[key].append(list(c.all_records))
        

In [490]:
# deduped_all_clusters_splitted_list = list(deduped_all_clusters_splitted.values())

In [491]:
# for idx, c in enumerate(deduped_all_clusters_splitted_list):
#     if len(c.kb_id) > 1:
#         print(c.main_label, len(c.all_records), len(c.attractive_records),len(c.kb_id), list(c.kb_id)[:10])
# #     if idx == 100:
# #         break

In [492]:
# for idx, c in enumerate(deduped_all_clusters_splitted_list):
#     if len(c.kb_id) > 1:
#         print(c.main_label, len(c.all_records), len(c.attractive_records),len(c.kb_id), list(c.kb_id)[:10])
# #     if idx == 100:
# #         break

In [493]:
# for c in merged_from['Житомирі вночі GeoLoc']:
#     for idx, rid in enumerate(c):
#         r = ds.get_record(rid)
#         print(rid, r.name, r.type, r.selected_targets)
#         if idx == 10:
#             break
        
#     break

In [494]:
from statistics import mean, median

def compute_statistics(scores):
    return {
        'min': min(scores),
        'max': max(scores),
        'median': median(scores),
        'average': mean(scores)
    }

def debug_output(c, output_type):
    # aggregate target
    target_agg_confidence = defaultdict(list)
    for rid in c.all_records:
        r = ds.get_record(rid)
        if r.targets:
            for idx, t in enumerate(r.targets):
                target_agg_confidence[t].append(r.target_scores[idx])
                
    # aggregate wikidata
    wikidata_agg_confidence = defaultdict(list)
    for rid in c.all_records:
        r = ds.get_record(rid)
        if r.wikidata:
            for idx, t in enumerate(r.wikidata):
                wikidata_agg_confidence[t].append(r.fbid_score_avg[idx])
                
    # generate json
    j = {'attractive_records': list(c.attractive_records), 'all_records': {}, 'output_type': output_type, 'type': c.type,
        'kb_id': list(c.kb_id), 'fb_id': list(c.fb_id), 'wd_id': list(c.wd_id),
        'kb_statistics': {k: compute_statistics(v) for k, v in target_agg_confidence.items()},
        'wd_statistics': {k: compute_statistics(v) for k, v in wikidata_agg_confidence.items()}
    }
    for rid in c.all_records:
        j['all_records'][rid] = ds.get_record(rid).__dict__
        if 'concatenated_labels' in j['all_records'][rid]:
            del j['all_records'][rid]['concatenated_labels']
#         j['all_records'][rid]['concatenated_labels'] = list(j['all_records'][rid]['concatenated_labels'])
    return j

In [495]:
# import json
# with open('clusters-baseline-20190712-rpi-002.jl', 'w') as f:
#     for c in all_clusters_splitted:
#         f.write(json.dumps(list(c.all_records)) + '\n')
# with open('clusters-baseline-20190710-001-debug.jl', 'w') as f:
#     for c in deduped_all_clusters_splitted_list:
#         f.write(json.dumps(debug_output(c, 'baseline')) + '\n')

# Cluster singletons

In [496]:
df_singleton = df_entity[df_entity['targets'].isnull() & df_entity['fbid'].isnull()]['e']

In [497]:
df_entity[df_entity['targets'].isnull() & df_entity['fbid'].isnull()].head()

Unnamed: 0,e,type,name,source,targets,target_scores,fbid,fbid_score_avg,fbid_score_max,wikidata,wiki_label_en,wiki_label_ru,wiki_label_uk,wiki_alias_en,wiki_alias_ru,wiki_alias_uk,lang,label,transl_name,transl_label_ru,transl_label_uk,transl_alias_ru,transl_alias_uk
15,http://www.isi.edu/gaia/entities/90c6f6ec-b605-438f-a5cb-89ea35f628b9,ldcOnt:PER.Politician.HeadOfGovernment,"(Путина,)",IC0015JFY,,,,,,,,,,,,,ru,"((Putin. Now Putin ,),)",[putin's],,,,
16,http://www.isi.edu/gaia/entities/b7291b40-d96b-4835-98eb-fba70e5d6251,ldcOnt:PER.Politician.HeadOfGovernment,"(Виктором Федоровичем Януковичем,)",IC0015JFY,,,,,,,,,,,,,ru,,[None],,,,
41,http://www.isi.edu/gaia/entities/d19e9ae1-3ee7-42ff-b63b-76926b3a200d,ldcOnt:PER.Politician.HeadOfGovernment,"(байкера Хирурга,)",IC0015JFY,,,,,,,,,,,,,ru,,[байкера Хирурга],,,,
52,http://www.isi.edu/gaia/entities/47516ee1-3744-4d38-9dab-a2f2d903d896,ldcOnt:GPE,"(Керченскому,)",IC0015JFY,,,,,,,,,,,,,ru,,[None],,,,
66,http://www.isi.edu/gaia/entities/67cfd11d-6c95-454a-b84b-d7df6d39d745,ldcOnt:GPE.Country.Country,"(субъект Федерации,)",IC0015JFY,,,,,,,,,,,,,ru,,[None],,,,


In [498]:
singleton_ids = df_singleton.tolist()

In [499]:
len_singleton = len(singleton_ids)
len_singleton

16489

In [500]:
from copy import deepcopy
merged_all_clusters = deepcopy(all_clusters_splitted)
MIN_SIM = 0.4

for idx, rid in enumerate(singleton_ids):
    print('\r', idx, end='')
    added = False
    r = ds.get_record(rid)
    r_type = normalize_type(r.type)
    for c in merged_all_clusters:
        sim = c.similarity(r)
        if r_type != c.type:
            continue
        if sim >= MIN_SIM:
            c.add(r, contribute=False)
            added = True
    # still singleton
    if not added:
        c = Cluster(ds)
        c.type = r_type
        c.add(r)
        merged_all_clusters.append(c)

 16488

In [501]:
len(merged_all_clusters)

19209

In [502]:
len_singleton + len_all_clusters_splitted

22613

In [503]:
len_singleton + len_all_clusters_splitted - len(merged_all_clusters)

3404

In [504]:
# # find merged singletons
# select_count = 50
# for rid in singleton_ids:
#     for c in merged_all_clusters:
#         if rid in c.all_records and len(c.all_records) > 1:
#             for rid_ in c.all_records:
#                 r = ds.get_record(rid_)
#                 if r.targets:
#                     print('targets', r.targets, rid, c.all_records)
#                     print('-----------------')
#                     break
# #             print(c.all_records, rid)
# #             print('-----------------')
#             select_count -= 1
#             break
#     if select_count <= 0:
#         break

In [505]:
# r = ds.get_record('http://www.isi.edu/gaia/entities/50163156-1d27-45b2-9fca-011ad9f661ea')
# print(r.name, r.targets, r.concatenated_labels)
# r = ds.get_record('http://www.isi.edu/gaia/entities/d557fcd9-3c2a-4f3c-a749-5d4f5de1e1e5')
# print(r.name, r.targets, r.concatenated_labels)
# r = ds.get_record('http://www.isi.edu/gaia/entities/3804847e-12d5-4fba-96a6-cf8c78914e19')
# print(r.name, r.targets, r.concatenated_labels)
# r = ds.get_record('http://www.isi.edu/gaia/entities/d0e5b877-597d-4140-9197-f152a88fb932')
# print(r.name, r.targets, r.concatenated_labels)
# r = ds.get_record('http://www.isi.edu/gaia/entities/767f0133-7fdc-4a3e-8e15-e1203b8d9bfe')
# print(r.name, r.targets, r.concatenated_labels, r.selected_targets)
# r = ds.get_record('http://www.isi.edu/gaia/entities/5ea2c3d8-4348-48a3-82d4-b8eb3c4a2dcc')
# print(r.name, r.targets, r.concatenated_labels)
# r = ds.get_record('http://www.isi.edu/gaia/entities/de97de20-0cb4-4189-b8e4-26dec6033ba5')
# print(r.name, r.targets, r.selected_targets, r.concatenated_labels)

In [506]:
# r = ds.get_record('http://www.isi.edu/gaia/entities/c0aa107f-d4d6-43c8-81df-d4541dac9b76')
# print(r.name, r.targets, r.concatenated_labels)
# r = ds.get_record('http://www.isi.edu/gaia/entities/ea75c413-d572-42d7-86f6-d39d7dc180d8')
# print(r.name, r.targets, r.concatenated_labels)
# r = ds.get_record('http://www.isi.edu/gaia/entities/70ecb2fd-9c41-4378-b332-c872611102c8')
# print(r.name, r.targets, r.concatenated_labels)

# merge similar clusters

In [507]:
# # generate most common label
# merged_most_common_labels = []
# for c in merged_all_clusters:
#     name_count = defaultdict(int)
#     for rid in c.all_records:
#         for n in ds.get_record(rid).name:
#             name_count[n] += 1
#     merged_most_common_labels.append(sorted(name_count.items(), key=lambda x: x[1], reverse=True)[0][0])

In [508]:
# # merge
# # self.attractive_records = set([])  # contribute to clustering
# # self.all_records = set([])
# # self.ds = ds
# # self.type = None
# # self.wd_id = set([])
# # self.kb_id = set([])
# # self.fb_id = set([])
    
# deduped_merged_all_clusters = {}
# for idx, c in enumerate(merged_all_clusters):
#     key = merged_most_common_labels[idx] + ' ' + c.type
#     if key not in deduped_merged_all_clusters:
#         deduped_merged_all_clusters[key] = Cluster(ds)
#         deduped_merged_all_clusters[key].main_label = merged_most_common_labels[idx]
#         deduped_merged_all_clusters[key].type = c.type
        
#     deduped_merged_all_clusters[key].all_records |= c.all_records
#     deduped_merged_all_clusters[key].attractive_records |= c.attractive_records
#     deduped_merged_all_clusters[key].wd_id |= c.wd_id
#     deduped_merged_all_clusters[key].kb_id |= c.kb_id
#     deduped_merged_all_clusters[key].fb_id |= c.fb_id

In [509]:
# len(merged_all_clusters) - len(deduped_merged_all_clusters)

In [510]:
# all clusters have more than one kb id
for idx, c in enumerate(merged_all_clusters):
    if len(c.kb_id) > 1:
        print(c.main_label, len(c.all_records), len(c.attractive_records),len(c.kb_id), list(c.kb_id)[:10])
#     if idx == 100:
#         break

In [511]:
# all the clusters which have more than 1000 records
cnt = 0
for idx, c in enumerate(merged_all_clusters):
    if len(c.all_records) > 100:
#         print(c.main_label, len(c.all_records), len(c.attractive_records),len(c.kb_id), list(c.kb_id)[:10])
        cnt += 1
cnt

26

In [512]:
# for idx, c in enumerate(deduped_merged_all_clusters.values()):
#     if len(c.kb_id) > 1:
#         print(c.main_label, len(c.all_records), len(c.attractive_records),len(c.kb_id), list(c.kb_id)[:10])
# #     if idx == 100:
# #         break

In [513]:
# # all the clusters which have more than 1000 records
# for idx, c in enumerate(deduped_merged_all_clusters.values()):
#     if len(c.all_records) > 1000:
#         print(c.main_label, len(c.all_records), len(c.attractive_records),len(c.kb_id), list(c.kb_id)[:10])

In [514]:
import json

with open(output_path, 'w') as f:
    for c in merged_all_clusters:
        f.write(json.dumps(list(c.all_records)) + '\n')
# with open('clusters-20190710-001-debug.jl', 'w') as f:
#     for c in merged_all_clusters:
#         f.write(json.dumps(debug_output(c, 'complete')) + '\n')

# Augment labels for entities have no id