In [1]:
"""
lookup candidate entities and classes
"""
import os
import pandas as pd
import sys
import argparse

current_path = os.getcwd()
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input_dir',
    type=str,
    default=os.path.join(current_path, 'data'),
    help='Directory of input/output')
parser.add_argument(
    '--file_type',
    type=str,
    default='csv',
    help='File type')

FLAGS, unparsed = parser.parse_known_args()
# if not os.path.exists(FLAGS.input_dir):
#     os.mkdir(FLAGS.input_dir)

In [2]:
# Get all the csv files from the input directory
def get_data_files(data_folder):
    """
    A function used to get all the csv files from the input directory
    ...

    Attributes
    ----------
    data_folder : str
        the folder within  the working directory where the data is located
    """

    files = [] # a list of all filenames, including file extensions, that contain data
    csv_files = [] # same list as above but without the file extension

    # Get the list of files
    files = [f for f in os.listdir(FLAGS.input_dir+data_folder) if os.path.isfile(os.path.join(FLAGS.input_dir+data_folder, f))]
    csv_files = [f.replace(".csv","") for f in os.listdir(FLAGS.input_dir+data_folder) if os.path.isfile(os.path.join(FLAGS.input_dir+data_folder, f))]
    
    return csv_files

def get_target_cta_columns(target_config_file, csv_files, filter_col = True):
    """
    A function used to get which columns from the csv files need to be considered for the CTA. This is a subset of the file columns ignoring anything that is not an entity
    ...

    Attributes
    ----------
    target_config_file : str
        the file that contains the target column indices for each file
    csv_files : list
        the list of csv files that have the tabular data
    filter_col : boolean
        a flag to indicate whether we should narrow down the reading of the columns to only those targeted for the CTA task
    """
   
    target_col_file = os.path.join(FLAGS.input_dir, target_config_file)
    df_target_col = pd.read_csv(target_col_file,header=None, names=['filename','column_index'])
    
    # filter to only those files that are included in the csv_files
    df_target_col = df_target_col.loc[df_target_col['filename'].isin(csv_files)]
    
    # collapse all rows pertaining to the same file into one key value pair. The key is the filename and the value is the list with the column indices that should be considered
    # dict_target = {'CTRL_DBP_GEO_european_countries_capital_populated_cities': [0, 1, 2]}
    dict_target = dict()
    
    for index,row in df_target_col.iterrows():
        
        # is this is the first row with this file create the key
        if row['filename'] not in dict_target:
            dict_target[row['filename']]= []
            
        # append the new target column to the target column list for that file
        if filter_col:
            dict_target[row['filename']].append(row['column_index'])
    
    return dict_target

    

In [3]:
data_folder = '\lite'
# data_folder = '\\tables_full'

# Get the list of csv files with tabular data
csv_files = get_data_files(data_folder)
# csv_files = get_data_files('\\tables_full')

# Get the columns we need to consider for the CTA task
dict_target_col = get_target_cta_columns('CTA_DBP_Round1_Targets.csv', csv_files,True)

# dict_target_col

In [4]:
# data = list()
# data_folder = '\lite'
# has_header_row = True

# for file in dict_target_col:
#     element = dict()
#     element['filename'] = file
#     df_data = pd.DataFrame()
#     df_title = pd.DataFrame()
    
    
    
#     filename = file + '.' + FLAGS.file_type
#     tab_data_file = os.path.join(FLAGS.input_dir + data_folder, filename)
      
#     # read the file data in a dataframe. Also read the column titles if we need to use them
#     if len(dict_target_col[file])>0:
#         if has_header_row:
#             df_data = pd.read_csv(tab_data_file,header=None, skiprows=[0], usecols=dict_target_col[file])
#             df_title = pd.read_csv(tab_data_file,header=None, usecols=dict_target_col[file], nrows = 1)
#         else:
#             df_data = pd.read_csv(tab_data_file,header=None, usecols=dict_target_col[file])
#     else:
#         if has_header_row:
#             df_data = pd.read_csv(tab_data_file,header=None, skiprows=[0])
#             df_title = pd.read_csv(tab_data_file,header=None, nrows = 1)
#         else:
#             df_data = pd.read_csv(tab_data_file,header=None)

#     # add the column headers to the data dictionary
#     try:
#         element['column_titles'] = list(df_title.iloc[0,:])
#     except:
#         pass
    
#     file_element = dict()
#     for column in df_data.columns:
#         file_element[column] = list(set(df_data[column]))
#     element['data'] = file_element
    
#     element['dataframe'] = df_data    
#     data.append(element)

In [5]:
# data

In [6]:
def read_data(data_folder, dict_target_col, has_header_row = False):
    data = list()

    for file in dict_target_col:
        element = dict()
        element['filename'] = file
        df_data = pd.DataFrame()
        df_title = pd.DataFrame()



        filename = file + '.' + FLAGS.file_type
        tab_data_file = os.path.join(FLAGS.input_dir + data_folder, filename)

        # read the file data in a dataframe. Also read the column titles if we need to use them
        if len(dict_target_col[file])>0:
            if has_header_row:
                df_data = pd.read_csv(tab_data_file,header=None, skiprows=[0], usecols=dict_target_col[file])
                df_title = pd.read_csv(tab_data_file,header=None, usecols=dict_target_col[file], nrows = 1)
            else:
                df_data = pd.read_csv(tab_data_file,header=None, usecols=dict_target_col[file])
        else:
            if has_header_row:
                df_data = pd.read_csv(tab_data_file,header=None, skiprows=[0])
                df_title = pd.read_csv(tab_data_file,header=None, nrows = 1)
            else:
                df_data = pd.read_csv(tab_data_file,header=None)

        # add the column headers to the data dictionary
        try:
            element['column_titles'] = list(df_title.iloc[0,:])
        except:
            pass

        file_element = dict()
        for column in df_data.columns:
            file_element[column] = list(set(df_data[column]))
        element['data'] = file_element

        element['dataframe'] = df_data    
        data.append(element)
    
    return data
    

In [11]:
# data

In [8]:
import requests
import xml.etree.ElementTree as ET

def retrieve_dbpedia_classes (query_string, entity_classes, max_hits = 5):
    web_api = 'http://lookup.dbpedia.org/api/search/KeywordSearch?MaxHits=%s&QueryString=%s'
    dbo_prefix = 'http://dbpedia.org/ontology/'
    dbp_prefix = 'http://dbpedia.org/resource/'
    
#     entity_classes = dict()
    try:
        lookup_url = web_api % (max_hits, query_string)
#         print(lookup_url)
        lookup_res = requests.get(lookup_url)
        root = ET.fromstring(lookup_res.content)
        i=0
        for child in root:
            i+=1
#             print("\n\n\n")
#             print(child[1].text)
            entity = child[1].text.split(dbp_prefix)[1]
#             print(entity)
            classes = list()
            for cc in child[3]:
                cls_URI = cc[1].text
#                 print(cls_URI)
                if dbo_prefix in cls_URI:
                    classes.append((cls_URI.split(dbo_prefix)[1],i))
            if len(classes)>0:
                entity_classes[entity] = classes
    except UnicodeDecodeError:
        pass
    return entity_classes

In [9]:
data = read_data(data_folder, dict_target_col, True)
# data[0]['data'][2]
# data[1]['column_titles']
# data[0]['data']

In [8]:
# entity_classes = dict()
# entity_classes = retrieve_dbpedia_classes('Capital City of Budapest',entity_classes,5)
# entity_classes

{'Budapest': [('Settlement', 1),
  ('City', 1),
  ('Place', 1),
  ('PopulatedPlace', 1),
  ('Location', 1)],
 'Hungary': [('Place', 2),
  ('Country', 2),
  ('PopulatedPlace', 2),
  ('Location', 2)],
 'Cluj-Napoca': [('Settlement', 3),
  ('City', 3),
  ('Place', 3),
  ('PopulatedPlace', 3),
  ('Location', 3)],
 'Debrecen': [('Settlement', 4),
  ('City', 4),
  ('Place', 4),
  ('PopulatedPlace', 4),
  ('Location', 4)],
 'Miskolc': [('Settlement', 5),
  ('City', 5),
  ('Place', 5),
  ('PopulatedPlace', 5),
  ('Location', 5)]}

In [12]:
from IPython.display import clear_output
entity_classes = dict()
i=0;
for entity in data[2]['data'][4]:
    entity_classes = retrieve_dbpedia_classes(entity,entity_classes,5)
    clear_output(wait=True)
    print(round(i*100 / len(data[2]['data'][4]),2))
    i+=1

99.57


In [17]:
candidate_classes = list([])
candidate_classes_rank = list([])
for key in entity_classes:
    for candicate_class in entity_classes[key]:
        cc,rank = candicate_class
        if cc not in candidate_classes:
            candidate_classes.append(cc)
            candidate_classes_rank.append((cc,rank))

candidate_classes_rank = sorted(candidate_classes_rank, key=lambda x: x[1])
[t[0] for t in candidate_classes_rank if t[1]<=2]

['AdministrativeRegion',
 'Place',
 'PopulatedPlace',
 'Location',
 'Region',
 'Town',
 'Criminal']

In [75]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

model = Word2Vec.load("word2vec.model")
model.train([[t[0] for t in candidate_classes_rank if t[1]<=100]], total_examples=1, epochs=1)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).e

(0, 60)