In [1]:
"""
lookup candidate entities and classes
"""
import os
import pandas as pd
import sys
import argparse

current_path = os.getcwd()
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input_dir',
    type=str,
    default=os.path.join(current_path, 'data'),
    help='Directory of input/output')
parser.add_argument(
    '--file_type',
    type=str,
    default='csv',
    help='File type')

FLAGS, unparsed = parser.parse_known_args()
# if not os.path.exists(FLAGS.input_dir):
#     os.mkdir(FLAGS.input_dir)

In [2]:
# Get all the csv files from the input directory
def get_data_files(data_folder):
    """
    A function used to get all the csv files from the input directory
    ...

    Attributes
    ----------
    data_folder : str
        the folder within  the working directory where the data is located
    """

    files = [] # a list of all filenames, including file extensions, that contain data
    csv_files = [] # same list as above but without the file extension

    # Get the list of files
    files = [f for f in os.listdir(FLAGS.input_dir+data_folder) if os.path.isfile(os.path.join(FLAGS.input_dir+data_folder, f))]
    csv_files = [f.replace(".csv","") for f in os.listdir(FLAGS.input_dir+data_folder) if os.path.isfile(os.path.join(FLAGS.input_dir+data_folder, f))]
    
    return csv_files

def get_target_cta_columns(target_config_file, csv_files, filter_col = True):
    """
    A function used to get which columns from the csv files need to be considered for the CTA. This is a subset of the file columns ignoring anything that is not an entity
    ...

    Attributes
    ----------
    target_config_file : str
        the file that contains the target column indices for each file
    csv_files : list
        the list of csv files that have the tabular data
    filter_col : boolean
        a flag to indicate whether we should narrow down the reading of the columns to only those targeted for the CTA task
    """
   
    target_col_file = os.path.join(FLAGS.input_dir, target_config_file)
    df_target_col = pd.read_csv(target_col_file,header=None, names=['filename','column_index'])
    
    # filter to only those files that are included in the csv_files
    df_target_col = df_target_col.loc[df_target_col['filename'].isin(csv_files)]
    
    # collapse all rows pertaining to the same file into one key value pair. The key is the filename and the value is the list with the column indices that should be considered
    # dict_target = {'CTRL_DBP_GEO_european_countries_capital_populated_cities': [0, 1, 2]}
    dict_target = dict()
    
    for index,row in df_target_col.iterrows():
        
        # is this is the first row with this file create the key
        if row['filename'] not in dict_target:
            dict_target[row['filename']]= []
            
        # append the new target column to the target column list for that file
        if filter_col:
            dict_target[row['filename']].append(row['column_index'])
    
    return dict_target

# Get the list of csv files with tabular data
csv_files = get_data_files('\lite')
# csv_files = get_data_files('\\tables_full')

# Get the columns we need to consider for the CTA task
dict_target_col = get_target_cta_columns('CTA_DBP_Round1_Targets.csv', csv_files,True)

    

In [3]:
data = list()
data_folder = '\lite'
has_header_row = True

for file in dict_target_col:
    element = dict()
    element['filename'] = file
    df_data = pd.DataFrame()
    df_title = pd.DataFrame()
    
    
    
    filename = file + '.' + FLAGS.file_type
    tab_data_file = os.path.join(FLAGS.input_dir + data_folder, filename)
      
    # read the file data in a dataframe. Also read the column titles if we need to use them
    if len(dict_target_col[file])>0:
        if has_header_row:
            df_data = pd.read_csv(tab_data_file,header=None, skiprows=[0], usecols=dict_target_col[file])
            df_title = pd.read_csv(tab_data_file,header=None, usecols=dict_target_col[file], nrows = 1)
        else:
            df_data = pd.read_csv(tab_data_file,header=None, usecols=dict_target_col[file])
    else:
        if has_header_row:
            df_data = pd.read_csv(tab_data_file,header=None, skiprows=[0])
            df_title = pd.read_csv(tab_data_file,header=None, nrows = 1)
        else:
            df_data = pd.read_csv(tab_data_file,header=None)

    # add the column headers to the data dictionary
    try:
        element['column_titles'] = list(df_title.iloc[0,:])
    except:
        pass
    
    file_element = dict()
    for column in df_data.columns:
        file_element[column] = list(set(df_data[column]))
    element['data'] = file_element
    
    element['dataframe'] = df_data    
    data.append(element)

In [29]:
data[0]['data'][2]
data[1]['column_titles']

['country', 'capital', 'most_populated_city']

In [28]:
data[1]['data']

{0: ['Serbia',
  'Georgia',
  'France',
  'Montenegro',
  'Latvia',
  'Norway',
  'Cyprus',
  'Greece',
  'Kazakhstan',
  'Hungary',
  'Slovakia',
  'Bosnia and Herzegovina',
  'Bulgaria',
  'Armenia',
  'Moldova',
  'Denmark',
  'Poland',
  'Sweden',
  'Босна и Херцеговина',
  'Ukraine',
  'Crna Gora',
  'San Marino',
  'Spain',
  'Lithuania',
  'Liechtenstein',
  'Republika Srbija',
  'Konungariket Sverige',
  'Belarus',
  'Република Србија',
  'Црна Гора',
  'Romania',
  'Andorra',
  'Republic of Macedonia',
  'Italy',
  'Estonia',
  'Switzerland',
  'Croatia',
  'Kosovo',
  'Slovenia',
  'Bosna i Hercegovina',
  'Malta',
  'United Kingdom',
  'the Czech Republic',
  'Austria',
  'Azerbaijan',
  'Turkey'],
 1: ['VVaduzz',
  'Zagrebbb',
  'Beograd / Београдд',
  'Chiișinău',
  'Rīga',
  'Prahaaa',
  'Roma Capitallee',
  'Skopjee',
  'City of San Marrino',
  'Belgradee',
  'Baakı',
  'City of Zagrebb',
  'Град Скоопје',
  'Budapesttt',
  'Soofia',
  'Rommaa',
  'City of Belgrade',
  '

In [3]:
csv_files

['CTRL_DBP_GEO_european_countries_capital_populated_cities',
 'CTRL_DBP_GEO_european_countries_capital_populated_cities_NOISE2',
 'CTRL_DBP_GEO_protected_areas',
 'CTRL_DBP_GEO_protected_areas_NOISE2',
 'CTRL_DBP_GEO_us_lakes - Copy',
 'CTRL_DBP_GEO_us_lakes',
 'CTRL_DBP_GEO_us_lakes_NOISE2']

In [6]:
def lookup_resources(cell_text):
    dbo_prefix = 'http://dbpedia.org/ontology/'
    dbp_prefix = 'http://dbpedia.org/resource/'
    entity_classes = dict()
    cell_items = list()
    cell_brackets = re.findall('\((.*?)\)', cell_text)
    for cell_bracket in cell_brackets:
        cell_text = cell_text.replace('(%s)' % cell_bracket, '')
    cell_text = cell_text.strip()
    if len(cell_text) > 2:
        cell_items.append(cell_text)
    for cell_bracket in cell_brackets:
        if len(cell_bracket) > 2:
            cell_items.append(cell_bracket.strip())
    for cell_item in cell_items:
        try:
            lookup_url = 'http://lookup.dbpedia.org/api/search/KeywordSearch?MaxHits=2&QueryString=%s' % cell_item
            lookup_res = requests.get(lookup_url)
            root = ET.fromstring(lookup_res.content)
            for child in root:
                entity = child[1].text.split(dbp_prefix)[1]
                classes = list()
                for cc in child[3]:
                    cls_URI = cc[1].text
                    if dbo_prefix in cls_URI:
                        classes.append(cls_URI.split(dbo_prefix)[1])
                entity_classes[entity] = classes
        except UnicodeDecodeError:
            pass
    return entity_classes

In [7]:
import requests
import re
# import sparql
import xml.etree.ElementTree as ET
lookup_resources('Aaron River Reservoir')


{'Aaron_River_Reservoir': ['Place',
  'Dam',
  'ArchitecturalStructure',
  'Location',
  'Infrastructure']}

In [30]:
candidate_class = dict()

for k in data[1]['data']:
    candidate_class[k] = list([])
    for i in data[1]['data'][k]:
        lookup_resources(i)
        candidate_class[k].append(lookup_resources(i))
    print(k)

0
1
2


In [49]:

cell_text = 'Metropolitan City of Rome Capital'
# cell_text = 'Limasol'

# dbo_prefix = 'http://dbpedia.org/ontology/'
# dbp_prefix = 'http://dbpedia.org/resource/'
entity_classes = dict()
cell_items = list()
cell_brackets = re.findall('\((.*?)\)', cell_text)
for cell_bracket in cell_brackets:
    cell_text = cell_text.replace('(%s)' % cell_bracket, '')
cell_text = cell_text.strip()
if len(cell_text) > 2:
    cell_items.append(cell_text)
for cell_bracket in cell_brackets:
    if len(cell_bracket) > 2:
        cell_items.append(cell_bracket.strip())
# for cell_item in cell_items:
#     try:
#         lookup_url = 'http://lookup.dbpedia.org/api/search/KeywordSearch?MaxHits=2&QueryString=%s' % cell_item
#         lookup_res = requests.get(lookup_url)
#         root = ET.fromstring(lookup_res.content)
#         for child in root:
#             entity = child[1].text.split(dbp_prefix)[1]
#             classes = list()
#             for cc in child[3]:
#                 cls_URI = cc[1].text
#                 if dbo_prefix in cls_URI:
#                     classes.append(cls_URI.split(dbo_prefix)[1])
#             entity_classes[entity] = classes
#     except UnicodeDecodeError:
#         pass
# return entity_classes

In [50]:
cell_items

['Metropolitan City of Rome Capital']

In [43]:
import re
m = re.search('(?<=abc)ef', 'abcdef')
m.group(0)

AttributeError: 'NoneType' object has no attribute 'group'