In [1]:
"""
lookup candidate entities and classes
"""
import os
import pandas as pd
import sys
import argparse
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
import json
import time
from itertools import islice
from collections import Counter
from tqdm import tqdm
import re

# Notebook configuration, expressed as command-line style flags so the same code
# can also be run as a script.
current_path = os.getcwd()
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input_dir',
    type=str,
    default=os.path.join(current_path, 'data'),
    help='Directory of input/output')
parser.add_argument(
    '--file_type',
    type=str,
    default='csv',
    help='File type')
parser.add_argument(
    '--lookup_results_rank',
    type=int,
    default=5,
    help='Number of top-ranked DBpedia lookup results to keep for each cell value')

# parse_known_args (rather than parse_args) tolerates arguments this parser does not define
FLAGS, unparsed = parser.parse_known_args()
# if not os.path.exists(FLAGS.input_dir):
#     os.mkdir(FLAGS.input_dir)


In [2]:
# Get all the csv files from the input directory
def get_data_files(data_folder):
    """
    Return the names, without the ".csv" extension, of the files in the data folder.

    Parameters
    ----------
    data_folder : str
        sub-folder holding the data; it is appended verbatim to FLAGS.input_dir,
        so it has to start with a path separator

    Returns
    -------
    list of str
        one entry per regular file found directly in the folder. A trailing
        ".csv" extension is removed; any other file name is returned unchanged.
    """
    folder = FLAGS.input_dir + data_folder

    csv_files = []
    for entry in os.listdir(folder):
        # sub-directories are not data files
        if not os.path.isfile(os.path.join(folder, entry)):
            continue
        # only strip the extension itself, never a ".csv" in the middle of the name
        stem, extension = os.path.splitext(entry)
        csv_files.append(stem if extension == '.csv' else entry)

    return csv_files

def get_target_cta_columns(target_config_file, data_folder, csv_files, filter_col = True):
    """
    Collect, per table, the column indices that are targeted by the CTA task.

    Parameters
    ----------
    target_config_file : str
        name of the header-less csv file listing (filename, column_index) pairs
    data_folder : str
        sub-folder (appended verbatim to FLAGS.input_dir) that contains the target file
    csv_files : list
        names of the tables to keep; rows for any other table are ignored
    filter_col : boolean
        when True the target column indices are collected; when False every
        table is mapped to an empty list

    Returns
    -------
    dict
        filename -> list of int column indices, in the order they appear in the
        target file, e.g.
        {'CTRL_DBP_GEO_european_countries_capital_populated_cities': [0, 1, 2]}
    """
    targets_path = os.path.join(FLAGS.input_dir + data_folder, target_config_file)
    targets = pd.read_csv(targets_path, header=None, names=['filename', 'column_index'])

    # keep only the rows that refer to one of the requested tables
    targets = targets.loc[targets['filename'].isin(csv_files)]

    dict_target = {}
    for _, target_row in targets.iterrows():
        # the first row seen for a table creates its (initially empty) list
        columns = dict_target.setdefault(target_row['filename'], [])
        if filter_col:
            columns.append(int(target_row['column_index']))

    return dict_target

def get_ground_truth(file, folder, csv_files):
    """
    Load the ground-truth DBpedia ontology class of each annotated column.

    Parameters
    ----------
    file : str
        name of the header-less csv file listing (filename, column_index, class) rows
    folder : str
        sub-folder (appended verbatim to FLAGS.input_dir) that contains the file
    csv_files : list
        names of the tables to keep; rows for any other table are ignored

    Returns
    -------
    dict
        filename -> {column index (int) -> class name}, where the class name is
        the part of the class URI that follows 'http://dbpedia.org/ontology/'
    """
    dbo_prefix = 'http://dbpedia.org/ontology/'

    gt_path = os.path.join(FLAGS.input_dir + folder, file)
    gt_frame = pd.read_csv(gt_path, header=None, names=['filename', 'column_index', 'class'])

    # keep only the rows that refer to one of the requested tables
    gt_frame = gt_frame.loc[gt_frame['filename'].isin(csv_files)]

    dict_gt = {}
    for _, gt_row in gt_frame.iterrows():
        per_file = dict_gt.setdefault(gt_row['filename'], {})
        # a class URI without the ontology prefix raises IndexError here
        per_file[int(gt_row['column_index'])] = gt_row['class'].split(dbo_prefix)[1]

    return dict_gt

def read_data(data_folder, dict_target_col, has_header_row = False):
    """
    Read the tables listed in dict_target_col and keep the distinct values of each column.

    Parameters
    ----------
    data_folder : str
        sub-folder (appended verbatim to FLAGS.input_dir) that contains the tables
    dict_target_col : dictionary
        table filename (without extension) -> list of column indices to read;
        an empty list means that every column of that table is read
    has_header_row : boolean
        whether the first row of each file holds the column titles; if so it is
        excluded from the data and returned separately

    Returns
    -------
    list of dict
        one dictionary per table with the keys
        'filename', 'column_titles' (only when has_header_row is True) and
        'data' (column index -> list of the distinct values of that column;
        the order of the values is not defined because they pass through a set)
    """
    data = []

    for file, target_columns in dict_target_col.items():
        tab_data_file = os.path.join(FLAGS.input_dir + data_folder, file + '.' + FLAGS.file_type)

        # options shared by every read of this file
        read_options = {'header': None}
        if len(target_columns) > 0:
            read_options['usecols'] = target_columns

        element = {'filename': file}
        if has_header_row:
            # the first row carries the titles, the remaining rows the data
            df_title = pd.read_csv(tab_data_file, nrows=1, **read_options)
            element['column_titles'] = list(df_title.iloc[0, :])
            df_data = pd.read_csv(tab_data_file, skiprows=[0], **read_options)
        else:
            df_data = pd.read_csv(tab_data_file, **read_options)

        # each distinct cell value only needs to be looked up once
        element['data'] = {column: list(set(df_data[column])) for column in df_data.columns}
        data.append(element)

    return data

### Background Setup

As part of this initial step we will need to load the data we are going to process as well as the targets we are trying to meet. The data is located in the data folder as follows
- round_1:
    - gt: the expected outcome (ground truth)
    - tables: the tabular data
    - targets: the columns / cells we need to consider for the CTA/CEA
----
Step 1: Get a list of all the csv files in the data folder

In [3]:
# Get the list of csv files with tabular data.
# The sub-folder is built with os.sep / os.path.join so it also resolves on non-Windows
# systems; it keeps a leading separator because get_data_files appends it verbatim
# to FLAGS.input_dir.
csv_files = get_data_files(os.sep + os.path.join('round_1', 'tables'))
# csv_files = csv_files[:1]
csv_files[:5]

['10579449_0_1681126353774891032',
 '11833461_1_3811022039809817402',
 '13719111_1_5719401842463579519',
 '14067031_0_559833072073397908',
 '1438042986423_95_20150728002306-00125-ip-10-236-191-2_88435628_5']

Step 2: Get the columns we need to consider for the CTA task

In [4]:
# Get the columns we need to consider for the CTA task.
# The sub-folder is built with os.sep / os.path.join so it also resolves on non-Windows
# systems; it keeps a leading separator because it is appended verbatim to FLAGS.input_dir.
dict_target_col = get_target_cta_columns('CTA_Round1_Targets.csv', os.sep + os.path.join('round_1', 'targets'), csv_files,True)
list(islice(dict_target_col.items(), 5))

[('58891288_0_1117541047012405958', [1, 3]),
 ('8468806_0_4382447409703007384', [1, 2]),
 ('50245608_0_871275842592178099', [0, 3, 4]),
 ('14067031_0_559833072073397908', [1, 7, 5, 0]),
 ('39759273_0_1427898308030295194', [1, 3])]

Step 3: Get the ground truth for all columns in the set of csv files

In [5]:
# Load the expected class of every annotated column.
# The sub-folder is built with os.sep / os.path.join so it also resolves on non-Windows
# systems; it keeps a leading separator because it is appended verbatim to FLAGS.input_dir.
ground_truth = get_ground_truth('CTA_Round1_gt.csv', os.sep + os.path.join('round_1', 'gt'), csv_files)
list(islice(ground_truth.items(), 5))

[('58891288_0_1117541047012405958', {1: 'Film', 3: 'Person'}),
 ('8468806_0_4382447409703007384', {1: 'Lake', 2: 'Country'}),
 ('50245608_0_871275842592178099', {0: 'Film', 3: 'Person', 4: 'Writer'}),
 ('14067031_0_559833072073397908',
  {1: 'Language', 7: 'Currency', 5: 'City', 0: 'Country'}),
 ('39759273_0_1427898308030295194', {1: 'Film', 3: 'Person'})]

# 1. Load Data

The next step is to load the data from the csv files. We load the data as an array of dictionaries.
Each dictionary will have the following structure:<br>
{<br>
<blockquote>
<strong>'filename':</strong> '1438042986423_95_20150728002306-00125-ip-10-236-191-2_88435628_5',<br>
<strong>'column_titles'</strong>: ['Party'],<br>
<strong>'data'</strong>: <br>
    {<br>
    <blockquote>
        <strong>0:</strong> ['PC', 'Lib-Dem','SNP','UKIP','Labour','BNP','Conservative','Green']<br>
    </blockquote>
        }<br>
</blockquote>    
 }

In [6]:
# Read the targeted columns of every table (the first row of each file is a header).
# The sub-folder is built with os.sep / os.path.join so it also resolves on non-Windows
# systems; it keeps a leading separator because read_data appends it verbatim to FLAGS.input_dir.
data = read_data(os.sep + os.path.join('round_1', 'tables'), dict_target_col,True)

def append_gt_to_data(data, ground_truth):
    """
    Attach the ground-truth class of every loaded column to its table entry.

    Each element of `data` gets a new (or reset) 'gt' dictionary that maps every
    column index found under its 'data' key to ground_truth[filename][column].
    The elements are modified in place and nothing is returned. A column that is
    missing from the ground truth raises KeyError.
    """
    for table in data:
        table_name = table['filename']
        table['gt'] = dict()
        table['gt'].update(
            (col, ground_truth[table_name][col]) for col in table['data']
        )
        
# enrich every table entry with the expected class of its columns
append_gt_to_data(data, ground_truth)

# keep a timestamped snapshot of the loaded data in the working directory
with open('data-%s.json' % time.strftime("%Y%m%d-%H%M%S"), 'w') as fp:
    json.dump(data, fp)

In [7]:

# Attach the ground truth class of every loaded column to its table entry.
# This calls append_gt_to_data instead of repeating the body of that function inline.
append_gt_to_data(data, ground_truth)

In [8]:
# data[:5]

In [9]:
def dbo_sparql_results(query_string):
    """
    Run a SPARQL query against the public DBpedia endpoint.

    Parameters
    ----------
    query_string : str
        the SPARQL query to execute

    Returns
    -------
    The response converted by SPARQLWrapper from the JSON result format, or
    None when the query or the conversion fails (best-effort lookup).
    """
    sparql = SPARQLWrapper('https://dbpedia.org/sparql')
    sparql.setQuery(query_string)

    try:
        sparql.setReturnFormat(JSON)
        return sparql.query().convert()
    except Exception:
        # A failed lookup must not abort the run. Exception (not a bare except)
        # is caught so that Ctrl-C / kernel interrupts still stop the notebook.
        return None


def get_dbo_classes_sparql(cell):
    """
    Ask the DBpedia SPARQL endpoint for the ontology classes of an entity.

    Parameters
    ----------
    cell : str
        local name of the DBpedia resource; it is inserted after the `dbr:`
        prefix of the query as-is, so the caller is responsible for escaping it

    Returns
    -------
    list of str
        names of the classes of the entity that live in the
        'http://dbpedia.org/ontology/' namespace (prefix removed); an empty
        list when the query fails or returns no such class
    """
    dbo_prefix = 'http://dbpedia.org/ontology/'

    query_string = f'''
    SELECT ?class
    WHERE {{ dbr:{cell} a ?class.
    }}'''

#         alternative query that also walks up the class hierarchy:
#         query_string = f'''
#         select distinct ?superclass 
#         where {{dbr:{cell} rdf:type ?e. 
#             ?e rdfs:subClassOf* ?superclass.
#         FILTER (strstarts(str(?superclass), '{dbo_prefix}'))}}'''

    classes = []

    qres = dbo_sparql_results(query_string)
    try:
        bindings = qres['results']['bindings']
    except (KeyError, TypeError):
        # TypeError: the query failed and dbo_sparql_results returned None
        return classes

    for binding in bindings:
        # every row is keyed by the selected variable name, i.e. 'class'
        class_uri = binding.get('class', {}).get('value', '')
        if dbo_prefix in class_uri:
            classes.append(class_uri.split(dbo_prefix)[1])

    return classes

In [10]:
import requests
import xml.etree.ElementTree as ET
import time




def retrieve_dbpedia_classes (query_string, max_hits = 5):
    """
    Look up a keyword in the DBpedia Lookup service and collect the ontology
    classes of the entities it returns.

    Parameters
    ----------
    query_string : str
        the text to search for; it is URL-encoded before being sent
    max_hits : int
        maximum number of results requested from the service

    Returns
    -------
    dict
        entity name -> {'rank': position of the entity in the lookup result
        (starting at 1), 'candidate_classes': list of ontology class names}.
        Entities for which no ontology class could be found, neither in the
        lookup result nor through the SPARQL endpoint, are left out.
    """
    web_api = 'http://lookup.dbpedia.org/api/search/KeywordSearch'
    dbo_prefix = 'http://dbpedia.org/ontology/'
    dbp_prefix = 'http://dbpedia.org/resource/'
    entity_classes = dict()
    try:
        # passing the values as params lets requests escape characters such as
        # '&', '#' or '?' that would otherwise corrupt the query string
        lookup_res = requests.get(web_api, params={'MaxHits': max_hits, 'QueryString': query_string})
        root = ET.fromstring(lookup_res.content)
        for rank, child in enumerate(root, start=1):
            # NOTE(review): the positional access assumes that the second child of a
            # result holds the resource URI and the fourth its classes - confirm
            # against the response format of the lookup service
            entity = child[1].text.split(dbp_prefix)[1]
            classes = []
            for cc in child[3]:
                cls_URI = cc[1].text
                if dbo_prefix in cls_URI:
                    classes.append(cls_URI.split(dbo_prefix)[1])

            # if no classes have been retrieved from the lookup go to the sparql endpoint to get the classes for the entity
            if len(classes) == 0:
                classes = get_dbo_classes_sparql(re.escape(entity))

            if len(classes) > 0:
                entity_classes[entity] = {'rank': rank, 'candidate_classes': classes}
    except UnicodeDecodeError:
        # keep whatever was collected before the undecodable content
        pass
    return entity_classes

def lookup_cells_in_dbpedia(data):
    """
    Look up every distinct cell value of the loaded tables in DBpedia.

    Parameters
    ----------
    data : list of dict
        the tables; each element needs the keys 'filename' and 'data'
        (column index -> list of cell values)

    Returns
    -------
    dict
        cell value -> {'location': list of (filename, column index) pairs the
        value appears in, 'candidate_entities': result of
        retrieve_dbpedia_classes for that value}.
        The dictionary is also written to a timestamped
        'cell_values-<timestamp>.json' file in the working directory.
    """
    from IPython.display import clear_output

    cell_values = dict()

    # total number of cells, only used to report the progress
    size = sum(len(values) for table in data for values in table['data'].values())
    processed = 0

    start_time = time.time()

    for table in data:
        filename = table['filename']
        for column_index, column_values in table['data'].items():
            for cell_value in column_values:
                processed += 1
                clear_output(wait=True)
                print('{0:.2f}'.format(100*processed/size),'-->',filename, ": ",cell_value)

                if cell_value in cell_values:
                    # already looked up: only remember the additional location
                    cell_values[cell_value]['location'].append((filename,column_index))
                    continue

                entry = {'location': [(filename,column_index)]}
                cell_values[cell_value] = entry
                try:
                    entry['candidate_entities'] = retrieve_dbpedia_classes(cell_value.replace("[",'').replace("]",''),FLAGS.lookup_results_rank)
                except Exception:
                    # e.g. a non-string cell without .replace(): retry with the raw value.
                    # Exception (not a bare except) keeps the long loop interruptible.
                    entry['candidate_entities'] = retrieve_dbpedia_classes(cell_value,FLAGS.lookup_results_rank)

    end_time = time.time()

    print(f"{int(end_time - start_time)//60} min and {int((end_time - start_time)%60)} seconds Elapsed")

    # Also save the data in json for future runs
    with open(('cell_values-%s.json' % time.strftime("%Y%m%d-%H%M%S")), 'w') as fp:
        json.dump(cell_values, fp)

    return cell_values

In [11]:
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, JSON, N3
from pprint import pprint

def get_dbo_class_entities_sparql(candidate_class, num_of_results = 10000):
    """
    Fetch a random sample of DBpedia entities that belong to an ontology class.

    Parameters
    ----------
    candidate_class : str
        name of the class, inserted after the `dbo:` prefix of the query
    num_of_results : int
        value of the LIMIT clause, i.e. the maximum number of entities returned

    Returns
    -------
    list of str
        entity names (resource URIs without the 'http://dbpedia.org/resource/'
        prefix). The list holds whatever was collected before a failure, so it
        is empty when the query itself fails.
    """
    sparql = SPARQLWrapper('https://dbpedia.org/sparql')

    ent_list = []

    dbp_prefix = 'http://dbpedia.org/resource/'

    sparql.setQuery(f'''
    SELECT ?object
    WHERE {{ ?object a dbo:{candidate_class}. }}
    ORDER BY RAND()
    LIMIT {num_of_results}
    ''')
    try:
        sparql.setReturnFormat(JSON)
        qres = sparql.query().convert()
        for entity in qres['results']['bindings']:
            ent_list.append(entity['object']['value'].split(dbp_prefix)[1])
    except Exception:
        # Best effort: a failing query yields the entities gathered so far.
        # Exception (not a bare except) lets Ctrl-C / kernel interrupts through.
        pass

    return ent_list

# 2. Lookup cell_values

With the data loaded in the *data* dictionary the next step is to look up the cell values in the DBpedia endpoint and get the candidate classes and entities.
Each cell value is only looked up once, however we still keep track of any column it might have appeared in as well as all candidate entities and classes it may have matched to.

For this level of analysis we are flexible to store the 5 top lookup results for each cell value (default value for FLAGS.lookup_results_rank).
We will then assess the number of classifiers we need to train later and perhaps filter out any candidate classes that only appeared in lower ranks.

The outcome of the lookup is stored in the *cell_values* dictionary as follows:

{<strong>"Madagascar":</strong><br>
{
<blockquote><strong>"location":</strong> [("14067031_0_559833072073397908",0)]
            , <br><strong>"candidate_entities":</strong><br> 
                        {
                            <blockquote><strong>"Madagascar":</strong> <br>{<blockquote><strong>"rank":</strong> 1,<br> <strong>"candidate_classes":</strong> ["Place", "Country", "PopulatedPlace", "Location"]</blockquote>}, <br>
                            <strong>"Antananarivo":</strong> <br> {<blockquote><strong>"rank":</strong> 3,<br> <strong>"candidate_classes":</strong> ["Settlement", "Place", "PopulatedPlace", "Location"]</blockquote>}, <br>
                            <strong>"List_of_Madagascar_(franchise)_characters":</strong> <br> {<blockquote><strong>"rank":</strong> 4,<br> <strong>"candidate_classes":</strong> ["FictionalCharacter", "Agent"]</blockquote>},<br>
                            <strong>"Madagascar_national_football_team"</strong> <br> {<blockquote><strong>"rank":</strong> 5,<br> <strong>"candidate_classes":</strong> ["Organisation", "SoccerClub", "Agent", "SportsClub"]</blockquote>}<br>
</blockquote>}<br> 
</blockquote>},<br>
               
 <strong>"South Africa":</strong> {...},<br>
  ...<br>
 }

### PLEASE NOTE THAT THIS STEP <span style="color:red">TAKES A LONG TIME TO RUN</span>.
IF THERE IS ALREADY A CELL_VALUES.JSON FROM A PREVIOUS RUN THEN THAT SHOULD BE LOADED INSTEAD

Alternatively load the lookup values previously saved as part of a past lookup

In [17]:
# Load the dictionary with the lookup results for each cell value in the tabular data

# The trailing '' makes os.path.join end the path with a separator, because the
# directory is combined with file names by plain string concatenation.
cell_values_directory = os.path.join(os.getcwd(), 'output', '')

cell_values_json = cell_values_directory + 'cell_values.json'
try:
    with open(cell_values_json) as json_file:
        cell_values = json.load(json_file)
except (OSError, ValueError):
    # No readable cache (missing file, or invalid JSON which raises a ValueError
    # subclass): run the long DBpedia lookup instead. Other errors are not hidden.
    cell_values = lookup_cells_in_dbpedia(data)

In [19]:
# list(islice(cell_values.items(), 5))

In [20]:
# retrieve_dbpedia_classes('Indiana Jones',10)

# 3. Process that data

The next step is to process the data so that we can use them for training the classifiers and also predicting classes. To achieve that we create the following structures:

## 3.1 dict_col_candidate_classes
This is a dictionary with the following structure <br/>
{<strong>'58891288_0_1117541047012405958'</strong>: 
<br/>{
<blockquote><strong>1</strong>: [('PoliticalParty', 'Shining_Path', 'The Shining', 5),<br>
                   ('Organisation', 'Shining_Path', 'The Shining', 5),<br>
                   ('Agent', 'Shining_Path', 'The Shining', 5),<br>
                    ...<br/>
                   ('Book', 'The_Bridge_over_the_River_Kwai', 'The Bridge on the River Kwai', 1)]<br>
            <strong>2</strong>: [('PoliticalParty', 'Shining_Path', 'The Shining', 5),<br>
                    ...<br/>
</blockquote>
}<br/>
<strong>'58891288_0_1117541047012405958'</strong>: {...}<br/>
}
where each element in the array represents (type, entity, cell value, rank) of all the lookup results for each cell in that column of that file

In [21]:
dict_col_candidate_classes = dict()
threshold = 2  # only lookup results ranked within the top `threshold` are kept

# one (initially empty) list per targeted column of every table
for filename in dict_target_col:
    dict_col_candidate_classes[filename] = {i: [] for i in dict_target_col[filename]}

# A single pass over the lookup results: every cell value contributes its
# (type, entity, cell value, rank) tuples to each column it was found in.
# Iterating over the locations (instead of turning them into a dict keyed by
# filename) keeps a value that occurs in several columns of the same table.
for cell_value, lookup in cell_values.items():
    candidates = [
        (candidate_class, candidate_entity, cell_value, details['rank'])
        for candidate_entity, details in lookup.get('candidate_entities', {}).items()
        if details['rank'] <= threshold
        for candidate_class in details['candidate_classes']
    ]
    for filename, column_index in lookup['location']:
        columns = dict_col_candidate_classes.get(filename)
        # skip locations that are not part of the targeted tables / columns
        if columns is not None and column_index in columns:
            columns[column_index].extend(candidates)


with open(('dict_col_candidate_classes-%s.json' % time.strftime("%Y%m%d-%H%M%S")), 'w') as fp:
    json.dump(dict_col_candidate_classes, fp)

In [22]:
# select a subset of the file columns to do some sample testing
from itertools import islice

# list(islice(dict_col_candidate_classes.items(), 5))

Test the effectiveness of the simple lookup

In [23]:
# Test the effectiveness of the simple lookup

def lookup_assessment(dict_col_candidate_classes, threshold = 10000):
    """
    Print the share of ground-truth columns whose expected class is among the
    most frequent candidate classes of that column.

    Parameters
    ----------
    dict_col_candidate_classes : dict
        filename -> column index -> list of tuples whose first item is a class name
    threshold : int
        number of most common candidate classes to consider per column; the
        default of 10000 is reported as "the full list"

    The expected classes are read from the module-level `ground_truth` dictionary.
    """
    total_columns = 0
    found = 0
    for file, columns in ground_truth.items():
        for col, actual_cls in columns.items():
            class_counts = Counter(candidate[0] for candidate in dict_col_candidate_classes[file][col])
            top_classes = {cls for cls, _ in class_counts.most_common()[:threshold]}
            if actual_cls in top_classes:
                found += 1
            total_columns += 1

    scope = ' the top ' + str(threshold) if threshold != 10000 else ' the full list of'
    print(f"{round(100*found/total_columns,2)}% of the columns have the right type in{scope} candidate classes")

lookup_assessment(dict_col_candidate_classes, 10)

91.74% of the columns have the right type in the top 10 candidate classes


## 3.2 df_entities

A variation of dict_col_candidate_classes: this is a DataFrame of the lookup results with columns representing:
* type
* entity
* cell_value and 
* rank

of all the lookup results regardless of file / column the cell value appears in

In [24]:
# type_neighours_pos_neg_samples['Embryology']
# Build one frame per table column and concatenate them once. Growing a frame with
# DataFrame.append inside a loop copies all rows on every iteration, and that
# method no longer exists in pandas 2.
entity_columns = ['type', 'entity', 'cell_value', 'rank']
column_frames = [
    pd.DataFrame(dict_col_candidate_classes[filename][col], columns=entity_columns)
    for filename in dict_col_candidate_classes
    for col in dict_col_candidate_classes[filename]
]
# pd.concat refuses an empty list, so fall back to an empty frame with the same columns
df_entities = pd.concat(column_frames) if column_frames else pd.DataFrame(columns=entity_columns)

df_entities.head()

Unnamed: 0,type,entity,cell_value,rank
0,Film,Night_Hunter,The Night of the Hunter,2
1,Album,I'm_Breathless,Breathless,2
2,MusicalWork,I'm_Breathless,Breathless,2
3,Work,I'm_Breathless,Breathless,2
4,Album,American_Beauty/American_Psycho,American Beauty,1


## 3.2 type_neighours_pos_neg_samples

This is a dictionary that will help create classifiers for the candidate classes. For each identified class this dictionary will have the following keys:
* 'cooccuring_classes': a set of classes that appear in the same columns as this class
* 'positive_candidate_entities': a set of all positive candidate entities that have been retrieved from lookups of the tabular data and belong to this class
* 'negative_candidate_entities': a set of all negative candidate entities that have been retrieved from lookups of the tabular data and belong to any of the classes in the neighborhood of this one

### 3.2.1 Create the dictionary with all classes and populate the co-occurring classes


In [25]:
def get_candidate_classes(dict_col_candidate_classes):
    """
    Create the per-class sample dictionary and fill in the co-occurring classes.

    Parameters
    ----------
    dict_col_candidate_classes : dict
        filename -> column index -> list of tuples whose first item is a class name

    Returns
    -------
    dict
        class name -> dictionary with four sets: 'cooccuring_classes' (every
        other class that shows up in at least one column together with this
        class) and the still empty 'positive_candidate_entities',
        'negative_candidate_entities' and 'general_positive_entities'
    """
    sample_keys = (
        'cooccuring_classes',
        'positive_candidate_entities',
        'negative_candidate_entities',
        'general_positive_entities',
    )

    candidate_classes = {}
    for columns in dict_col_candidate_classes.values():
        for lookup_rows in columns.values():
            # classes seen in this column; entries are created in order of first appearance
            classes_in_column = set()
            for lookup_row in lookup_rows:
                cls = lookup_row[0]
                classes_in_column.add(cls)
                if cls not in candidate_classes:
                    candidate_classes[cls] = {key: set() for key in sample_keys}

            # every class of the column is a neighbour of all the other ones
            for cls in classes_in_column:
                candidate_classes[cls]['cooccuring_classes'].update(classes_in_column - {cls})

    return candidate_classes

# Build the per-class sample dictionary from the lookup results of all columns.
# The cells below fill in the entity sets of each class.
type_neighours_pos_neg_samples = get_candidate_classes(dict_col_candidate_classes)

# for key in type_neighours_pos_neg_samples:
#     print(f"Class:{key} with \t {len(type_neighours_pos_neg_samples[key]['cooccuring_classes'])} neighbouring classes")

### 3.2.2 Populate the 'positive_candidate_entities' key


In [22]:
# positive samples of a class: every looked-up entity that carries that class
for candidate_cls, samples in tqdm(type_neighours_pos_neg_samples.items()):
    has_class = df_entities['type'] == candidate_cls
    samples['positive_candidate_entities'].update(df_entities.loc[has_class, 'entity'])

100%|███████████████████████████████████████████████████████████████████████████████| 316/316 [00:01<00:00, 243.44it/s]


### 3.2.3 Populate the 'negative_candidate_entities' key


In [23]:
# update the negative samples for each class

# Group the entities by class once. Filtering df_entities again for every
# (class, neighbour) pair scans the whole frame each time and dominates the runtime.
entities_by_type = dict()
for entity_type, entity in zip(df_entities['type'], df_entities['entity']):
    entities_by_type.setdefault(entity_type, set()).add(entity)

# negative samples of a class: the entities of all the classes it co-occurs with
for candidate_cls in tqdm(type_neighours_pos_neg_samples):
    negatives = type_neighours_pos_neg_samples[candidate_cls]['negative_candidate_entities']
    for neighbour_cls in type_neighours_pos_neg_samples[candidate_cls]['cooccuring_classes']:
        negatives.update(entities_by_type.get(neighbour_cls, ()))

100%|████████████████████████████████████████████████████████████████████████████████| 316/316 [04:01<00:00,  1.31it/s]


### 3.2.4 Populate the 'general_positive_entities' key

In [41]:
# general positive samples: entities of the class sampled from DBpedia itself,
# requesting as many as the class has negative samples
for candidate_cls, samples in tqdm(type_neighours_pos_neg_samples.items()):
    limit = len(samples['negative_candidate_entities'])
    samples['general_positive_entities'] = set(get_dbo_class_entities_sparql(candidate_cls, limit))

100%|████████████████████████████████████████████████████████████████████████████████| 316/316 [07:06<00:00,  1.35s/it]


In [42]:
# json cannot serialise sets, so every sample collection is turned into a list first
for samples in tqdm(type_neighours_pos_neg_samples.values()):
    for sample_key in ('cooccuring_classes',
                       'positive_candidate_entities',
                       'negative_candidate_entities',
                       'general_positive_entities'):
        samples[sample_key] = list(samples[sample_key])

# keep a timestamped copy of the samples in the working directory
with open('type_neighours_pos_neg_samples-%s.json' % time.strftime("%Y%m%d-%H%M%S"), 'w') as fp:
    json.dump(type_neighours_pos_neg_samples, fp)

100%|███████████████████████████████████████████████████████████████████████████████| 316/316 [00:01<00:00, 221.89it/s]


convert the samples from sets to list in order to be able to save the json

In [43]:
# type_neighours_pos_neg_samples['Fern']
# Spot check: number of entities the SPARQL endpoint returns for the class 'Fern'
# with the default result limit.
len(get_dbo_class_entities_sparql('Fern'))

727

In [25]:
# for candidate_cls in type_neighours_pos_neg_samples:
#     print(candidate_cls, len(type_neighours_pos_neg_samples[candidate_cls]['cooccuring_classes'])\
#           ,len(type_neighours_pos_neg_samples[candidate_cls]['positive_candidate_entities'])\
#           ,len(type_neighours_pos_neg_samples[candidate_cls]['negative_candidate_entities'])\
#           , len(type_neighours_pos_neg_samples[candidate_cls]['general_positive_entities']))

In [27]:
def load_json(data_json):
    """Read the file at the path `data_json` and return its parsed JSON content."""
    with open(data_json) as handle:
        parsed = json.load(handle)
    return parsed
    
# Replace the in-memory samples with the copy stored in the output directory.
# NOTE(review): the cell that saves the samples writes a timestamped file name into the
# working directory, so this un-stamped file presumably has to be put in place by hand - confirm.
type_neighours_pos_neg_samples = load_json(cell_values_directory+'type_neighours_pos_neg_samples.json')

In [42]:
# Inspect the samples collected for a single class.
type_neighours_pos_neg_samples['CanadianFootballTeam']

{'cooccuring_classes': ['MilitaryPerson',
  'Musical',
  'Country',
  'MusicalArtist',
  'Film',
  'PopulatedPlace',
  'BaseballTeam',
  'SportsTeam',
  'Town',
  'MilitaryConflict',
  'SportsEvent',
  'Organisation',
  'Book',
  'NaturalPlace',
  'AmericanFootballTeam',
  'Ship',
  'TelevisionStation',
  'Colour',
  'BaseballPlayer',
  'Athlete',
  'PoliticalParty',
  'ComicsCharacter',
  'Company',
  'Artist',
  'Group',
  'SoccerClub',
  'Mountain',
  'Drug',
  'Region',
  'PersonFunction',
  'Poem',
  'EducationalInstitution',
  'Album',
  'SportsClub',
  'University',
  'SocietalEvent',
  'Royalty',
  'AmusementParkAttraction',
  'FictionalCharacter',
  'Award',
  'TelevisionShow',
  'Aircraft',
  'HockeyTeam',
  'Location',
  'Settlement',
  'WineRegion',
  'Software',
  'Band',
  'MusicGenre',
  'City',
  'Place',
  'Island',
  'MilitaryUnit',
  'CricketTeam',
  'Person',
  'AdministrativeRegion',
  'BasketballTeam',
  'Song',
  'SpaceMission',
  'ArchitecturalStructure',
  'Eve

# 4. Train CNN

In [28]:
import os
import pandas as pd
import sys
import argparse
import random
import math
import numpy as np

In [29]:
import argparse

# parser = argparse.ArgumentParser()

# Flags of the CNN training step. They are added to the parser created at the top of the
# notebook, so running this cell a second time in the same kernel raises an
# argparse conflict error.
parser.add_argument(
    '--synthetic_column_size',
    type=int,
    default=10,
    help='Size of synthetic column')
parser.add_argument(
    '--sequence_size',
    type=int,
    default=50,
    help='Length of word sequence of synthetic column')
parser.add_argument(
    '--model_dir',
    type=str,
    # The W2V_MODEL_DIR environment variable overrides the machine-specific fallback
    # path, so the notebook can run elsewhere without editing this cell.
    default=os.path.abspath(os.environ.get(
        'W2V_MODEL_DIR',
        'C:/Users/zacharias.detorakis/Desktop/city-ds-final-project/SemAIDA-master/AAAI19/exp_T2D/in_out/w2v_model/enwiki_model')),
    # default='~/w2v_model/enwiki_model/',
    help='Directory of word2vec model')
FLAGS, unparsed = parser.parse_known_args()

# ONLY LOAD ONCE

In [30]:
from gensim.models import Word2Vec
# Load the word2vec model stored as 'word2vec_gensim' inside FLAGS.model_dir.
w2v_model = Word2Vec.load(os.path.join(FLAGS.model_dir, 'word2vec_gensim'))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).e

In [31]:
from pattern.text.en import tokenize

def generate_synthetic_columns(entities, synthetic_column_size):
    """
    Group entities into synthetic columns of a fixed size.

    Parameters
    ----------
    entities : list
        the entity names to build the columns from
    synthetic_column_size : int
        number of entries of every synthetic column

    Returns
    -------
    list of list
        * with at least `synthetic_column_size` entities: one column per entity,
          made of `synthetic_column_size - 1` randomly sampled other entities
          followed by the entity itself
        * with fewer entities: a single column holding all the entities, padded
          with 'NaN' up to `synthetic_column_size`
    """
    if len(entities) >= synthetic_column_size:
        ent_units = []
        for i, ent in enumerate(entities):
            # sample the companions from every position except the entity's own
            unit = random.sample(entities[0:i] + entities[(i + 1):], synthetic_column_size - 1)
            unit.append(ent)
            ent_units.append(unit)
        return ent_units

    # too few entities: pad with the number of MISSING entries
    # (size - len; the reverse is negative and pads nothing)
    padding = ['NaN'] * (synthetic_column_size - len(entities))
    return [list(entities) + padding]

def synthetic_columns2sequence(ent_units, sequence_size):
    """
    Flatten one synthetic column (a list of entity labels) into a fixed-length word sequence.

    Each label has its punctuation separators replaced by spaces, is tokenized and
    lower-cased, and only purely alphabetic tokens are kept. The words of all labels
    are concatenated in order, then truncated or right-padded with ``'NaN'`` so that
    exactly ``sequence_size`` items are returned.

    Parameters
    ----------
    ent_units : list of str
        the entity labels of a single synthetic column
    sequence_size : int
        the length of the returned word sequence

    Returns
    -------
    list of str
        lower-cased words, padded with the placeholder ``'NaN'``
    """
    word_seq = list()
    for ent in ent_units:
        if ent == 'NaN':
            # Padding placeholder of a short synthetic column. Without this guard it is
            # lower-cased to 'nan', passes isalpha() and is embedded as if it were a real
            # word, because the 'NaN' check done when building the matrix is case-sensitive.
            continue
        ent_n = ent
        for separator in ('_', '-', '.', '/', '"', "'"):
            ent_n = ent_n.replace(separator, ' ')
        tokenized_line = ' '.join(tokenize(ent_n))
        word_seq += [word for word in tokenized_line.lower().split() if word.isalpha()]
    if len(word_seq) >= sequence_size:
        return word_seq[0:sequence_size]
    else:
        return word_seq + ['NaN'] * (sequence_size - len(word_seq))
    
def sequence2matrix(word_seq, sequence_size, w2v_model):
    """
    Turn a word sequence into a stack of word vectors.

    Row ``i`` of the result holds the vector of ``word_seq[i]``. Rows stay all-zero for
    the ``'NaN'`` padding placeholder and for words the model has no vector for.

    Parameters
    ----------
    word_seq : iterable of str
        the words to embed; items beyond ``sequence_size`` are ignored
    sequence_size : int
        the number of rows of the returned array
    w2v_model : gensim Word2Vec model
        must expose ``vector_size`` and the keyed vectors ``wv``

    Returns
    -------
    numpy.ndarray
        array of shape ``(sequence_size, w2v_model.vector_size, 1)``
    """
    ent_v = np.zeros((sequence_size, w2v_model.vector_size, 1))
    word_vectors = w2v_model.wv
    # zip with range() caps the index at sequence_size, so an over-long sequence is
    # truncated instead of raising IndexError on ent_v[i]
    for i, word in zip(range(sequence_size), word_seq):
        # membership on the keyed vectors works on gensim 3.x and 4.x alike, whereas the
        # `wv.vocab` attribute no longer exists in gensim 4
        if word != 'NaN' and word in word_vectors:
            ent_v[i] = word_vectors[word].reshape((w2v_model.vector_size, 1))
    return ent_v

In [32]:
def align_samples(pos, neg, pct = 0.5):
    """
    Reduce the size imbalance between the positive and the negative samples.

    The shorter of the two lists is extended with items drawn at random (with
    replacement) from itself; the longer list is returned as is.

    Parameters
    ----------
    pos : list
        the positive samples
    neg : list
        the negative samples
    pct : float
        the fraction of the size difference to fill in (rounded up); 1 makes both
        lists the same length

    Returns
    -------
    tuple
        ``(pos, neg)`` after oversampling the shorter list
    """
    gap = len(neg) - len(pos)
    if gap >= 0:
        # positives are the minority (or the sizes already match)
        extras = []
        for _ in range(math.ceil(gap * pct)):
            extras.append(random.choice(pos))
        return pos + extras, neg

    # negatives are the minority
    extras = []
    for _ in range(math.ceil(-gap * pct)):
        extras.append(random.choice(neg))
    return pos, neg + extras

In [33]:
def embedding(entities_positive, entities_negative):
    """
    Build the shuffled feature/label arrays used to train one binary classifier.

    Both entity lists are grouped into synthetic columns, every column is flattened to
    a word sequence and every sequence is embedded with the global ``w2v_model``.
    Column and sequence sizes are read from the global ``FLAGS``.

    Parameters
    ----------
    entities_positive : list
        the entities labelled 1
    entities_negative : list
        the entities labelled 0

    Returns
    -------
    tuple of numpy.ndarray
        ``x`` of shape ``(n, FLAGS.sequence_size, w2v_model.vector_size, 1)`` and ``y``
        of shape ``(n, 1)``, both reordered by the same permutation. The permutation
        is drawn after re-seeding NumPy's global generator with 10, so it is the same
        on every call for a given ``n``.
    """
    column_size = FLAGS.synthetic_column_size
    seq_len = FLAGS.sequence_size

    def to_sequences(entities):
        # one word sequence per synthetic column generated from `entities`
        return [synthetic_columns2sequence(column, seq_len)
                for column in generate_synthetic_columns(entities, column_size)]

    pos_sequences = to_sequences(entities_positive)
    neg_sequences = to_sequences(entities_negative)
    n_pos, n_neg = len(pos_sequences), len(neg_sequences)

    # features: positives first, negatives after
    x = np.zeros((n_pos + n_neg, seq_len, w2v_model.vector_size, 1))
    for row, sequence in enumerate(pos_sequences + neg_sequences):
        x[row] = sequence2matrix(sequence, seq_len, w2v_model)

    # labels, in the same order as the features
    y = np.concatenate((np.ones((n_pos, 1)), np.zeros((n_neg, 1))))

    # shuffling (fixed seed; note that this resets the global NumPy random state)
    np.random.seed(10)
    order = np.random.permutation(np.arange(y.shape[0]))
    return x[order], y[order]

In [34]:
# os.chdir(os.getcwd()+'/cnn_models'

Load the names of the CNN models that have already been trained and saved, so that training can resume where it left off when classifiers for further candidate classes still need to be trained.

In [39]:
def get_cnn_models(directory):
    """
    A function used to get the names of the models that are already stored in a directory
    ...

    Attributes
    ----------
    directory : str
        the folder to inspect; every immediate sub-directory counts as one model

    Returns
    -------
    set of str
        the names of the immediate sub-directories of ``directory``; an empty set when
        ``directory`` does not exist (e.g. on a first run, before any model is saved)
    """
    # Nothing has been saved yet: report "no models" instead of failing.
    if not os.path.isdir(directory):
        return set()
    # Only the top level is needed, so list it directly rather than walking the whole
    # tree and stripping path prefixes with a hard-coded separator.
    with os.scandir(directory) as entries:
        return {entry.name for entry in entries if entry.is_dir()}

# Names of the classes that already have a saved model under <cwd>/output/cnn_models.
# The path is built with os.path.join so that the separator matches the platform.
loaded_models = get_cnn_models(os.path.join(os.getcwd(), 'output', 'cnn_models'))

In [36]:
# loaded_models

{'Actor',
 'AdministrativeRegion',
 'Agent',
 'Aircraft',
 'Airport',
 'Album',
 'Animal',
 'ArchitecturalStructure',
 'Artist',
 'Artwork',
 'Athlete',
 'Award',
 'Band',
 'Bank',
 'BasketballLeague',
 'BasketballPlayer',
 'BasketballTeam',
 'BodyOfWater',
 'Book',
 'Bridge',
 'Broadcaster',
 'Building',
 'Canal',
 'ChemicalCompound',
 'ChemicalSubstance',
 'City',
 'CityDistrict',
 'Cleric',
 'ClericalAdministrativeRegion',
 'ComedyGroup',
 'Comic',
 'Company',
 'Continent',
 'Country',
 'CricketTeam',
 'Criminal',
 'Crustacean',
 'Dam',
 'Device',
 'Diocese',
 'EducationalInstitution',
 'EthnicGroup',
 'Eukaryote',
 'Event',
 'FictionalCharacter',
 'Film',
 'Fish',
 'Food',
 'FormulaOneTeam',
 'Genre',
 'Group',
 'HockeyTeam',
 'Holiday',
 'InformationAppliance',
 'Infrastructure',
 'Insect',
 'Lake',
 'Language',
 'Location',
 'Locomotive',
 'Magazine',
 'Manga',
 'MeanOfTransportation',
 'MemberOfParliament',
 'MilitaryConflict',
 'MilitaryPerson',
 'MilitaryUnit',
 'Mineral',
 'M

In [43]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
import os
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from IPython.display import clear_output

# Training hyper-parameters shared by every per-class classifier trained below.
# batch_size and epochs are passed straight to model.fit.
batch_size = 32 
epochs = 1
# Fraction of the (already shuffled) samples that is split off as the dev set.
test_train_split = 0.2

def save_model(model, candidate_class):
    """
    A function used to store a trained model under <cwd>/output/cnn_models/<candidate_class>
    ...

    Attributes
    ----------
    model : object
        the trained model; only its ``save(path)`` method is used
    candidate_class : str
        the class the model was trained for, used as the name of the target path
    """
    # Build the path with os.path.join instead of mixing '\\' and '/' by hand, so the
    # location is valid on every platform.
    models_dir = os.path.join(os.getcwd(), 'output', 'cnn_models')
    model.save(os.path.join(models_dir, '%s' % candidate_class))

# Re-scan the models directory so that classes which already have a saved model (for
# instance from an earlier, interrupted run) are skipped below.
loaded_models = get_cnn_models(os.path.join(os.getcwd(), 'output', 'cnn_models'))

# Train and save one binary CNN classifier per candidate class that has no saved model yet.
for candidate_class in tqdm(type_neighours_pos_neg_samples):
    if candidate_class not in loaded_models:
        print(candidate_class)

        # A new model is built on every iteration. Reset the global Keras state first so
        # that the graphs of the models trained so far do not pile up in memory.
        tf.keras.backend.clear_session()

        # Get the positive and negative samples to train the model
        cls_neg_par_entities = list(type_neighours_pos_neg_samples[candidate_class]['negative_candidate_entities'])
        cls_pos_gen_entities = list(type_neighours_pos_neg_samples[candidate_class]['general_positive_entities'])

        # align the samples to create a balanced set (pct=1: oversample the smaller list up to the size of the larger)
        p_ents, n_ents = align_samples(cls_pos_gen_entities, cls_neg_par_entities,1)

        # Create the embeddings using the w2v_model. here the samples are shuffled so we have a mixture of positive and negative samples
        X, Y = embedding(p_ents, n_ents)

        # The first `test_train_split` share of the shuffled samples is held out as the dev
        # set; it is not used by the fit call below.
        dev_sample_index = int(test_train_split * float(X.shape[0]))
        X_train, X_dev = X[dev_sample_index:], X[:dev_sample_index]
        Y_train, Y_dev = Y[dev_sample_index:], Y[:dev_sample_index]

        # Every sample is treated as a one-channel "image" of size sequence length x vector size
        IMG_HEIGHT = X_train.shape[1]
        IMG_WIDTH = X_train.shape[2]

        #Build the model
        model = Sequential([
            Conv2D(16, 3, padding='same', activation='relu', 
                   input_shape=(IMG_HEIGHT, IMG_WIDTH ,1)),
            MaxPooling2D(),
            Dropout(0.2),
            Conv2D(32, 3, padding='same', activation='relu'),
            MaxPooling2D(),
            Conv2D(64, 3, padding='same', activation='relu'),
            MaxPooling2D(),
            Dropout(0.2),
            Flatten(),
            Dense(512, activation='relu'),
            # single raw logit: the loss below is configured with from_logits=True
            Dense(1)
        ])
        # Compile the model
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                      metrics=['accuracy'])
        # print the model architecture
    #     model.summary()

        # Fit the model
        history = model.fit(X_train, Y_train, 
                           batch_size=batch_size, 
                           epochs=epochs,  
                           verbose=1)
        # save the model
        save_model(model,candidate_class)
        # wipe the cell output so that the per-class logs do not accumulate
        clear_output(wait=True)

100%|██████████████████████████████████████████████████████████████████████████████| 316/316 [2:28:19<00:00, 28.16s/it]


In [None]:
# CanadianFootballTeam

In [None]:
# y_pred = tf.keras.activations.sigmoid(model4.predict(X_dev)).numpy().round()