In [None]:
import os
import json
import itertools
from tqdm import tqdm
from tensorflow import keras
from IPython.display import clear_output

from pattern.text.en import tokenize
from gensim.models import Word2Vec

In [None]:
import argparse

# Notebook configuration expressed as command-line options; every option has a default.
parser = argparse.ArgumentParser()

# Forwarded to generate_synthetic_columns_sliding_window() by embedding()
parser.add_argument(
    '--synthetic_column_size',
    type=int,
    default=10,
    help='Size of synthetic column')
# Forwarded to synthetic_columns2sequence() and sequence2matrix() by embedding()
parser.add_argument(
    '--sequence_size',
    type=int,
    default=50,
    help='Length of word sequence of synthetic column')
# NOTE(review): the default below is an absolute path on one specific machine;
# on any other machine --model_dir presumably has to be supplied or the default edited.
parser.add_argument(
    '--model_dir',
    type=str,
    default=os.path.abspath('C:/Users/zacharias.detorakis/Desktop/nov-city-ms-project/app/w2v_model/enwiki_model'),
#     default='/w2v_model/enwiki_model/',
    help='Directory of word2vec model')
# parse_known_args() hands back unrecognised arguments in `unparsed` instead of
# raising on them (presumably so the arguments Jupyter passes to the kernel are tolerated).
FLAGS, unparsed = parser.parse_known_args()

In [None]:
# Path(os.getcwd()+"\output\cnn_models").mkdir(parents=True, exist_ok=True)

# Load input data

* data: Tabular data + ground truth
* dict_col_candidate_classes: a dictionary with one entry per filename_column, each holding an array of [(candidate_type, candidate_entity, original_cell_value, rank)]
* type_neighours_pos_neg_samples: a dictionary used to train the classifiers; for each candidate class it holds the neighbouring classes, positive samples from the KG, and positive and negative samples from the tabular data

In [None]:
# Load the dictionary with the lookup results for each cell value in the tabular data
def load_json(data_json):
    """Read the JSON document stored at ``data_json`` and return the decoded object.

    The file is opened in text mode with the platform's default encoding and is
    closed before the function returns.
    """
    with open(data_json) as source:
        payload = json.loads(source.read())
    return payload

In [None]:
# Relative folder (written with a Windows path separator) that holds the JSON files read below
output_folder = 'output\\'
# Folder under the current working directory that is later passed to get_cnn_models() and load_model()
cnn_model_directory = os.getcwd()+'\\output\\cnn_models'


# Tabular data plus ground truth (read later as data[filename]['data'][col] and data[filename]['gt'][col])
data = load_json(output_folder+'data.json')
# Per filename_column lookup results: [(candidate_type, candidate_entity, original_cell_value, rank)]
dict_col_candidate_classes = load_json(output_folder+'dict_col_candidate_classes.json')
# Per candidate class: the samples used to train the classifiers
type_neighours_pos_neg_samples = load_json(output_folder+'type_neighours_pos_neg_samples.json')
# Per filename and column: candidate classes (read later through the 'class_without_hr' key)
dict_cand = load_json(output_folder+'dict_cand.json')


In [None]:
# Report how many items each entry of one example class holds.
# The class name lives in a single variable so that the keys being listed and the
# values being measured always come from the same class (previously the keys were
# taken from 'SoccerLeague' while the lengths were taken from "Publisher").
sample_class = 'Publisher'
for key in type_neighours_pos_neg_samples[sample_class].keys():
    print(f'The length of {key} is: {len(type_neighours_pos_neg_samples[sample_class][key])}')

In [None]:
def get_cnn_models(directory):
    """Return the names of the folders located directly under ``directory``.

    Args:
        directory: path of the root folder to inspect.

    Returns:
        A set with the name of every immediate sub-folder. Nested folders are
        reported through their top-level ancestor only, and plain files are
        ignored. An empty set is returned when ``directory`` does not exist.
    """
    # A missing root used to surface as an unrelated ValueError from list.remove().
    if not os.path.isdir(directory):
        return set()
    # Reading the entry names directly avoids any dependence on the path separator,
    # so the result is the same on Windows and on POSIX systems.
    # follow_symlinks=False matches os.walk's default of not descending into links.
    with os.scandir(directory) as entries:
        return {entry.name for entry in entries if entry.is_dir(follow_symlinks=False)}

# Folder names found under cnn_model_directory (load_model() treats a folder name as a candidate class)
trained_models = list(get_cnn_models(cnn_model_directory))
# trained_models

# Get predictions

In this step, provided that we have the ground truth, we assess whether the expected class is in the top x of the retrieved candidate classes.

In [None]:
def load_model(cnn_model_directory, candidate_class):
    """Load the saved Keras model stored in the folder named after ``candidate_class``.

    Args:
        cnn_model_directory: root folder that contains one sub-folder per class.
        candidate_class: name of the class, used as the sub-folder name.

    Returns:
        Whatever ``keras.models.load_model`` returns for that folder.
    """
    # os.path.join replaces the former '\%s' literal, which relied on an invalid
    # escape sequence and only produced a valid path on Windows.
    return keras.models.load_model(os.path.join(cnn_model_directory, str(candidate_class)))

# Get avg number of words per cell value

In [None]:
def avg_cell_value_word_lenght(data):
    """Return the average number of alphabetic words per distinct cell value.

    Args:
        data: mapping of file name to a record whose ``'data'`` entry maps each
            column to the list of its cell values.

    Returns:
        Total count of alphabetic tokens found in the distinct cell values,
        divided by the number of distinct cell values; ``0.0`` when there are
        no cell values at all.
    """
    # Collect every cell value once; duplicates across columns/files count once.
    cell_values = set()
    for file in data:
        for col in data[file]['data']:
            cell_values.update(data[file]['data'][col])

    # Without this guard an empty input ended in a ZeroDivisionError.
    if not cell_values:
        return 0.0

    word_count = 0
    for cell_value in cell_values:
        # Treat common separators as spaces before tokenising.
        value = str(cell_value).replace('_', ' ').replace('-', ' ').replace('.', ' ').replace('/', ' ').replace('"', ' ').replace("'", ' ')
        tokenized_line = ' '.join(tokenize(value))
        # Only purely alphabetic tokens are counted as words.
        word_count += sum(1 for word in tokenized_line.lower().split() if word.isalpha())

    return word_count / len(cell_values)

# Display the average number of alphabetic words per distinct cell value in `data`
avg_cell_value_word_lenght(data)

# only run once

In [None]:
# Load the gensim Word2Vec model saved as 'word2vec_gensim' inside FLAGS.model_dir;
# embedding() and sequence2matrix() read its vectors and vector_size.
w2v_model = Word2Vec.load(os.path.join(FLAGS.model_dir, 'word2vec_gensim'))

In [None]:
def generate_synthetic_columns_random(entities, synthetic_column_size):
    """Build synthetic columns by pairing each entity with randomly drawn others.

    Args:
        entities: list of cell values.
        synthetic_column_size: number of values every synthetic column must hold.

    Returns:
        A list of synthetic columns (lists). With enough entities there is one
        column per entity: the entity itself followed by
        ``synthetic_column_size - 1`` other entities sampled without
        replacement. With too few entities a single column is returned: all
        entities followed by ``'NaN'`` placeholders up to the requested size.
    """
    ent_units = list()
    if len(entities) >= synthetic_column_size:
        for i, ent in enumerate(entities):
            # Sample the companions from every position except the current one.
            others = entities[:i] + entities[i + 1:]
            ent_units.append([ent] + random.sample(others, synthetic_column_size - 1))
    else:
        # The operands used to be reversed, which produced a negative repeat count
        # and therefore no padding at all.
        # NOTE(review): confirm the classifiers were trained on padded short columns.
        padding = ['NaN'] * (synthetic_column_size - len(entities))
        ent_units.append(entities + padding)
    return ent_units

def generate_synthetic_columns_sliding_window(entities, synthetic_column_size):
    """Build synthetic columns from consecutive windows over ``entities``.

    Args:
        entities: list of cell values.
        synthetic_column_size: number of values every synthetic column must hold.

    Returns:
        A list of synthetic columns (lists). With enough entities, one column
        per window position (step 1). With too few entities a single column is
        returned: all entities followed by ``'NaN'`` placeholders up to the
        requested size.
    """
    ent_units = list()
    if len(entities) >= synthetic_column_size:
        # List slicing cannot fail here, so the former try/except (which could
        # re-append a stale window) is gone.
        for start in range(len(entities) - synthetic_column_size + 1):
            ent_units.append(entities[start:start + synthetic_column_size])
    else:
        # The operands used to be reversed, which produced a negative repeat count
        # and therefore no padding at all.
        # NOTE(review): confirm the classifiers were trained on padded short columns.
        padding = ['NaN'] * (synthetic_column_size - len(entities))
        ent_units.append(entities + padding)
    return ent_units

def synthetic_columns2sequence(ent_units, sequence_size):
    """Flatten the values of a synthetic column into a fixed-length word list.

    Each value is converted to text, the characters ``_ - . / " '`` are turned
    into spaces, the text is tokenised and lower-cased, and only purely
    alphabetic tokens are kept. The concatenated words are then cut to
    ``sequence_size`` or right-padded with ``'NaN'`` to reach it.
    """
    separators = str.maketrans({ch: ' ' for ch in '_-./"\''})
    words = []
    for unit in ent_units:
        cleaned = str(unit).translate(separators)
        tokens = ' '.join(tokenize(cleaned)).lower().split()
        words.extend(token for token in tokens if token.isalpha())

    shortfall = sequence_size - len(words)
    if shortfall > 0:
        return words + ['NaN'] * shortfall
    return words[0:sequence_size]
    
def sequence2matrix(word_seq, sequence_size, w2v_model):
    """Turn a word sequence into a matrix of word vectors.

    Args:
        word_seq: list of words; ``'NaN'`` marks a padding position.
        sequence_size: number of rows of the result.
        w2v_model: object exposing ``vector_size`` and ``wv``; ``wv`` must
            support ``word in wv`` and ``wv[word]``.

    Returns:
        A numpy array of shape ``(sequence_size, vector_size, 1)``. Row ``i``
        holds the vector of ``word_seq[i]``; rows for padding, for words
        without a vector, and for positions beyond the sequence stay zero.
    """
    ent_v = np.zeros((sequence_size, w2v_model.vector_size, 1))
    word_vectors = w2v_model.wv
    # Words beyond sequence_size have no row to go to; ignoring them avoids an IndexError.
    for i, word in enumerate(word_seq[:sequence_size]):
        # Membership is tested on the keyed vectors themselves rather than on
        # `wv.vocab`, an attribute that gensim 4 no longer provides.
        if word != 'NaN' and word in word_vectors:
            ent_v[i] = word_vectors[word].reshape((w2v_model.vector_size, 1))
    return ent_v

In [None]:
import random
import math
import numpy as np
import time


In [None]:
def embedding(cell_values):
    """Embed the cell values of one column as a stack of word-vector matrices.

    The values are grouped into sliding-window synthetic columns of
    ``FLAGS.synthetic_column_size`` values, each synthetic column is flattened
    into a word sequence of ``FLAGS.sequence_size`` words, and each sequence is
    mapped to its word vectors with the global ``w2v_model``.

    Returns:
        A numpy array of shape
        ``(n_synthetic_columns, FLAGS.sequence_size, w2v_model.vector_size, 1)``.
    """
    seq_len = FLAGS.sequence_size
    windows = generate_synthetic_columns_sliding_window(cell_values, FLAGS.synthetic_column_size)
    sequences = [synthetic_columns2sequence(window, seq_len) for window in windows]

    X = np.zeros((len(sequences), seq_len, w2v_model.vector_size, 1))
    for row, sequence in enumerate(sequences):
        X[row] = sequence2matrix(sequence, seq_len, w2v_model)
    return X

# X = embedding(unique_cell_values)

In [None]:
import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D
# from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
# import os
# import numpy as np
# import matplotlib.pyplot as plt
# from tensorflow import keras
# from IPython.display import clear_output

In [None]:
# Number of top-ranked candidate classes that are scored for every column
limit = 5

# Resume from the checkpoint of an earlier run when one can be read; otherwise start empty.
try:
    dict_predictions = load_json(output_folder+'dict_predictions.json')
except (OSError, ValueError):
    # OSError: the checkpoint is missing or unreadable; ValueError: it is not valid JSON.
    # Anything else (e.g. an interrupt or a NameError) is no longer hidden.
    dict_predictions = dict()

# for filename in tqdm(dict_cand):
# Only the first 200 files of dict_cand are processed.
for filename in tqdm(dict(itertools.islice(dict_cand.items(), 200))):
    if filename not in dict_predictions.keys():
        # Checkpoint the files completed so far BEFORE starting a new one, so an
        # interrupted run never persists a half-filled entry.
        with open(('output/dict_predictions.json'), 'w') as fp:
            json.dump(dict_predictions, fp)

        dict_predictions[filename] = dict()
        for col in dict_cand[filename]:
            dict_predictions[filename][col] = dict()
            # Get the candidate classes identified in previous steps. there is a limit variable in case we want to focus on the top x portion of that list
            candidate_cls = dict_cand[filename][col]['class_without_hr'][:limit]
    #         candidate_cls = trained_models
            cell_values = data[filename]['data'][col]

            print(filename, col)
            actual_cls = data[filename]['gt'][col]
            dict_predictions[filename][col]['gt'] = actual_cls
            print(actual_cls)

            # Each distinct cell value contributes once to the embedding.
            unique_cell_values = list(set(cell_values))
            X = embedding(unique_cell_values)

            results = list()
            for cls in tqdm(candidate_cls):
                # load the model
                try:
                    model = load_model(cnn_model_directory, cls)
                    y_pred = tf.keras.activations.sigmoid(model.predict(X)).numpy().round()
                    # Score = percentage of synthetic columns predicted as this class.
                    results.append((cls, round(y_pred.sum()*100/X.shape[0], 2)))
                    tf.keras.backend.clear_session()
                except Exception:
                    # Best effort: a class whose model cannot be loaded or applied is
                    # skipped. `Exception` (not a bare except) keeps a kernel interrupt
                    # able to stop this long-running loop.
                    pass
#                 print(cls)
            # Highest score first.
            results = sorted(results, key=lambda x: x[1], reverse=True)
            dict_predictions[filename][col]['cand_cls'] = candidate_cls
            dict_predictions[filename][col]['pred'] = results
            clear_output(wait=True)


# Keep a timestamped copy of the complete predictions of this run.
with open(('output/dict_predictions-%s.json' % time.strftime("%Y%m%d-%H%M%S")), 'w') as fp:
    json.dump(dict_predictions, fp)

In [None]:
from rdflib import Graph
from SPARQLWrapper import SPARQLWrapper, JSON, N3
from pprint import pprint
import requests
import xml.etree.ElementTree as ET

In [None]:
def dbo_sparql_results(query_string):
    """Run a SPARQL query on the DBpedia endpoint and return DBpedia-ontology class names.

    Args:
        query_string: SPARQL query text. Every result row is read through the
            first variable of the first row.

    Returns:
        A list with, for each row whose value contains the DBpedia ontology
        prefix, the part of the value after that prefix. Values outside the
        ontology are ignored. Failures while querying or decoding are swallowed
        on purpose: the classes gathered up to that point (possibly none) are
        returned.
    """
    classes = list([])
    dbo_prefix = 'http://dbpedia.org/ontology/'

    sparql = SPARQLWrapper('https://dbpedia.org/sparql')
    sparql.setQuery(query_string)

    try:
        sparql.setReturnFormat(JSON)
        bindings = sparql.query().convert()['results']['bindings']
        if bindings:
            # The variable name is the same for every row, so look it up once.
            variable = list(bindings[0].keys())[0]
            for entity_class in bindings:
                value = entity_class[variable]['value']
                if dbo_prefix in value:
                    classes.append(value.split(dbo_prefix)[1])
    except Exception:
        # Best effort, but `Exception` (not a bare except) lets a kernel interrupt through.
        pass

    return classes

def get_dbo_subclass(superClass):
    """Return the DBpedia-ontology classes returned as rdfs:subClassOf ``dbo:<superClass>``.

    ``superClass`` is inserted verbatim into the query after the ``dbo:`` prefix.
    The result is whatever dbo_sparql_results() extracts for the query, i.e. a
    list of class names without the ontology prefix (empty on failure).
    """
    
    query_string = f'''
    SELECT distinct ?subClass 
    WHERE {{ ?subClass rdfs:subClassOf dbo:{superClass}. }}
    '''
    return dbo_sparql_results(query_string)


def get_dbo_superclass(subclass):
    """Return the DBpedia-ontology classes that ``dbo:<subclass>`` is rdfs:subClassOf.

    ``subclass`` is inserted verbatim into the query after the ``dbo:`` prefix.
    The result is whatever dbo_sparql_results() extracts for the query, i.e. a
    list of class names without the ontology prefix (empty on failure).
    """
    
    query_string = f'''
    SELECT distinct ?superClass 
    WHERE {{ dbo:{subclass} rdfs:subClassOf ?superClass . }}
    '''
    
    return dbo_sparql_results(query_string)


def get_dbo_superclasses(subclass):
    """Return the chain of ancestors of ``subclass`` in the DBpedia ontology.

    The chain is built by repeatedly asking for the superclasses of the current
    class and following the FIRST one returned, until a class has no
    DBpedia-ontology superclass.

    Args:
        subclass: class name without the ontology prefix.

    Returns:
        The ancestors, nearest first; an empty list when the first lookup fails
        or finds nothing.
    """
    classes = list([])
    # Classes already on the chain; a list is used so any value type is accepted.
    visited = [subclass]

    try:
        parent = get_dbo_superclass(subclass)
    except Exception:
        return []

    # Stop as soon as a class repeats: the hierarchy comes from a remote
    # endpoint, and a cycle in it would otherwise keep this loop running forever.
    while len(parent) > 0 and parent[0] not in visited:
        visited.append(parent[0])
        classes.append(parent[0])
        parent = get_dbo_superclass(parent[0])
    return classes

# Example: display the ancestor chain that get_dbo_superclasses() returns for 'Actor'
get_dbo_superclasses('Actor')

In [None]:
def lookup_assessment(dict_cand, threshold = 500):
    """Measure how often the ground-truth class is among the top candidates.

    Args:
        dict_cand: mapping ``file -> column -> record`` where each record has
            ``'cand_cls'`` (ranked candidate classes from the lookup),
            ``'pred'`` (ranked ``(class, score)`` pairs from the CNN models)
            and ``'gt'`` (the expected class).
        threshold: only the first ``threshold`` entries of each ranked list
            are considered.

    Returns:
        A tuple of four percentages over all columns, rounded to 2 decimals:
        ``(lookup exact hits, lookup score, CNN exact hits, CNN score)``.
        A "score" gives 1 point for an exact hit and 0.5 when only an ancestor
        of the expected class is among the candidates. All four values are
        ``0.0`` when there are no columns.
    """
    found = 0
    found_cnn = 0
    total_columns = 0
    parent_found = 0
    parent_found_in_lookup = 0

    # Ancestor chains come from a remote endpoint; fetch each one at most once
    # per call. The key is the text that ends up in the query.
    superclass_cache = dict()

    def cached_superclasses(cls):
        key = str(cls)
        if key not in superclass_cache:
            superclass_cache[key] = get_dbo_superclasses(cls)
        return superclass_cache[key]

    def score(candidates, actual_cls):
        # Returns (exact_hit, credit) for one ranked candidate list.
        if actual_cls in candidates:
            return 1, 1
        # else we give half a point in case the predicted value is in the hierarchy (i.e. parent) of the actual value
        if any(parent in candidates for parent in cached_superclasses(actual_cls)):
            return 0, 0.5
        return 0, 0

    for file in dict_cand:
        for col in dict_cand[file]:
            column = dict_cand[file][col]
            candidate_class_lookup = column['cand_cls'][:threshold]
            candidate_class_cnn = [x[0] for x in column['pred'][:threshold]]
            actual_cls = column['gt']

            hit, credit = score(candidate_class_lookup, actual_cls)
            found += hit
            parent_found_in_lookup += credit

            hit, credit = score(candidate_class_cnn, actual_cls)
            found_cnn += hit
            parent_found += credit

            total_columns += 1

    # No columns at all used to end in a ZeroDivisionError.
    if total_columns == 0:
        return (0.0, 0.0, 0.0, 0.0)

    return (round(100*found/total_columns,2),round(100*parent_found_in_lookup/total_columns,2),round(100*found_cnn/total_columns,2),round(100*parent_found/total_columns,2))


In [None]:
# Load a saved predictions file; presumably produced with the random synthetic-column
# generator and limit = 5, as the file name suggests — TODO confirm.
dict_predictions_top5_random = load_json(output_folder+'dict_predictions_top5_random.json')

In [None]:
import matplotlib.pyplot as plt
import numpy as np


results = []

# Largest number of distinct predicted classes on any single column.
# (The previous version iterated over the KEYS of each column record, so it
# counted distinct first letters of the key names instead of classes.)
max_cand = 0
for file in dict_predictions_top5_random:
    for col in dict_predictions_top5_random[file]:
        candidate_cls = {element[0] for element in dict_predictions_top5_random[file][col]['pred']}
        max_cand = max(max_cand, len(candidate_cls))

# Evaluate the top-1 ... top-5 candidate lists; each entry is (top size, 4 percentages).
for i in range(1, 6):
    results.append((i, lookup_assessment(dict_predictions_top5_random, i)))

#... and plot the results: one labelled line per value returned by lookup_assessment
series_labels = [
    'Lookup: exact match',
    'Lookup: exact + 0.5 x ancestor match',
    'CNN: exact match',
    'CNN: exact + 0.5 x ancestor match',
]
top_sizes = [x[0] for x in results]
fig, ax = plt.subplots()
for series_index, series_label in enumerate(series_labels):
    ax.plot(top_sizes, [x[1][series_index] for x in results], label=series_label)
ax.set_xlabel('Number of top candidates considered')
ax.set_ylabel('Columns (%)')
ax.set_title('dict_predictions_top5_random')
ax.legend()
plt.show()
results

In [None]:
# Load a saved predictions file; presumably produced with the sliding-window
# synthetic-column generator and limit = 5, as the file name suggests — TODO confirm.
dict_predictions_top5_sliding = load_json(output_folder+'dict_predictions_top5_sliding.json')

In [None]:

import matplotlib.pyplot as plt
import numpy as np


results = []

# Evaluate the top-1 ... top-5 candidate lists; each entry is (top size, 4 percentages).
for i in range(1, 6):
    results.append((i, lookup_assessment(dict_predictions_top5_sliding, i)))

#... and plot the results: one labelled line per value returned by lookup_assessment
series_labels = [
    'Lookup: exact match',
    'Lookup: exact + 0.5 x ancestor match',
    'CNN: exact match',
    'CNN: exact + 0.5 x ancestor match',
]
top_sizes = [x[0] for x in results]
fig, ax = plt.subplots()
for series_index, series_label in enumerate(series_labels):
    ax.plot(top_sizes, [x[1][series_index] for x in results], label=series_label)
ax.set_xlabel('Number of top candidates considered')
ax.set_ylabel('Columns (%)')
ax.set_title('dict_predictions_top5_sliding')
ax.legend()
plt.show()
results

In [None]:
# max_cand

In [None]:
# Overwrite the predictions checkpoint with the current in-memory dict_predictions
with open(('output/dict_predictions.json'), 'w') as fp:
        json.dump(dict_predictions, fp)

In [None]:
# Display the stored predictions of every column of one example file
dict_predictions_top5_sliding['50245608_0_871275842592178099']

In [None]:
# Expected class of column '0' of the example file
print(dict_predictions_top5_sliding['50245608_0_871275842592178099']['0']['gt'])

# For every (class, score) prediction of that column, show its ancestors and its
# sub-classes in the DBpedia ontology (each lookup queries the remote endpoint).
for cls in dict_predictions_top5_sliding['50245608_0_871275842592178099']['0']['pred']:
    print(cls)
    print(f"Parents: {get_dbo_superclasses(cls[0])}")
    print(f"Offspring: {get_dbo_subclass(cls[0])}")

In [None]:
# print(dict_predictions_top5_sliding['8468806_0_4382447409703007384']['1']['gt'])

# Summarise the first 10 files: for every column show the expected class, the
# top-ranked predicted class and the column title, followed by the ontology
# neighbourhood of the predicted class.
for filename in itertools.islice(dict_predictions_top5_sliding, 10):

    # Position of the current column; column titles are read by position.
    index = 0
    print(f"-----------------------------------------------")
    print(f"Filename: {filename}")
    for col in dict_predictions_top5_sliding[filename]:
        column_result = dict_predictions_top5_sliding[filename][col]
        actual_cls = column_result['gt']
        predictions = column_result['pred']
        # 'pred' is empty when no model produced a score for the column.
        top1_clc = predictions[0][0] if predictions else None
        col_title = data[filename]['column_titles'][index]
        index += 1
        print(f"Column: {col},Actual class: {actual_cls}, Predicted Class: {top1_clc}, Column Title: {col_title}")
        # Describe THIS column's predicted class; the previous version printed the
        # hierarchy of a `cls` variable left over from another cell's loop.
        if top1_clc is not None:
            print(f"Parents: {get_dbo_superclasses(top1_clc)}")
            print(f"Offspring: {get_dbo_subclass(top1_clc)}")

In [None]:
# NOTE(review): this displays the whole `data` dictionary loaded from data.json;
# consider showing a small slice instead if the output is large.
data

In [None]:
# Example: display the ancestor chain that get_dbo_superclasses() returns for 'Lake'
get_dbo_superclasses('Lake')

In [None]:
# Example: display the sub-classes that get_dbo_subclass() returns for 'Lake'
get_dbo_subclass('Lake')

Retrieve all folders under the cnn_models root

In [None]:
def get_cnn_models(directory):
    """Return the names of the folders located directly under ``directory``.

    Args:
        directory: path of the root folder to inspect.

    Returns:
        A set with the name of every immediate sub-folder. Nested folders are
        reported through their top-level ancestor only, and plain files are
        ignored. An empty set is returned when ``directory`` does not exist.
    """
    # A missing root used to surface as an unrelated ValueError from list.remove().
    if not os.path.isdir(directory):
        return set()
    # Reading the entry names directly avoids any dependence on the path separator,
    # so the result is the same on Windows and on POSIX systems.
    # follow_symlinks=False matches os.walk's default of not descending into links.
    with os.scandir(directory) as entries:
        return {entry.name for entry in entries if entry.is_dir(follow_symlinks=False)}

# Folder names found under cnn_model_directory, displayed as the cell output
trained_models = list(get_cnn_models(cnn_model_directory))
trained_models

In [None]:
# model.summary()

In [None]:
# results