In [10]:
import asyncio
import os
import sys
import time
import traceback

import redis
from keras.models import load_model

import utils.utils as utils
from phases.data_preparation import DataPreparation
from phases.featuresExtractionRevision import FeaturesExtractionRevision
from phases.feauturesExtraction import FeauturesExtraction
from phases.lookup import Lookup
from phases.prediction import Prediction
from phases.decision import Decision
from wrapper.lamAPI import LamAPI
from wrapper.Database import MongoDBWrapper  # MongoDB database wrapper




# Wall-clock start of the run. It is taken before the models are loaded, so the
# elapsed time computed at the end of this cell includes model loading.
start = time.time()

# Saved Keras models: `pn_model` is handed to the first Prediction call ("rho")
# and `rn_model` to the second one ("rho'") further down in this cell.
pn_neural_path = "./ml_models/Linker_PN_100.h5"
rn_neural_path = "./ml_models/Linker_RN_100.h5"

pn_model = load_model(pn_neural_path)    
rn_model = load_model(rn_neural_path)    


# Connection settings come from the environment; `os.environ[...]` raises
# KeyError when a variable is unset, and REDIS_JOB_DB must parse as an int.
REDIS_ENDPOINT = os.environ["REDIS_ENDPOINT"]
REDIS_JOB_DB = int(os.environ["REDIS_JOB_DB"])
LAMAPI_HOST = os.environ["LAMAPI_ENDPOINT"]
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]


# Redis client; in this cell it is only used to set the "STOP" key when no
# table document is found.
job_active = redis.Redis(host=REDIS_ENDPOINT, db=REDIS_JOB_DB)

# Initialize MongoDB wrapper and get collections for different data models
mongoDBWrapper = MongoDBWrapper()
# `log_c` goes to Lookup, `row_c` is queried for the input table, and the
# remaining collections are passed to Decision through the `collections` dict.
log_c = mongoDBWrapper.get_collection("log")
row_c = mongoDBWrapper.get_collection("row")
candidate_scored_c = mongoDBWrapper.get_collection("candidateScored")
cea_c = mongoDBWrapper.get_collection("cea")
cpa_c = mongoDBWrapper.get_collection("cpa")
cta_c = mongoDBWrapper.get_collection("cta")
cea_prelinking_c = mongoDBWrapper.get_collection("ceaPrelinking")

# Look up one document for the hard-coded table name "test_cb"; the result may
# be None, which is handled right below.
data = row_c.find_one({"tableName": "test_cb"})

if data is None:
    # Nothing to process: report it, set the "STOP" key in Redis and stop the
    # cell (sys.exit raises SystemExit).
    print("No data to process", flush=True)
    job_active.set("STOP", "")
    sys.exit(0)

# Unpack the fields of the fetched document that the rest of the cell uses.
rows_data = data["rows"]
kg_reference = data["kgReference"]
limit = data["candidateSize"]
column_metadata = data["column"]
target = data["target"]
_id = data["_id"]
dataset_name = data["datasetName"]
table_name = data["tableName"]
page = data["page"]
header = data["header"]

# LamAPI client configured with the document's knowledge-graph reference.
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_TOKEN, mongoDBWrapper, kg=kg_reference)

# Update payload for the row document (status, time, column tags, target).
# NOTE(review): it is filled in below but never written to any collection in
# this cell - confirm whether an update call is missing.
obj_row_update = {"status": "DONE", "time": None}
dp = DataPreparation(header, rows_data, lamAPI)


# Top-level `await` relies on the notebook's running event loop.
# The call returns the (column_metadata, target) pair that replaces the inputs.
column_metadata, target = await dp.compute_datatype(column_metadata, target)
# Tag the subject column: the index stored in target["SUBJ"] is used as a string key.
column_metadata[str(target["SUBJ"])] = "SUBJ"
obj_row_update["column"] = column_metadata
# Same tags reshaped as a list of {idColumn, tag} entries with integer column ids.
obj_row_update["metadata"] = {
    "column": [{"idColumn": int(id_col), "tag": column_metadata[id_col]} for id_col in column_metadata]
}
obj_row_update["target"] = target
    
# Table identifiers handed to the pre-linking helper and to Decision.
metadata = {
    "datasetName": dataset_name,
    "tableName": table_name,
    "kgReference": kg_reference,
    "page": page
}

# Output collections, keyed by name, handed to Decision.
collections = {
    "ceaPrelinking": cea_prelinking_c,
    "cea": cea_c,
    "cta": cta_c,
    "cpa": cpa_c,
    "candidateScored": candidate_scored_c
}
# NOTE(review): the return value is unused - presumably this normalizes the
# rows held by `dp` in place; confirm.
dp.rows_normalization()
# `l` is kept under this short name because later inspection cells call `l.get_rows()`.
l = Lookup(data, lamAPI, target, log_c, kg_reference, limit)
await l.generate_candidates()
rows = l.get_rows()
# First pass: features from the looked-up rows, then the PN model with label "rho".
# NOTE(review): Prediction's result is discarded - presumably it annotates `rows` in place; confirm.
features = await FeauturesExtraction(rows, lamAPI).compute_feautures()
Prediction(rows, features, pn_model).compute_prediction("rho")
# Pre-linking snapshot is taken between the two prediction passes.
cea_preliking_data = utils.get_cea_pre_linking_data(metadata, rows)
# Revision pass: features recomputed from `rows`, then the RN model with label "rho'".
revision = FeaturesExtractionRevision(rows)
features = revision.compute_features()
Prediction(rows, features, rn_model).compute_prediction("rho'")
# NOTE(review): the Decision instance is stored in `storage` but no method is
# called on it here; whether construction alone persists the results cannot be
# told from this cell - confirm.
storage = Decision(metadata, cea_preliking_data, rows, revision._cta, revision._cpa_pair, collections)
# Elapsed seconds since `start`, rounded to two decimals.
end = time.time()
execution_time = round(end - start, 2)
obj_row_update["time"] = execution_time
print("End", flush=True)

[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 654us/step
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 567us/step
End


In [16]:
# Peek at the text of one table cell (row index 3, cell index 4) held by the Lookup instance `l`.
l.get_rows()[3].get_cells()[4].content

'kth royal insitute of technology'

In [None]:
# Inspect the candidates of the cell at row index 3, cell index 4.
# NOTE(review): `candidates` is called as a method here while `content` on the
# same object is read as a plain attribute, and this cell has no recorded
# execution - confirm the call is valid.
l.get_rows()[3].get_cells()[4].candidates()

In [7]:
from keras.models import load_model

# Load the model saved as "Linker_PN.h5" and print its layer summary.
# NOTE(review): this rebinds `pn_neural_path` / `pn_model`, which the pipeline
# cell sets from "Linker_PN_100.h5".
pn_neural_path = "./ml_models/Linker_PN.h5"
pn_model = load_model(pn_neural_path)
pn_model.summary()



In [4]:
from keras.models import load_model

# Load the model saved as "Linker1.0_PN.h5" and display its input shape
# (the recorded output is (None, 29)).
# NOTE(review): this rebinds `pn_neural_path` / `pn_model` as well.
pn_neural_path = "./ml_models/Linker1.0_PN.h5"
pn_model = load_model(pn_neural_path)
pn_model.input_shape



(None, 29)

In [6]:
from keras.models import load_model

# Load the model saved as "Linker_RN.h5" and display its input shape.
# The rn_* names keep the PN variables from being silently rebound to an RN model.
# NOTE(review): this rebinds `rn_neural_path` / `rn_model`, which the pipeline
# cell sets from "Linker_RN_100.h5"; that cell reloads both models on every run.
rn_neural_path = "./ml_models/Linker_RN.h5"
rn_model = load_model(rn_neural_path)
rn_model.input_shape



(None, 29)

In [1]:
import asyncio
import os
import sys
import time
import traceback

import redis
from keras.models import load_model

import utils.utils as utils
from phases.data_preparation import DataPreparation
from phases.featuresExtractionRevision import FeaturesExtractionRevision
from phases.feauturesExtraction import FeauturesExtraction
from phases.lookup import Lookup
from phases.prediction import Prediction
from phases.decision import Decision
from wrapper.lamAPI import LamAPI
from wrapper.Database import MongoDBWrapper  # MongoDB database wrapper




# Connection settings come from the environment; `os.environ[...]` raises
# KeyError when a variable is unset.
# NOTE(review): the two REDIS_* values are read but no Redis client is created
# in this cell.
REDIS_ENDPOINT = os.environ["REDIS_ENDPOINT"]
REDIS_JOB_DB = int(os.environ["REDIS_JOB_DB"])
LAMAPI_HOST = os.environ["LAMAPI_ENDPOINT"]
LAMAPI_TOKEN = os.environ["LAMAPI_TOKEN"]



# Initialize MongoDB wrapper and get collections for different data models
mongoDBWrapper = MongoDBWrapper()
log_c = mongoDBWrapper.get_collection("log")
row_c = mongoDBWrapper.get_collection("row")
candidate_scored_c = mongoDBWrapper.get_collection("candidateScored")
cea_c = mongoDBWrapper.get_collection("cea")
cpa_c = mongoDBWrapper.get_collection("cpa")
cta_c = mongoDBWrapper.get_collection("cta")
cea_prelinking_c = mongoDBWrapper.get_collection("ceaPrelinking")
# LamAPI client with the knowledge graph hard-coded to "crunchbase"
# (the pipeline cell takes it from the table document instead).
lamAPI = LamAPI(LAMAPI_HOST, LAMAPI_TOKEN, mongoDBWrapper, kg="crunchbase")

2024-07-16 21:35:00.459204: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-16 21:35:00.561409: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-16 21:35:00.936229: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Probe `lamAPI.lookup` with the single mention "facebook"; the recorded output
# is a list of candidate dicts (id, name, description, types, numeric feature scores).
await lamAPI.lookup(["facebook"])

[{'id': 'Q355',
  'name': 'FACEBOOK',
  'description': 'online social media and social networking service',
  'types': [{'id': 'Q3220391', 'name': 'social networking service'},
   {'id': 'Q35127', 'name': 'website'}],
  'ambiguity_mention': 0.241,
  'corrects_tokens': 1.0,
  'ntoken_mention': 1,
  'ntoken_entity': 1,
  'length_mention': 8,
  'length_entity': 8,
  'popularity': 0.25,
  'pos_score': 0.01,
  'es_score': 1.0,
  'ed_score': 1.0,
  'jaccard_score': 1.0,
  'jaccardNgram_score': 1.0,
  'cosine_similarity': 1.0},
 {'id': 'Q380',
  'name': 'Facebook',
  'description': 'American social media and technology company',
  'types': [{'id': 'Q783794', 'name': 'company'},
   {'id': 'Q4830453', 'name': 'business'},
   {'id': 'Q891723', 'name': 'public company'}],
  'ambiguity_mention': 0.241,
  'corrects_tokens': 1.0,
  'ntoken_mention': 1,
  'ntoken_entity': 1,
  'length_mention': 8,
  'length_entity': 8,
  'popularity': 0.06,
  'pos_score': 0.03,
  'es_score': 0.979,
  'ed_score': 1.0,

In [5]:
# Probe `lamAPI.labels` with one id. The recorded output is a dict keyed by id
# whose entries carry a "labels" dict per language - the shape the aggregation
# cells read via ["labels"].get("en").
await lamAPI.labels(["Q5"])

{'Q5': {'kind': 'type',
  'NERtype': 'OTHERS',
  'url': 'https://www.wikidata.org/wiki/Q5',
  'description': 'any member of Homo sapiens, unique extant species of the genus Homo, from embryo to adult',
  'labels': {'en': 'human'},
  'aliases': {'en': ['people',
    'humans',
    'men',
    'non-fictional human',
    'individual human',
    'man',
    'person',
    'nonfictional human',
    'human being',
    'individual Homo sapien']}}}

In [5]:
# Bare attribute access (no call, no await): this only displays the bound
# method object and does not fetch any labels.
lamAPI.labels

<bound method LamAPI.labels of <wrapper.lamAPI.LamAPI object at 0xffff1dbd1a90>>

In [16]:
# NOTE(review): `column` is not assigned in this cell; presumably it is the
# loop variable left over from the CTA aggregation cell - confirm. The value
# shown depends on which cells ran before.
column

'5'

In [None]:
# NOTE(review): `result` is not defined in any visible cell and this cell has
# no recorded execution - it will raise NameError on a fresh kernel unless
# `result` is created elsewhere.
result["winningCandidates"]["0"]

In [6]:
from collections import defaultdict

# Initialize dictionaries to aggregate results and count occurrences
aggregated_winning_candidates = defaultdict(lambda: defaultdict(float))
candidate_counts = defaultdict(lambda: defaultdict(int))

# Fetch documents from the collection
documents = cta_c.find({"datasetName": "test", "tableName": "film"})

# Aggregate winning candidates and count occurrences
for doc in documents:
    winning_candidates = doc.get("winningCandidates", {})
    for key, candidates in winning_candidates.items():
        for candidate, score in candidates.items():
            aggregated_winning_candidates[key][candidate] += score
            candidate_counts[key][candidate] += 1

# Normalize the aggregated scores by the number of occurrences
normalized_winning_candidates = {}
for key, candidates in aggregated_winning_candidates.items():
    normalized_winning_candidates[key] = {candidate: round(score / candidate_counts[key][candidate], 3) for candidate, score in candidates.items()}

qids_types = set()
column_to_types = {}
for column in normalized_winning_candidates:
    top_10_types = sorted(normalized_winning_candidates[column].items(), key=lambda x: x[1], reverse=True)[0:10]
    column_to_types[column] = {}
    for k, v in top_10_types:
        column_to_types[column][k] = v 
        qids_types.add(k)
        
qids_to_labels = await lamAPI.labels(list(qids_types))
new_column_to_types = {}
for column in column_to_types:
    new_column_to_types[column] = [] 
    for k, v in column_to_types[column].items():
        new_column_to_types[column].append({"id": k, "label": qids_to_labels[k]["labels"].get("en"), "score": v})

In [26]:
from collections import defaultdict
import asyncio

async def fetch_labels(qids):
    """Await `lamAPI.labels(qids)` on the notebook-level client and return its result."""
    labels = await lamAPI.labels(qids)
    return labels
    

# Initialize dictionaries to aggregate results and count occurrences
aggregated_winning_predicates = defaultdict(lambda: defaultdict(float))
predicate_counts = defaultdict(lambda: defaultdict(int))

# Fetch documents from the collection
documents = cpa_c.find({"datasetName": "test", "tableName": "film"})

# Aggregate winning predicates and count occurrences
for doc in documents:
    winning_predicates = doc.get("winningCandidates", {})
    for outer_key, inner_dict in winning_predicates.items():
        for inner_key, predicates in inner_dict.items():
            for predicate, score in predicates.items():
                aggregated_winning_predicates[(outer_key, inner_key)][predicate] += score
                predicate_counts[(outer_key, inner_key)][predicate] += 1

# Normalize the aggregated scores by the number of occurrences
normalized_winning_predicates = {}
for key, predicates in aggregated_winning_predicates.items():
    normalized_winning_predicates[key] = {predicate: round(score / predicate_counts[key][predicate], 3) for predicate, score in predicates.items()}

qids_predicates = set()
pair_to_predicates = {}
for pair in normalized_winning_predicates:
    top_10_predicates = sorted(normalized_winning_predicates[pair].items(), key=lambda x: x[1], reverse=True)[0:5]
    pair_to_predicates[pair] = {}
    for k, v in top_10_predicates:
        pair_to_predicates[pair][k] = v 
        qids_predicates.add(k)
        
qids_to_labels = await fetch_labels(list(qids_predicates))
new_pair_to_predicates = []
for pair, predicates in pair_to_predicates.items():
    source_column, target_column = pair
    new_predicates = []
    for k, v in predicates.items():
        new_predicates.append({"id": k, "label": qids_to_labels[k]["labels"].get("en"), "score": v})
    new_pair_to_predicates.append({
        "idSourceColumn": source_column,
        "idTargetColumn": target_column,
        "predicates": new_predicates
    })

In [25]:
# Display the per-pair predicate ranking built by the CPA aggregation cell
# (a list of {idSourceColumn, idTargetColumn, predicates} dicts).
new_pair_to_predicates

[{'idSourceColumn': '0',
  'idTargetColumn': '1',
  'predicates': [{'id': 'P57', 'label': 'director', 'score': 0.958},
   {'id': 'P58', 'label': 'screenwriter', 'score': 0.489},
   {'id': 'P162', 'label': 'producer', 'score': 0.367},
   {'id': 'P161', 'label': 'cast member', 'score': 0.242},
   {'id': 'P170', 'label': 'creator', 'score': 0.2}]},
 {'idSourceColumn': '0',
  'idTargetColumn': '2',
  'predicates': [{'id': 'P1083', 'label': 'maximum capacity', 'score': 0.09},
   {'id': 'P2047', 'label': 'duration', 'score': 0.071},
   {'id': 'P1093', 'label': 'gross tonnage', 'score': 0.045},
   {'id': 'P4519', 'label': 'payload mass', 'score': 0.029},
   {'id': 'P2043', 'label': 'length', 'score': 0.018}]},
 {'idSourceColumn': '0',
  'idTargetColumn': '3',
  'predicates': [{'id': 'P272', 'label': 'production company', 'score': 0.747},
   {'id': 'P750', 'label': 'distributed by', 'score': 0.609},
   {'id': 'P264', 'label': 'record label', 'score': 0.106},
   {'id': 'P31', 'label': 'instance

In [29]:
# Manual check of one averaged score: summed score for column "0", candidate "Q11424", divided by 2.
# NOTE(review): the divisor 2 is hard-coded - presumably the number of
# documents containing this candidate (candidate_counts["0"]["Q11424"]); confirm.
# Indexing these nested defaultdicts inserts the keys if they are missing.
aggregated_winning_candidates["0"]["Q11424"]/2

0.9335

In [None]:
# Fetch the labels for the ids collected in `qids_types` by the CTA aggregation
# cell (this cell has no recorded execution).
await lamAPI.labels(list(qids_types))