# Import libraries

In [1]:
import numpy as np

# LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# YAKE
import yake

# Word embeddings
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel

import time

# Simple similarity

In [None]:
# Jaccard similarity function
def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    Args:
        set1: Iterable of hashable values; duplicates are ignored.
        set2: Iterable of hashable values; duplicates are ignored.

    Returns:
        The size of the intersection divided by the size of the union.
        When both collections are empty the union is empty as well; 0.0 is
        returned in that case instead of raising ZeroDivisionError.
    """
    set1 = set(set1)
    set2 = set(set2)
    union = len(set1 | set2)
    if union == 0:
        # Nothing to compare: treat two empty collections as not similar.
        return 0.0
    return len(set1 & set2) / union

In [None]:
# Function to compute the Jaccard similarity of two tables based on the values in their columns
def column_similarity(tab1, tab2):
    """Average pairwise Jaccard similarity between the columns of two tables.

    Args:
        tab1: First table; accessed through ``.shape`` and ``.iloc``
            (presumably a pandas DataFrame -- confirm against callers).
        tab2: Second table, accessed the same way.

    Returns:
        The mean of the matrix whose entry (a, b) is the Jaccard similarity
        between column a of ``tab1`` and column b of ``tab2``.
    """
    n_cols_1 = tab1.shape[1]
    n_cols_2 = tab2.shape[1]

    scores = np.zeros((n_cols_1, n_cols_2))
    for a in range(n_cols_1):
        left = tab1.iloc[:, a]
        # Fill one row of the score matrix: this column against every column of tab2.
        scores[a, :] = [
            jaccard_similarity(left, tab2.iloc[:, b]) for b in range(n_cols_2)
        ]

    # The similarity of the tables is the mean over all column pairs.
    return scores.mean()

In [None]:
# Function to compute the Jaccard similarity of two tables based on the values in their rows
def row_similarity(tab1, tab2):
    """Average pairwise Jaccard similarity between the rows of two tables.

    Args:
        tab1: First table; accessed through ``.shape`` and ``.iloc``
            (presumably a pandas DataFrame -- confirm against callers).
        tab2: Second table, accessed the same way.

    Returns:
        The mean of the matrix whose entry (a, b) is the Jaccard similarity
        between row a of ``tab1`` and row b of ``tab2``.
    """
    n_rows_1 = tab1.shape[0]
    n_rows_2 = tab2.shape[0]

    scores = np.zeros((n_rows_1, n_rows_2))
    for a in range(n_rows_1):
        left = tab1.iloc[a, :]
        # Fill one row of the score matrix: this row against every row of tab2.
        scores[a, :] = [
            jaccard_similarity(left, tab2.iloc[b, :]) for b in range(n_rows_2)
        ]

    # The similarity of the tables is the mean over all row pairs.
    return scores.mean()

In [None]:
# Function to perform simple similarity
def rank_simple_similarity(input_table, data_lake, by='column', k=10):
    """Rank the tables of a data lake by Jaccard similarity to an input table.

    Args:
        input_table: The query table, forwarded to ``column_similarity`` or
            ``row_similarity``.
        data_lake: Sized iterable of candidate tables.
        by: ``'column'`` to compare column values, ``'row'`` to compare rows.
        k: Number of top-ranked tables to return.

    Returns:
        A tuple ``(indexes, similarities)`` holding the positions in
        ``data_lake`` of the k most similar tables and their scores, both in
        descending order of similarity. If ``by`` is neither ``'column'`` nor
        ``'row'`` and the data lake is not empty, a message is printed and an
        empty list is returned instead.
    """
    scores = np.zeros(len(data_lake))

    for position, candidate in enumerate(data_lake):
        if by == 'column':
            score = column_similarity(input_table, candidate)
        elif by == 'row':
            # Original author's note: this option ran for over 21 hours without finishing.
            score = row_similarity(input_table, candidate)
        else:
            print("The 'by' argument has to be either 'column' or 'row'. {} is not accepted".format(by))
            return []
        scores[position] = score

    descending_indexes = np.argsort(scores)[::-1]   # table positions, most similar first
    descending_scores = np.sort(scores)[::-1]       # matching scores, highest first

    return descending_indexes[:k], descending_scores[:k]



# Keyword extraction

## LDA

In [3]:
def keywords_lda(text, numKeywords = 10):
    """Extract weighted keywords from a text with a single-topic LDA model.

    Args:
        text: The document to analyse, passed to ``CountVectorizer`` as a
            one-document corpus.
        numKeywords: Maximum number of keywords to return.

    Returns:
        A list of ``(word, weight)`` tuples ordered from the highest to the
        lowest topic weight. Weights are divided by the sum of the returned
        weights, so they add up to 1 over the selected keywords.
    """
    # Document-term matrix of the single input document.
    count_vectorizer = CountVectorizer()
    doc_term_matrix = count_vectorizer.fit_transform([text])

    # Fit LDA with exactly one topic.
    num_topics = 1
    lda = LatentDirichletAllocation(n_components=num_topics)
    lda.fit(doc_term_matrix)

    # Pick the vocabulary entries with the largest weight in that topic.
    vocabulary = count_vectorizer.get_feature_names_out()
    topic_weights = lda.components_[0]
    best_indices = topic_weights.argsort()[::-1][:numKeywords]
    words = [vocabulary[idx] for idx in best_indices]
    raw_weights = [topic_weights[idx] for idx in best_indices]

    # NOTE (original author): try without normalizing. The raw weights
    # represent the importance over the whole table, not only relative to
    # the other selected keywords.
    total = sum(raw_weights)
    return [(word, weight / total) for word, weight in zip(words, raw_weights)]

## YAKE

In [37]:
def keywords_yake(text, numKeywords = 10, lenNGram = 1):
    """Extract keywords from a text with YAKE.

    Args:
        text: The document to analyse.
        numKeywords: Passed to the extractor as ``top``.
        lenNGram: Passed to the extractor as ``n`` (maximum n-gram size).

    Returns:
        The extractor's result sorted by the second element of each entry
        (the score) in descending order.
    """
    extractor = yake.KeywordExtractor(n=lenNGram, top=numKeywords)
    scored_keywords = extractor.extract_keywords(text)

    def score_of(entry):
        return entry[1]

    # NOTE(review): YAKE's documentation describes lower scores as more
    # relevant, so this puts the least relevant keyword first -- confirm
    # that descending order is intended.
    return sorted(scored_keywords, key=score_of, reverse=True)

## Weighted Jaccard similarity

In [5]:
def weighted_jaccard_similarity(keywords1, weights1, keywords2, weights2):
    """Compute the weighted Jaccard similarity between two keyword sets.

    The score is the sum of the smaller weight of every shared keyword
    divided by the sum of the larger weight of every keyword in the union
    (a keyword missing from one side counts as weight 0 there).

    Args:
        keywords1: Iterable of keywords of the first table.
        weights1: Mapping from each keyword in ``keywords1`` to its weight.
        keywords2: Iterable of keywords of the second table.
        weights2: Mapping from each keyword in ``keywords2`` to its weight.

    Returns:
        The weighted Jaccard similarity, or 0.0 when the denominator is zero
        (both keyword sets empty, or every weight equal to zero) instead of
        raising ZeroDivisionError.
    """
    # Accept any iterable, not only sets.
    keywords1 = set(keywords1)
    keywords2 = set(keywords2)

    shared = keywords1 & keywords2
    combined = keywords1 | keywords2

    numerator = sum(min(weights1[word], weights2[word]) for word in shared)
    denominator = sum(max(weights1.get(word, 0), weights2.get(word, 0)) for word in combined)

    if denominator == 0:
        # No keyword carries any weight: nothing can be shared.
        return 0.0
    return numerator / denominator

## Keyword Extraction (LDA/YAKE) + Weighted Jaccard Similarity

In [6]:
def find_keywords(text, method):
    """Dispatch to the requested keyword extraction method.

    Args:
        text: The document to extract keywords from.
        method: ``"lda"`` to use ``keywords_lda`` or ``"yake"`` to use
            ``keywords_yake`` (both with their default arguments).

    Returns:
        Whatever the selected extraction function returns.

    Raises:
        ValueError: If ``method`` is neither ``"lda"`` nor ``"yake"``.
    """
    if method == "lda":
        return keywords_lda(text)
    if method == "yake":
        return keywords_yake(text)

    raise ValueError("The method to extract the keywords has to be either 'lda' or 'yake'. '{}' is not accepted".format(method))


In [7]:
# Function to perform table discovery based on the similarity of the keywords of the tables
def rank_keywords(input_table_text, data_lake, keywordExtractionMethod, k=10):
    """Rank the tables of a data lake by keyword similarity to an input table.

    Keywords are extracted from every text with ``find_keywords`` and the
    tables are compared with ``weighted_jaccard_similarity``.

    Args:
        input_table_text: Text of the query table.
        data_lake: Sized iterable with the text of each candidate table.
        keywordExtractionMethod: Method name forwarded to ``find_keywords``.
        k: Number of top-ranked tables to return. ``None`` (or a value larger
            than the data lake) ranks every table, matching
            ``rank_embeddings``.

    Returns:
        A tuple ``(ranked_tables, ranked_similarities, running_time)``: the
        positions in ``data_lake`` of the top-k tables in descending order of
        similarity, their similarity scores, and the measured time in
        seconds. The measured time covers the keyword extraction of the input
        table, the similarity computations and the final sort; keyword
        extraction of the data lake tables is deliberately excluded.
    """
    # perf_counter is monotonic, so the accumulated intervals cannot be
    # distorted by system clock adjustments.
    start_time = time.perf_counter()

    numTables = len(data_lake)  # number of tables in the data lake

    # Find keywords of the input table and their corresponding weights
    input_table_words_topic = find_keywords(input_table_text, keywordExtractionMethod)
    input_table_weights = {t[0]: t[1] for t in input_table_words_topic}
    input_table_words = set(input_table_weights)

    running_time = time.perf_counter() - start_time

    similarities = np.zeros(numTables)
    for i, table_text in enumerate(data_lake):
        # Find keywords of the table and their corresponding weights (not timed)
        table_words_topic = find_keywords(table_text, keywordExtractionMethod)
        table_weights = {t[0]: t[1] for t in table_words_topic}
        table_words = set(table_weights)

        start_time = time.perf_counter()

        # Weighted Jaccard similarity between the input table and this table
        similarities[i] = weighted_jaccard_similarity(input_table_words, input_table_weights, table_words, table_weights)

        running_time += time.perf_counter() - start_time

    if k is None:
        k = numTables  # No k given: rank all tables in the data lake
    elif k > numTables:
        print("The introduced k (k = {}) is larger than the number of tables in the data lake. The output is the ranking of all tables in the data lake.".format(k))
        k = numTables  # We rank all tables in the data lake

    start_time = time.perf_counter()

    ranked_tables = np.argsort(similarities)[::-1][:k]  # indexes of the top-k tables sorted by similarity
    # Read the scores through the same permutation so index i always pairs with score i.
    ranked_similarites = similarities[ranked_tables]

    running_time += time.perf_counter() - start_time

    return ranked_tables, ranked_similarites, running_time

# Keyword extraction + Word embeddings

## Word2Vec

In [8]:
def embeddings_word2vec(keywords, weights, model):
    """Look up the embedding of every keyword known to the model.

    Args:
        keywords: Iterable of keywords.
        weights: Iterable of weights, aligned with ``keywords``.
        model: Object supporting ``word in model`` and ``model[word]``
            (presumably gensim keyed vectors -- confirm against callers).

    Returns:
        A list of ``(embedding, weight)`` tuples, one per keyword that the
        model contains. Keywords without an embedding are silently dropped
        together with their weight.
    """
    return [
        (model[keyword], weight)
        for keyword, weight in zip(keywords, weights)
        if keyword in model
    ]

## BERT

In [9]:
def embeddings_bert(keywords, model, tokenizer):
    """Compute one embedding per keyword with a BERT model.

    Args:
        keywords: Iterable of keyword strings.
        model: Callable taking ``(token_ids, segment_ids)`` tensors and
            returning a pair whose first element is indexable per layer
            (presumably a ``pytorch_pretrained_bert`` ``BertModel`` -- confirm).
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        A list with one NumPy vector per keyword: the mean over the token
        vectors taken from layer index 11 of the model output.
    """
    vectors = []

    for keyword in keywords:
        # Wrap the keyword in the special tokens and split it into word pieces.
        pieces = tokenizer.tokenize("[CLS] " + keyword + " [SEP]")
        piece_ids = tokenizer.convert_tokens_to_ids(pieces)

        # Every token belongs to the same (single) segment, marked as "1".
        ids_tensor = torch.tensor([piece_ids])
        segment_tensor = torch.tensor([[1] * len(pieces)])

        # Forward pass only: no gradients are needed.
        with torch.no_grad():
            layers, _ = model(ids_tensor, segment_tensor)

        # Layer index 11, first (and only) item of the batch.
        # NOTE(review): presumably the last layer of a 12-layer model -- confirm.
        token_vectors = layers[11][0]

        # Average the token vectors to get a single vector for the keyword.
        vectors.append(token_vectors.mean(dim=0).numpy())

    return vectors

## Weighted Cosine Similarity

In [10]:
def weighted_centroid_cosine_similarity(embeddings1, weights1, embeddings2, weights2):
    """Cosine similarity between the weighted centroids of two embedding sets.

    Args:
        embeddings1: Embeddings of the first set, averaged along axis 0.
        weights1: One weight per embedding in ``embeddings1``.
        embeddings2: Embeddings of the second set, averaged along axis 0.
        weights2: One weight per embedding in ``embeddings2``.

    Returns:
        The cosine similarity between the two weighted centroids as a scalar.
    """
    # Weighted centroid of each set, reshaped to a single-row matrix as
    # expected by cosine_similarity.
    centroid_rows = [
        np.average(vectors, axis=0, weights=vector_weights).reshape(1, -1)
        for vectors, vector_weights in ((embeddings1, weights1), (embeddings2, weights2))
    ]

    similarity_matrix = cosine_similarity(centroid_rows[0], centroid_rows[1])
    return similarity_matrix[0, 0]


def weighted_pairwise_cosine_similarity(embeddings1, weights1, embeddings2, weights2, i=None, aggregation='mean'):
    """Aggregate the weighted pairwise cosine similarities of two embedding sets.

    Every pairwise cosine similarity is multiplied by the product of the
    weights of the two embeddings involved, and the resulting matrix is
    reduced to a single number.

    Args:
        embeddings1: Embeddings of the first set (one per row).
        weights1: One weight per embedding in ``embeddings1``.
        embeddings2: Embeddings of the second set (one per row).
        weights2: One weight per embedding in ``embeddings2``.
        i: Unused; kept so existing positional callers keep working.
        aggregation: ``'mean'``, ``'max'`` or ``'sum'``.

    Returns:
        The aggregated weighted similarity.

    Raises:
        ValueError: If ``aggregation`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError after the
            similarities had already been computed).
    """
    aggregators = {'mean': np.mean, 'max': np.max, 'sum': np.sum}
    try:
        aggregate = aggregators[aggregation]
    except (KeyError, TypeError):
        raise ValueError("The aggregation has to be either 'mean', 'max' or 'sum'. '{}' is not accepted".format(aggregation)) from None

    pairwise_similarities = cosine_similarity(embeddings1, embeddings2)
    # Entry (a, b) of the outer product is weights1[a] * weights2[b].
    weighted_pairwise_similarities = pairwise_similarities * np.outer(weights1, weights2)

    return aggregate(weighted_pairwise_similarities)

## Keyword Extraction (LDA/YAKE) + Word Embeddings (Word2Vec/BERT) + Weighted Cosine Similarity

In [11]:
def find_embeddings(text, methodKeywords, methodEmbeddings, model, tokenizer = None):
    """Extract the keywords of a table and turn them into embeddings.

    Args:
        text: Text of the table.
        methodKeywords: Keyword extraction method, forwarded to
            ``find_keywords``.
        methodEmbeddings: ``"word2vec"`` or ``"bert"``.
        model: Embedding model, forwarded to ``embeddings_word2vec`` or
            ``embeddings_bert``.
        tokenizer: Tokenizer forwarded to ``embeddings_bert``; not used for
            ``"word2vec"``.

    Returns:
        A tuple ``(embeddings, weights)`` of NumPy arrays. With
        ``"word2vec"`` only keywords that have an embedding are kept, so both
        arrays may be shorter than the keyword list (or empty). With
        ``"bert"`` every keyword keeps its weight.

    Raises:
        ValueError: If ``methodEmbeddings`` is neither ``"word2vec"`` nor
            ``"bert"``. Checked before any keyword extraction so an invalid
            call fails immediately.
    """
    if methodEmbeddings not in ("word2vec", "bert"):
        raise ValueError("The method to compute the embeddings has to be either 'word2vec' or 'bert'. '{}' is not accepted".format(methodEmbeddings))

    keywords_weights = find_keywords(text, methodKeywords)
    keywords = [t[0] for t in keywords_weights]
    weights = [t[1] for t in keywords_weights]

    if methodEmbeddings == "word2vec":
        # Returns (embedding, weight) pairs for the keywords known to the model
        embeddings_weights = embeddings_word2vec(keywords, weights, model)
        embeddings = np.array([pair[0] for pair in embeddings_weights])
        new_weights = np.array([pair[1] for pair in embeddings_weights])
    else:
        # "bert": every keyword gets an embedding, so the weights do not change
        embeddings = np.array(embeddings_bert(keywords, model, tokenizer))
        new_weights = np.array(weights)

    return embeddings, new_weights

In [12]:
# Function to perform table discovery based on the similarity of the embeddings of the keywords of the tables
def rank_embeddings(input_table_text, data_lake, keywordExtractionMethod, embeddingsMethod, model, tokenizer = None, k=10):
    """Rank the tables of a data lake by keyword-embedding similarity.

    Keywords of every text are embedded with ``find_embeddings`` and tables
    are compared with ``weighted_pairwise_cosine_similarity``.

    Args:
        input_table_text: Text of the query table.
        data_lake: Sized iterable with the text of each candidate table.
        keywordExtractionMethod: Forwarded to ``find_embeddings``.
        embeddingsMethod: Forwarded to ``find_embeddings``.
        model: Embedding model, forwarded to ``find_embeddings``.
        tokenizer: Tokenizer, forwarded to ``find_embeddings``.
        k: Number of top-ranked tables to return; ``None`` or a value larger
            than the data lake ranks every table.

    Returns:
        A tuple ``(ranked_tables, ranked_similarities, running_time)``: the
        positions in ``data_lake`` of the top-k tables in descending order of
        similarity, their scores, and the measured time in seconds. The
        measured time covers the embedding of the input table, the similarity
        computations and the final ranking; embedding the data lake tables is
        deliberately excluded.
    """
    # perf_counter is monotonic, so the accumulated intervals cannot be
    # distorted by system clock adjustments.
    start_time = time.perf_counter()

    numTables = len(data_lake)  # number of tables in the data lake

    # Find embeddings of the keywords of the input table
    embeddings_input, weights_input = find_embeddings(input_table_text, keywordExtractionMethod, embeddingsMethod, model, tokenizer)

    running_time = time.perf_counter() - start_time

    similarities = np.zeros(numTables)

    # If no keyword of the input table has an embedding there is nothing to
    # compare against: every similarity stays at 0.0 instead of passing an
    # empty array to the cosine similarity.
    if np.size(embeddings_input) > 0:
        for i, table_text in enumerate(data_lake):
            # Find embeddings of the keywords of the table of the data lake (not timed)
            embeddings_table, weights_table = find_embeddings(table_text, keywordExtractionMethod, embeddingsMethod, model, tokenizer)

            start_time = time.perf_counter()

            # Tables without any embedding keep the default similarity of 0.0
            if np.size(embeddings_table) > 0:
                similarities[i] = weighted_pairwise_cosine_similarity(embeddings_input, weights_input, embeddings_table, weights_table, i)

            running_time += time.perf_counter() - start_time

    start_time = time.perf_counter()

    if k is None:
        # No k given (k is the number of most similar tables to output)
        k = numTables  # We rank all tables in the data lake
    elif k > numTables:
        print("The introduced k (k = {}) is larger than the number of tables in the data lake. Thus, the output is the ranking of all tables in the data lake.".format(k))
        k = numTables  # We rank all tables in the data lake

    # Indexes of the top-k tables sorted by similarity; scores are read
    # through the same permutation so index i always pairs with score i.
    ranked_tables_filtered = np.argsort(similarities)[::-1][:k]
    ranked_tables_filtered_similarities = similarities[ranked_tables_filtered]

    running_time += time.perf_counter() - start_time

    return ranked_tables_filtered, ranked_tables_filtered_similarities, running_time