This notebook loads the input DataFrame of documents and the taxonomy (with its label embeddings), embeds each document from its title and keywords, and ranks the taxonomy's leaf labels by their similarity to each document. The ranked labels are added to the DataFrame as a new column.

In [0]:
%run ./config

In [0]:
import pandas as pd
import numpy as np
import pickle
import re
import torch
from sentence_transformers import SentenceTransformer, util
import pickle

In [0]:
# Pull the settings used by this notebook out of the shared configuration object.
# NOTE(review): DataBricksDevConfig is not defined in this notebook; presumably
# it is provided by the `%run ./config` cell above — confirm.
conf = DataBricksDevConfig
# Prefix of the input/output pickles read and written further down.
path_results = conf.Data.path_results
# Prefix of the taxonomy embeddings pickle.
path_taxonomy = conf.Data.path_taxonomy
# NOTE(review): the next two names are not referenced again in the visible cells.
folder_name = conf.Data.folder_name
dic_abbreviation2full = conf.taxonomy_info.dic_abbreviation2full

In [0]:

# Load the taxonomy dictionary. Later cells read, per label, its
# 'children_labels' entry and its 'embedding' entry.
# The file name is appended to path_taxonomy by plain concatenation
# (presumably path_taxonomy ends with a path separator — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point path_taxonomy at files produced by a trusted pipeline.
with open(path_taxonomy + 'dic_taxonomy_embeddings.pickle', 'rb') as file:
    dic_taxonomy = pickle.load(file)

# Required Functions

In [0]:

def Make_list_of_topics(dic_taxonomy):
    """Return the leaf labels of the taxonomy.

    Parameters
    ----------
    dic_taxonomy : dict
        Maps each label to a mapping that has a ``'children_labels'`` entry.

    Returns
    -------
    list
        The labels whose ``'children_labels'`` entry is empty, in the
        iteration order of ``dic_taxonomy``.
    """
    leaf_labels = []
    for label in list(dic_taxonomy.keys()):
        children = dic_taxonomy[label]['children_labels']
        # A label with no children is a leaf of the taxonomy.
        if len(children) == 0:
            leaf_labels.append(label)
    return leaf_labels

In [0]:
def calculate_articles_embeddings(df_in, model):
    """Embed every article of ``df_in`` with ``model``.

    Each row is turned into the text ``"<title> [SEP] <keywords>"``. Both
    fields are passed through ``str`` first, so a missing value contributes
    its string representation instead of raising.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain the columns ``'title'`` and ``'keywords'``.
    model : object
        Encoder exposing ``encode(list_of_str, convert_to_tensor=True)`` whose
        result has a ``tolist()`` method (e.g. a ``SentenceTransformer``).

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df_in``, in the same order but with a fresh
        default index, holding ``'article_content'`` (the text that was
        encoded) and ``'article_embedding'`` (the embedding as a plain list).
    """
    # Read the two columns directly; iterating with DataFrame.iterrows() would
    # build a full Series per row just to read two fields from it.
    all_articles_content_list = [
        str(title) + ' [SEP] ' + str(keywords)
        for title, keywords in zip(df_in['title'], df_in['keywords'])
    ]

    # Create embeddings. No device is selected here: the unused local that
    # used to be computed had no effect on where the model runs.
    embeddings_list = model.encode(
        all_articles_content_list, convert_to_tensor=True
    ).tolist()

    df_articles_embeddings = pd.DataFrame(all_articles_content_list, columns=["article_content"])
    df_articles_embeddings["article_embedding"] = embeddings_list
    return df_articles_embeddings

In [0]:
def calculate_similarity_matrixes(df_articles_embeddings, df_labels_embeddings):
    """Compute the dot product between every label and every article embedding.

    Parameters
    ----------
    df_articles_embeddings : pandas.DataFrame
        Has an ``'article_embedding'`` column with one vector per article.
    df_labels_embeddings : pandas.DataFrame
        Has a ``'label_embedding'`` column with one vector per label.

    Returns
    -------
    numpy.ndarray
        Matrix whose entry ``[i, j]`` is the dot product of label ``i`` and
        article ``j`` (labels along the rows, articles along the columns).
    """
    # Stack the per-row vectors into 2-D arrays: one row per label / article.
    label_matrix = np.array(list(df_labels_embeddings["label_embedding"]))
    article_matrix = np.array(list(df_articles_embeddings['article_embedding']))
    return label_matrix.dot(article_matrix.T)

In [0]:
def retrieve_top_labels(dot_products, df_labels_embeddings, k):
    """Return the ``k`` labels with the highest scores.

    Parameters
    ----------
    dot_products : iterable
        One score per label; the position of a score is the row position of
        its label in ``df_labels_embeddings``.
    df_labels_embeddings : pandas.DataFrame
        Has a ``'label'`` column.
    k : int
        Number of labels to return.

    Returns
    -------
    list or None
        Labels ordered from highest to lowest score (ties keep their original
        order). On any error a message is printed and ``None`` is returned.
    """
    try:
        # Pair each score with its position and order the pairs by score,
        # best first; sorted() is stable, so equal scores stay in input order.
        ranked = sorted(enumerate(dot_products), key=lambda pair: pair[1], reverse=True)
        best_positions = [position for position, _ in ranked]
        return [df_labels_embeddings.iloc[position]['label'] for position in best_positions[0:k]]
    except Exception as e:
        print(' FAILED (F_retriev_top_labels) : '+str(e))

In [0]:
def retrieve_topics_from_hits(df_in, taxonomy_all_topics_list, hits):
    """Translate search hits into taxonomy labels, one list per row of ``df_in``.

    Parameters
    ----------
    df_in : pandas.DataFrame
        The frame whose rows were used as queries. Only its number of rows is
        used; its index labels are ignored.
    taxonomy_all_topics_list : sequence
        Labels in the order of the searched corpus, so that
        ``hit['corpus_id']`` indexes into it.
    hits : sequence
        ``hits[i]`` holds the hits of the i-th row (by position) of ``df_in``;
        every hit is a mapping with a ``'corpus_id'`` key.

    Returns
    -------
    list of list
        For each row of ``df_in``, the labels of its hits in hit order.
    """
    retrieved_topics_all = []
    # Iterate over the rows of the frame that was passed in (not a notebook
    # global) and address `hits` by position: `hits` is a plain positional
    # sequence, so index labels of a filtered or re-indexed frame must not be
    # used to look entries up in it.
    for position in range(len(df_in)):
        hits_this_row = hits[position]
        retrieved_topics_all.append(
            [taxonomy_all_topics_list[hit['corpus_id']] for hit in hits_this_row]
        )
    return retrieved_topics_all

## Load taxonomy

In [0]:

# Load Taxonomy: collect the leaf labels (labels without children).
taxonomy_all_topics_list_original = Make_list_of_topics(dic_taxonomy)

# The cells below read `taxonomy_all_topics_list`, which no cell of this
# notebook assigned, so a fresh "Restart & Run All" stopped here with a
# NameError. Define it as a copy of the leaf labels.
# NOTE(review): if the labels were meant to be filtered or cleaned before
# ranking, apply that step here instead of the plain copy.
taxonomy_all_topics_list = list(taxonomy_all_topics_list_original)

# Every label is ranked for every document, so k is the number of labels.
k = len(taxonomy_all_topics_list)

In [0]:
# One row per taxonomy label, in the order of taxonomy_all_topics_list.
df_labels_embeddings = pd.DataFrame(taxonomy_all_topics_list, columns=["label"])
# Look up each label's stored embedding, keeping the same order as the rows.
embeddings_list = [dic_taxonomy[label]['embedding'] for label in taxonomy_all_topics_list]
df_labels_embeddings["label_embedding"] = embeddings_list

## Load input dataframe

In [0]:
# Read the output of the previous pipeline step. Only the read itself is
# guarded: if it fails, report why and stop the notebook run.
try:
    df_sample = pd.read_pickle(path_results + '1_hierarchical_label_selection.pckl')
except Exception as e:
    print("Cluster could not read the file (1- wrong cluster 2- no new json file)",e)
    dbutils.notebook.exit(str(e))

print(len(df_sample))
# Kept as the last expression of the cell so the preview is actually rendered;
# inside the try block its result was silently discarded.
df_sample.head(2)


In [0]:
import re  
  
def remove_non_english_characters(input_string):
    """Strip non-ASCII characters from ``input_string``.

    Every run of characters outside the ASCII range (``\\x00``-``\\x7F``) is
    replaced by a single space, runs of spaces are collapsed to one space, and
    leading/trailing whitespace is removed.

    Parameters
    ----------
    input_string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+',' ', input_string)
    return re.sub(' +', ' ', ascii_only).strip()

    
# Report abstracts that shrink by more than 100 characters once non-ASCII
# text is stripped. Printed per hit: index label, original length, cleaned
# length, and the difference.
for i_row, abstract in df_sample['abstract'].items():
    if not isinstance(abstract, str):
        # A missing abstract (e.g. NaN / None) has no length to compare.
        continue
    # Clean once and reuse the result instead of re-running the regexes for
    # every value that is compared or printed.
    cleaned_length = len(remove_non_english_characters(abstract))
    removed_count = len(abstract) - cleaned_length
    if removed_count > 100:
        print(i_row , '\t-',  len(abstract) ,cleaned_length , removed_count)

In [0]:
# Checkpoint of the bi-encoder that is used below to embed the articles.
model_name = 'sentence-transformers/all-mpnet-base-v2'
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Initialize the Sentence Transformers model on the selected device.
model = SentenceTransformer(model_name, device=device)
# NOTE(review): the device is already passed to the constructor above, so this
# call looks redundant — confirm. As the last expression of the cell, its
# return value is also what the notebook renders as the cell output.
model.to(device)


In [0]:

# Embed each article (title + keywords) with the bi-encoder.
df_articles_embeddings = calculate_articles_embeddings(df_sample, model)

# Build the tensors from plain nested lists. Handing a pandas Series of lists
# straight to torch.tensor makes torch walk it as a generic sequence through
# label-based `series[i]` lookups, which only works while the index happens
# to be 0..n-1.
article_embeddings_tensor = torch.tensor(df_articles_embeddings['article_embedding'].tolist())
label_embeddings_tensor = torch.tensor(df_labels_embeddings['label_embedding'].tolist())

# For every article, rank all k taxonomy labels by their similarity score.
hits = util.semantic_search(article_embeddings_tensor,
                            label_embeddings_tensor,
                            top_k=k)

# Map each hit's corpus position back to its taxonomy label.
retrieved_list = retrieve_topics_from_hits(df_sample, taxonomy_all_topics_list, hits)

# One ranked label list per document, stored next to the input columns.
df_sample['labels_BiEncoder(just label)'] = retrieved_list

# Store results

In [0]:
# Write the input DataFrame, now carrying the 'labels_BiEncoder(just label)'
# column, to the results location as a pickle.
df_sample.to_pickle(path_results + '2_embedding_similarity_results_0.pckl')