This notebook re-ranks the top taxonomy labels of each document: ChatGPT is asked to score how well each candidate label (together with its ancestors in the taxonomy) fits the document, and the re-ranked label lists are stored as new columns in the DataFrame.

In [0]:
%run ./utils

In [0]:
%run ./config

In [0]:
import pandas as pd
import numpy as np
import openai
import json
from concurrent.futures import ThreadPoolExecutor, as_completed  
import re  
import pickle
import math
from collections import Counter
import random  

In [0]:
# Select the configuration object. DataBricksDevConfig is not defined in this notebook
# (presumably it comes from `%run ./config` — TODO confirm).
conf = DataBricksDevConfig
# path_results: prefix for the result pickles read and written by this notebook.
path_results = conf.Data.path_results
# path_taxonomy: prefix for the pickled taxonomy dictionary.
path_taxonomy = conf.Data.path_taxonomy
# NOTE(review): folder_name is not referenced by any cell visible in this notebook.
folder_name = conf.Data.folder_name
# Mapping consumed by clean_label(): label abbreviation -> replacement text.
dic_abbreviation2full = conf.taxonomy_info.dic_abbreviation2full

In [0]:
# Load the DataFrame pickled by the previous step (embedding-similarity results).
# On any failure the notebook is ended through dbutils.notebook.exit with the error
# text instead of raising (presumably so a calling job receives the message — confirm).
try:
    df_in = pd.read_pickle(path_results + '2_embedding_similarity_results.pckl')
except Exception as e:
    dbutils.notebook.exit(str(e))  

## Read Taxonomy from generated Dataframe

In [0]:

def clean_label(label):
    """Expand a leading abbreviation of a taxonomy label.

    Every entry of ``dic_abbreviation2full`` is tried in dictionary order; when
    the (possibly already expanded) label starts with the abbreviation, that
    prefix is swapped for the mapped text.

    Args:
        label: Label text to clean.

    Returns:
        The label with its leading abbreviation(s) expanded; labels that start
        with no known abbreviation are returned unchanged.
    """
    cleaned_label = label
    for abbreviation, expansion in dic_abbreviation2full.items():
        # An empty key would match every label, so it is ignored.
        if abbreviation and cleaned_label.startswith(abbreviation):
            # Swap only the matched prefix: str.replace() without a count would
            # also rewrite later occurrences of the abbreviation inside the label.
            cleaned_label = expansion + cleaned_label[len(abbreviation):]
    return cleaned_label

In [0]:

# Load the pickled taxonomy dictionary (keyed by label name).
# pickle can run arbitrary code while loading, so only pipeline-generated files belong here.
with open(path_taxonomy + 'dic_taxonomy_embeddings.pickle', 'rb') as file:
    dic_taxonomy = pickle.load(file)

# Reverse lookup: taxonomy id -> label name.
dic_taxonomy_id2label = {info['id']: name for name, info in dic_taxonomy.items()}
labels_all = list(dic_taxonomy)
# Root labels are the entries whose parent label is empty.
root_labels = [name for name, info in dic_taxonomy.items() if len(info['parent_label']) == 0]
print(root_labels)

# required functions

# Set up the ChatGPT Environment

In [0]:

# System-prompt template. It has three `{}` placeholders; F_prompt_GPT fills all of
# them with the number of labels sent in the request.
# NOTE(review): the text asks for a JSON object named "scores" holding tuples, while
# the function schema in json_format names the array "labels_scores" and uses objects
# with label_id / relevancy_score — confirm which wording the model should follow.
prompt_system_template = """You are an AI assistant helping me to find the conceptual similarity scores between an SSRN article and a list of {} labels.   
  
Please ensure the following:  
  
- Return a score for each label. 
- ensure there are {} scores in total
- Ensure the scores are varied and accurately represent the level of similarity, rather than scoring a large percentage of labels the same.  
- Consider the main theme of the article and the specific context in which keywords are used.  
- Do not assign high similarity scores to labels that are only tangentially related or share a few keywords with the article. The focus should be on the overall subject matter of the article.  
- Scores should have two decimal points for greater precision.  
  
The output should be a JSON object named "scores" that contains a list of {} tuples. Each tuple should contain a label ID and a relevancy score between 0.01 and 1.00, indicating the level of relevancy between the label and the given document.  
"""

def F_prompt(document, labels):
    """Build the user message that lists a document and its candidate labels.

    Args:
        document: Mapping with the keys 'title', 'abstract' and 'keywords'.
        labels: Label names; each must be a key of ``dic_taxonomy``.

    Returns:
        The prompt text: the document fields, one section per label (id,
        cleaned name, description) and the list of label ids to be scored.
    """
    label_ids = []
    label_sections = []
    for label in labels:
        entry = dic_taxonomy[label]
        label_id = entry['id']

        # Use the cleaned description when there is one, else the generated one.
        description = entry['description_cleaned']
        if len(description) == 0:
            description = entry['description_generated']
        # Anything shorter than two characters counts as missing.
        if len(description) <2:
            description = "not available"
        description = description.replace('\r',' ').replace('\n\n','  ')

        label_ids.append(label_id)
        label_sections.append(f"""
        ID= {label_id}
        name= '{clean_label(label)}'
        description= '{description}'
        """)

    labels_txt = ''.join(label_sections)
    prompt = f"""given document:
    'title'= '{document['title']}',
    'abstract'= '{document['abstract']}'
    'keywords'= '{document['keywords']}'

    Labels:{labels_txt}


    
    Please provide a score for each label, {len(labels)} scores in total. Ensure you include all of these label IDs:
    {label_ids}
    """
    return prompt


In [0]:



# Function schema handed to generate_openai_response by F_prompt_GPT
# (presumably forwarded as the OpenAI function-calling definition — confirm in ./utils).
# It declares one function, "check_relevancy", whose required argument "labels_scores"
# is an array of objects, each with an integer "label_id" and a numeric
# "relevancy_score" bounded to 0.01 .. 1.00.
json_format =[  
    {  
        "name": "check_relevancy",  
        "description": "check relevancy",  
        "parameters": {  
            "type": "object",  
            "properties": {  
                "labels_scores": {  
                    "type": "array",  
                    "description": "list of tuples containing label ID and relevancy score",  
                    "items": {  
                        "type": "object",  
                        "properties": {  
                            "label_id": {  
                                "type": "integer",  
                                "description": "label ID"  
                            },  
                            "relevancy_score": {  
                                "type": "number",  
                                "description": "relevancy score",  
                                "minimum": 0.01,  
                                "maximum": 1.00  
                            }  
                        },  
                        "required": ["label_id", "relevancy_score"]  
                    }  
                }  
            },  
            "required": ["labels_scores"]  
        }  
    }  
]  



def F_prompt_GPT(document,labels):
    """Score `labels` against `document` with one model request.

    Args:
        document: Document record passed on to F_prompt.
        labels: Label names to be scored.

    Returns:
        A 3-tuple ``(response, user_message, system_message)`` where
        ``response`` is whatever generate_openai_response returns
        (that helper is defined outside this notebook cell).
    """
    user_prompt = F_prompt(document, labels)
    n_labels = len(labels)
    # All three placeholders of the template take the label count.
    system_prompt = prompt_system_template.format(n_labels, n_labels, n_labels)
    response = generate_openai_response(user_prompt, system_prompt, json_format, max_tokens=2000)
    return response, user_prompt, system_prompt

# Mathematical Functions

In [0]:
def F_just_leaf(numbers_original):
    """Return the first score of the chain, ignoring all later ones.

    The re-ranking loop builds each chain starting with the candidate label's
    own score, so the first element is the leaf score.
    """
    leaf_score = numbers_original[0]
    return leaf_score

def F_mean(numbers_original):
    """Return the arithmetic mean of every score in the chain."""
    average = np.mean(numbers_original)
    return average

def F_mean_leaf_parent(numbers_original):
    """Return the arithmetic mean of the first two scores of the chain.

    With a single-element chain the result is that element.
    """
    first_two = numbers_original[:2]
    return np.mean(first_two)

def F_harmonic_mean(numbers_original):
    """Return the harmonic mean of the scores.

    Args:
        numbers_original: Iterable of numbers (list, tuple, array, ...). It is
            not modified.

    Returns:
        ``n / sum(1 / x)`` over the scores, or 0 when the input is empty or
        contains a zero (the harmonic mean is undefined in both cases).
    """
    # Materialise once: works for any iterable and replaces the former
    # `.copy()` call, which failed for inputs such as tuples.
    numbers = list(numbers_original)
    n = len(numbers)
    # Empty input used to raise ZeroDivisionError (0 / 0).
    if n == 0 or any(x == 0 for x in numbers):
        return 0
    return n / sum(1 / x for x in numbers)

In [0]:


def calculate_entropy(data):
    """Return the Shannon entropy (base 2) of the value distribution in `data`.

    Each distinct value is weighted by its relative frequency; an empty
    input yields 0.
    """
    n_items = len(data)
    result = 0
    for occurrences in Counter(data).values():
        # Relative frequency of one distinct value.
        share = occurrences / n_items
        result = result - share * math.log2(share)
    return result


# Ask GPT to score the labels (documents are processed one at a time)

In [0]:
# Number of top bi-encoder labels per document that are sent for re-ranking.
n_initial = 40
F_functions = [F_just_leaf, F_mean, F_mean_leaf_parent, F_harmonic_mean]
n_rows = len(df_in)
for F_mathematical in F_functions:
    # One result column per aggregation function; the column suffix is the function
    # name without its leading "F_". Every row starts with its own empty dict.
    df_in[f'labels_RankRerank_approach_{F_mathematical.__name__[2:]}'] = [{} for _ in range(n_rows)]


In [0]:
# The aggregation functions are the same for every document, so list them once.
F_functions = [F_just_leaf, F_mean, F_mean_leaf_parent, F_harmonic_mean]
# Score assumed for a label the model did not score. 0.01 is the lowest value the
# response schema allows, so such labels sink in the ranking instead of raising a
# KeyError that would abort the whole run.
MISSING_SCORE = 0.01

for i_doc, doc_tmp in df_in.iterrows():
    print(i_doc, end = ' . ')
    # Initial candidates: the first `n_initial` labels of the bi-encoder ranking.
    leave_labels = doc_tmp['labels_BiEncoder'][0:n_initial]
    leave_labels_temp = list(leave_labels)
    # Extend the candidates with all their ancestors. Each pass appends the parents
    # that are still missing; passes repeat until one adds nothing.
    flag_bool = True
    while flag_bool:
        flag_bool = False
        print(len(leave_labels_temp), end = ' --> ')
        # range() is evaluated once, so labels appended during this pass are
        # visited in the next pass.
        for i_label in range(len(leave_labels_temp)):
            label_parent = dic_taxonomy[leave_labels_temp[i_label]]['parent_label']
            if label_parent != "" and label_parent not in leave_labels_temp:
                leave_labels_temp.append(label_parent)
                flag_bool = True

    # Ask GPT to score the candidates and their ancestors in a single request.
    GPT_scores, prompt_user_, system_message_ = F_prompt_GPT(doc_tmp, leave_labels_temp)

    try:
        scored_items = GPT_scores["labels_scores"]
    except (TypeError, KeyError):
        # No usable answer (e.g. None or a payload without "labels_scores").
        # Every label then gets MISSING_SCORE; because sorting is stable the
        # bi-encoder order is kept for this document.
        print(' GPT gave back no usable answer, keeping the bi-encoder order', end = ' >> ')
        scored_items = []

    print(f' GPT gave back  {len(scored_items)} answers' , end = ' >> ')
    dic_GPT_scores = {item['label_id']:item['relevancy_score'] for item in scored_items}

    # Report labels the model skipped so degraded rankings are visible in the log.
    n_missing = sum(1 for label in leave_labels_temp if dic_taxonomy[label]['id'] not in dic_GPT_scores)
    if n_missing:
        print(f'{n_missing} labels without a score (using {MISSING_SCORE})', end = ' >> ')
    print('entropy:', calculate_entropy(list(dic_GPT_scores.values())), end = ' >> ')
    print('Mathematical Functions',)

    # Score chain of every candidate: [own score, parent score, grandparent score, ...].
    # The chain does not depend on the aggregation function, so it is built once.
    dic_parental_scores = {}
    for label_leave in leave_labels:
        label_ID = dic_taxonomy[label_leave]['id']
        parental_scores = []
        while label_ID != "":
            parental_scores.append(dic_GPT_scores.get(label_ID, MISSING_SCORE))
            label_ID = dic_taxonomy[dic_taxonomy_id2label[label_ID]]['parent_id']
        dic_parental_scores[label_leave] = parental_scores

    # Mathematical functions: aggregate each chain and store the candidates sorted
    # by the aggregated score, best first.
    for F_mathematical in F_functions:
        dic_function = {label_leave: F_mathematical(scores) for label_leave, scores in dic_parental_scores.items()}
        dic_function_sorted = sorted(dic_function.items(), key=lambda item: item[1], reverse=True)
        df_in.at[i_doc,f'labels_RankRerank_approach_{F_mathematical.__name__[2:]}'] = [item[0] for item in dic_function_sorted]




# Store


In [0]:
df_in.to_pickle(path_results + '3-RankRerank.pckl')