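This notebook assigns taxonomy labels to each document by walking the taxonomy top-down: starting from the root labels, it asks ChatGPT which of the candidate labels fit the document, then repeats the question for the children of the selected labels until only leaf labels remain. The selected leaf labels are stored in a new DataFrame column, `labels_hierarchical_approach`.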

In [0]:
%run ./utils

In [0]:
%run ./config

In [0]:
import pandas as pd
import numpy as np
import openai
import json
from concurrent.futures import ThreadPoolExecutor, as_completed  
import re  
import pickle
import math
import random  

In [0]:
# Bind the configuration values used by the rest of this notebook.
# NOTE(review): `DataBricksDevConfig` is not defined in this notebook; presumably it is
# provided by `%run ./config` — confirm.
conf = DataBricksDevConfig
# String prefix of the result pickles that this notebook reads and writes.
path_results = conf.Data.path_results
# String prefix of the taxonomy pickle loaded further down.
path_taxonomy = conf.Data.path_taxonomy
# Not referenced again in the cells of this notebook.
folder_name = conf.Data.folder_name
# Mapping of label abbreviation -> expansion, consumed by `clean_label`.
dic_abbreviation2full = conf.taxonomy_info.dic_abbreviation2full


In [0]:
# Load the input DataFrame and print its row count.
# NOTE(review): the file name suggests it is produced by an earlier "0_json_to_dataframe"
# step — confirm. Unpickling can execute arbitrary code, so the file must be trusted.
try:
    df_in = pd.read_pickle(path_results + '0_json_to_dataframe.pckl')
    print(len(df_in))
except Exception as e:
    # Any failure (reading the file or printing its length) is reported and then the
    # notebook run is ended, passing the error text to dbutils.notebook.exit.
    print("Cluster could not read the file (1- wrong cluster 2- no new json file)",e)
    dbutils.notebook.exit(str(e))  

## Read the taxonomy from the generated pickle file

In [0]:

def clean_label(label, abbreviations=None):
    """Expand a leading abbreviation in ``label`` to its full text.

    Args:
        label: Label text to clean.
        abbreviations: Optional mapping of abbreviation -> expansion. When omitted,
            the notebook-level ``dic_abbreviation2full`` is used.

    Returns:
        The label with a matching leading abbreviation replaced by its expansion.
        A label that starts with none of the abbreviations is returned unchanged.
    """
    if abbreviations is None:
        abbreviations = dic_abbreviation2full
    cleaned_label = label
    for abbreviation, expansion in abbreviations.items():
        # Skip empty keys: every string "starts with" '' and would be rewritten.
        if abbreviation and cleaned_label.startswith(abbreviation):
            # Expand the matched prefix only. str.replace() would also rewrite any
            # later occurrence of the abbreviation inside the label text.
            cleaned_label = expansion + cleaned_label[len(abbreviation):]
    return cleaned_label



In [0]:

# Load the taxonomy dictionary (label name -> record) from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted location.
with open(path_taxonomy + 'dic_taxonomy_embeddings.pickle', 'rb') as file:
    dic_taxonomy = pickle.load(file)

# 1) Reverse lookup: taxonomy id -> label name.
dic_taxonomy_id2label = {entry['id']: name for name, entry in dic_taxonomy.items()}

# 2) Every label name in the taxonomy.
labels_all = list(dic_taxonomy)

# 3) Root labels are the entries whose 'parent_label' is empty.
root_labels = [name for name, entry in dic_taxonomy.items() if len(entry['parent_label']) == 0]
print(root_labels)

# required functions

# Set up the ChatGPT Environment

In [0]:
# System message passed to generate_openai_response with every request (see F_prompt_GPT).
# It instructs the model to answer with a JSON object whose 'best_labels' list holds the IDs
# of the labels that best fit the document.
prompt_system = """You are an AI trained to evaluate the relevance of multiple labels for a given SSRN pre-print document. For this task, you will receive the document's title, keywords, abstract, and a list of labels. Each label in the list has an ID, a name and description. Your task is to determine which labels  are the best fit for the document. A label fits well if the document's main focus aligns with the area the label describes. Your output should be a concise JSON object containing a list, 'best_labels', which only includes the ID of labels that best fit the document.""" 

def F_prompt(document, labels):
    """Build the user message describing one document and its candidate labels.

    Args:
        document: Record indexed by the 'title', 'abstract' and 'keywords' keys.
        labels: Iterable of label names; each one must be a key of the
            notebook-level ``dic_taxonomy``.

    Returns:
        The prompt text: the three document fields followed by one
        ID / name / description entry per label.
    """
    label_sections = []
    for label in labels:
        entry = dic_taxonomy[label]
        label_id = entry['id']
        # Use the cleaned description when it is non-empty, otherwise the generated one.
        description = (
            entry['description_cleaned']
            if len(entry['description_cleaned'])>0
            else entry['description_generated']
        )
        label_sections.append(f"""
        ID= '{label_id}'
        name= '{clean_label(label)}'
        description= '{description}'
        """)
    labels_txt = ''.join(label_sections)
    prompt = f"""given document:
    'title'= '{document['title']}',
    'abstract'= '{document['abstract']}'
    'keywords'= '{document['keywords']}'

    Labels:{labels_txt}
    """
    return prompt


In [0]:

# Function schema handed to generate_openai_response (see F_prompt_GPT) so that the
# model's answer is a JSON object with a single required field, 'best_labels'.
# NOTE(review): items are declared as integers, which assumes the taxonomy 'id' values
# are integers — confirm against the taxonomy data.
json_format = [
    {
        "name": "check_relevancy",
        "description": "check relevancy",
        "parameters": {
            "type": "object",
            "properties": {
                "best_labels": {
                    "type": "array",
                    # Typo fixed ("baest" -> "best"): this text is part of what the model reads.
                    "description": "list of IDs of the best labels",
                    "items": {
                        "type": "integer"
                    }
                }
            },
            "required": ["best_labels"]
        }
    }
]

def F_prompt_GPT(document,labels):
    """Query the model about ``labels`` for ``document``.

    Builds the user message with ``F_prompt`` and sends it, together with
    ``prompt_system`` and ``json_format``, through ``generate_openai_response``
    with ``max_tokens=2000``.

    Returns:
        A ``(response, user_prompt)`` tuple: whatever ``generate_openai_response``
        returned, and the user message that was sent.
    """
    user_prompt = F_prompt(document, labels)
    response = generate_openai_response(user_prompt, prompt_system, json_format, max_tokens=2000)
    return response, user_prompt

# Ask GPT for each document (sequential, top-down through the taxonomy)

In [0]:
# Create the output column with a separate empty list for every row, so that no two
# rows share the same list object.
df_in['labels_hierarchical_approach'] = [list() for _ in df_in.index]

In [0]:

# Upper bound on the number of candidate labels sent to the model in one request.
MAX_LABELS_PER_PROMPT = 100


def _ask_gpt_for_labels(document, candidate_labels):
    """Ask the model which of ``candidate_labels`` fit ``document``, retrying once on failure.

    Returns:
        A ``(label_names, prompt)`` tuple: the names of the labels whose IDs the model
        returned, and the prompt that was sent. IDs that are not present in
        ``dic_taxonomy_id2label`` are reported and skipped instead of raising KeyError.
    """
    try:
        response, prompt = F_prompt_GPT(document, candidate_labels)
    except Exception as err:
        # `except Exception` instead of a bare `except` so that a kernel interrupt
        # (KeyboardInterrupt) still stops the run. A second failure propagates.
        print(f'GPT request failed ({err!r}); retrying once')
        response, prompt = F_prompt_GPT(document, candidate_labels)

    label_names = []
    # Assumes the response is a dict carrying a 'best_labels' list of IDs — this depends on
    # generate_openai_response, which is defined outside this notebook.
    for label_id in response['best_labels']:
        label_name = dic_taxonomy_id2label.get(label_id)
        if label_name is None:
            print(f'label <{label_id}> does not exist in the taxonomy')
        else:
            label_names.append(label_name)
    return label_names, prompt


for i_row, doc_tmp in df_in.iterrows():
    print()
    # Per-document trace of the selected labels and prompts at each taxonomy level
    # (kept for inspection; not written to the DataFrame).
    history_labels = []
    history_prompts = []
    selected_labels_final = []
    # Start from the taxonomy roots; copy so the shared root list is never touched.
    selected_labels_intermediate = list(root_labels)

    while len(selected_labels_intermediate) > 0:
        n_candidates = len(selected_labels_intermediate)
        print(i_row, n_candidates)
        if n_candidates > MAX_LABELS_PER_PROMPT:
            n_parts = math.ceil(n_candidates / MAX_LABELS_PER_PROMPT)
            print(i_row, n_candidates, f"split into {n_parts} parts")

        # Query the model in chunks so that no single prompt exceeds the limit,
        # however many candidates this level has.
        GPT_selected_labels = []
        for start in range(0, n_candidates, MAX_LABELS_PER_PROMPT):
            chunk = selected_labels_intermediate[start:start + MAX_LABELS_PER_PROMPT]
            chunk_labels, prompt_ = _ask_gpt_for_labels(doc_tmp, chunk)
            GPT_selected_labels.extend(chunk_labels)
            history_prompts.append(prompt_)
        history_labels.append(GPT_selected_labels)

        # Leaves are final answers; labels with children are expanded on the next pass.
        # Every name here came from dic_taxonomy_id2label, so it is a key of dic_taxonomy.
        selected_labels_intermediate = []
        for l in GPT_selected_labels:
            children = dic_taxonomy[l]['children_labels']
            if len(children) == 0:
                selected_labels_final.append(l)
            else:
                selected_labels_intermediate.extend(children)

    df_in.at[i_row, 'labels_hierarchical_approach'] = selected_labels_final
    print('-->', len(selected_labels_final))



# Store


In [0]:
# Persist the DataFrame, including the new 'labels_hierarchical_approach' column.
df_in.to_pickle(path_results + '1_hierarchical_label_selection.pckl')