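This notebook reduces each document's candidate taxonomy labels to the five best-fitting ones: for every label column that holds more than five labels for a document, ChatGPT is asked to select the top five. The selected labels are stored in new `<column>_5labels` columns of the DataFrame.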

In [0]:
%run ./utils

In [0]:
%run ./config

In [0]:
import pandas as pd
import numpy as np
import openai
import json
from concurrent.futures import ThreadPoolExecutor, as_completed  
import re  
import pickle
import math
from collections import Counter
import random  
import json  
import time

In [0]:
# Configuration object for this environment (presumably defined by the `%run ./config` cell).
conf = DataBricksDevConfig

# Storage locations and folder name, read from the Data section of the config.
path_results, path_taxonomy, folder_name = (
    conf.Data.path_results,
    conf.Data.path_taxonomy,
    conf.Data.folder_name,
)

# Abbreviation -> full-name mapping; `clean_label` uses it to expand label prefixes.
dic_abbreviation2full = conf.taxonomy_info.dic_abbreviation2full

In [0]:
# Load the DataFrame pickled under `path_results`. If it cannot be read,
# report the problem and end the notebook run instead of continuing without data.
try:
    df_in = pd.read_pickle(path_results + '5-PrpposedApproach(ablation2).pckl')
except Exception as e:
    # Deliberately broad: any read failure should stop the notebook.
    print("Cluster could not read the file (1- wrong cluster 2- no new json file)",e)
    dbutils.notebook.exit(str(e))
else:
    print(len(df_in))
    # A bare `df_in.head(2)` inside a try body is not the cell's last expression
    # and is never rendered, so show the preview explicitly.
    display(df_in.head(2))

## Read Taxonomy from generated Dataframe

In [0]:

# Load the taxonomy dictionary: keyed by label name, each entry holding at least an 'id'.
# NOTE: pickle.load can execute arbitrary code from the file; only load trusted files.
with open(path_taxonomy + 'dic_taxonomy_embeddings.pickle', 'rb') as file:
    dic_taxonomy = pickle.load(file)

# Reverse lookup: taxonomy id -> label name.
dic_taxonomy_id2label = {entry['id']: name for name, entry in dic_taxonomy.items()}

# Every label name, in the dictionary's iteration order.
labels_all = list(dic_taxonomy)


In [0]:

def clean_label(label, abbreviations=None):
    """Expand an abbreviated prefix of a taxonomy label into its full name.

    Only the leading abbreviation is expanded. A plain `str.replace` would also
    rewrite every later occurrence of the same text inside the label, which
    corrupts names that merely contain the abbreviation further on.

    Args:
        label: the raw label name.
        abbreviations: optional mapping of abbreviation -> expansion. Defaults
            to the notebook-level `dic_abbreviation2full`.

    Returns:
        The label with its leading abbreviation expanded. As before, the
        mapping is scanned in order and each entry is checked against the
        label as rewritten so far.
    """
    if abbreviations is None:
        abbreviations = dic_abbreviation2full
    cleaned_label = label
    for abbreviation, expansion in abbreviations.items():
        # An empty abbreviation would match every label; skip it.
        if abbreviation and cleaned_label.startswith(abbreviation):
            cleaned_label = expansion + cleaned_label[len(abbreviation):]
    return cleaned_label


# Store the expanded (abbreviation-free) name on every taxonomy entry.
for label_name, label_info in dic_taxonomy.items():
    label_info['cleaned_name'] = clean_label(label_name)



# required functions

In [0]:
# System message sent with every request: describes the task (choose the 5
# best-fitting labels for a document) and the expected answer (the label IDs).
# Fixed the garbled "ifThe" in the fit criterion.
prompt_system ="""You are an AI trained to evaluate the relevance of multiple labels for a given SSRN pre-print document, and select top 5 labels that best fit the document. For this task, you will receive the document's title, keywords, abstract, and a list of labels. Each label in the list has an ID, name, description. Your task is to determine which labels are the best fit for the document. A label fits well if the document's main focus aligns with the area the label describes.  
Please return the IDs of the top 5 labels that best fit the given document."""



def F_prompt (document, labels) :
    """Build the user message describing one document and its candidate labels.

    Args:
        document: mapping-like object (e.g. a DataFrame row) providing
            'title', 'abstract' and 'keywords'.
        labels: iterable of label names; each must be a key of `dic_taxonomy`.

    Returns:
        str: the prompt text, i.e. the document fields followed by one
        ID/name/description block per label.
    """
    label_blocks = []
    for label in labels:
        entry = dic_taxonomy[label]
        # Use the 'description_cleaned' text when present and non-empty;
        # otherwise (missing key, None or '') fall back to 'description_generated'.
        description = entry.get('description_cleaned') or entry['description_generated']
        description = description.replace('\r',' ').replace('\n\n','  ')

        # The whitespace below is part of the prompt text; keep it unchanged.
        label_blocks.append(
            "\n"
            f"        ID= {entry['id']}\n"
            f"        name= '{clean_label(label)}'\n"
            f"        description= '{description}'\n"
            "        "
        )
    labels_txt = ''.join(label_blocks)

    prompt = f"""given document:
    'title'= '{document['title']}',
    'abstract'= '{document['abstract']}'
    'keywords'= '{document['keywords']}'

    Labels:{labels_txt}
    """
    return prompt


In [0]:

# Schema handed to `generate_openai_response` (presumably an OpenAI
# function-calling definition): a single function `check_relevancy` whose one
# required argument, `best_labels`, is an array of integer label IDs.
json_format = [{
    "name": "check_relevancy",
    "description": "check relevancy",
    "parameters": {
        "type": "object",
        "properties": {
            "best_labels": {
                "type": "array",
                "description": "list of IDs of top 5 best labels",
                "items": {"type": "integer"},
            },
        },
        "required": ["best_labels"],
    },
}]

def F_prompt_GPT(document, labels):
    """Ask the model to pick the best labels for one document.

    Builds the user message with `F_prompt`, then sends it with the system
    prompt and the `json_format` schema through `generate_openai_response`.

    Returns:
        tuple: (value returned by `generate_openai_response`, system message,
        user message).
    """
    system_text = prompt_system
    user_text = F_prompt(document, labels)
    reply = generate_openai_response(user_text, system_text, json_format, max_tokens=2000)
    return reply, system_text, user_text

## Ask GPT to select the top 5 labels (one sequential call per document and column)

In [0]:
# Label columns (one per approach) whose candidate lists are reduced to 5 labels.
columns_to_check = [
    'labels_hierarchical_approach',
    'labels_ShortenTaxonomy_approach',
    'labels_Proposed_approach',
    'labels_Proposed_approach_ablation1',
    'labels_Proposed_approach_ablation2',
]

# Create one '<column>_5labels' output column per approach; every cell starts
# as its own empty list.
for col in columns_to_check :
    df_in[f'{col}_5labels'] = [list() for _ in range(len(df_in))]


In [0]:


# First pass: for every document and approach column, store the label list in
# '<column>_5labels'. Lists with more than 5 labels are first reduced by asking
# GPT for the best ones. Calls are sequential: one request per (document, column).
for i_doc,doc_tmp in df_in.iloc[:].iterrows():
    print(i_doc, end = ', ')
    for col in columns_to_check:
        print(col, end = ' ')
        selected_labels = doc_tmp[col]
        if len(selected_labels) > 5 :
            print(len(selected_labels) , end= ' -->')
            GPT_answer_labels, message_system_ ,message_user_  = F_prompt_GPT(doc_tmp,selected_labels)
            # Assumes the response is dict-like with a 'best_labels' list - TODO confirm in utils.
            selected_label_IDs = GPT_answer_labels['best_labels']
            if len(selected_label_IDs)!=5:
                print(len(selected_label_IDs), 'WHAT?')
            # Map the returned IDs back to label names. Keep only IDs that belong
            # to this document's candidates, and drop repeats. An unknown ID used
            # to raise KeyError and abort the whole run.
            reduced_labels = []
            for label_id in selected_label_IDs:
                label_name = dic_taxonomy_id2label.get(label_id)
                if label_name in selected_labels and label_name not in reduced_labels:
                    reduced_labels.append(label_name)
            ignored = len(selected_label_IDs) - len(reduced_labels)
            if ignored:
                print(ignored, 'returned IDs ignored (unknown, not a candidate, or duplicate)', end=' ')
            # If nothing usable came back, keep the unreduced list so the retry
            # cell below picks this document up again.
            if reduced_labels:
                selected_labels = reduced_labels
        print()
        df_in.at[i_doc,col+'_5labels'] = selected_labels
    print('')

# Retry the cases that GPT did not resolve (lists that still contain more than 5 labels)

Dev-GPT4o

In [0]:

# Retry pass: re-query GPT for every '<column>_5labels' list that still holds
# more than 5 labels. Lists with 5 or fewer labels are written back unchanged.
# Calls are sequential: one request per (document, column).
for i_doc,doc_tmp in df_in.iloc[:].iterrows():
    print(i_doc, end = ', ')
    for col in columns_to_check:
        print(col, end = ' ')
        selected_labels = doc_tmp[col+'_5labels']
        if len(selected_labels) > 5 :
            print(len(selected_labels) , end= ' -->')
            GPT_answer_labels, message_system_ ,message_user_  = F_prompt_GPT(doc_tmp,selected_labels)
            # Assumes the response is dict-like with a 'best_labels' list - TODO confirm in utils.
            selected_label_IDs = GPT_answer_labels['best_labels']
            if len(selected_label_IDs)!=5:
                print(len(selected_label_IDs), 'WHAT?')
            # Map the returned IDs back to label names. Keep only IDs that belong
            # to this document's candidates, and drop repeats. An unknown ID used
            # to raise KeyError and abort the whole run.
            reduced_labels = []
            for label_id in selected_label_IDs:
                label_name = dic_taxonomy_id2label.get(label_id)
                if label_name in selected_labels and label_name not in reduced_labels:
                    reduced_labels.append(label_name)
            ignored = len(selected_label_IDs) - len(reduced_labels)
            if ignored:
                print(ignored, 'returned IDs ignored (unknown, not a candidate, or duplicate)', end=' ')
            # If nothing usable came back, leave the list as it was; it stays
            # longer than 5 and can be retried by re-running this cell.
            if reduced_labels:
                selected_labels = reduced_labels
        print()
        df_in.at[i_doc,col+'_5labels'] = selected_labels
    print('')

# Store


In [0]:


# Persist the DataFrame, including the '<column>_5labels' columns, as a pickle under `path_results`.
df_in.to_pickle(path_results + '6-SelectionReduction.pckl')


# TEST

In [0]:
# Show the DataFrame's column names; the '<column>_5labels' columns created earlier should be listed.
print(df_in.columns)

In [0]:
import pandas as pd  
import matplotlib.pyplot as plt  


# Columns to compare: the original candidate lists and their reduced '_5labels' counterparts.
columns =  ['labels_hierarchical_approach', 'labels_ShortenTaxonomy_approach','labels_Proposed_approach',
       'labels_Proposed_approach_ablation1',
       'labels_Proposed_approach_ablation2',
       'labels_hierarchical_approach_5labels', 'labels_ShortenTaxonomy_approach_5labels','labels_Proposed_approach_5labels',
       'labels_Proposed_approach_ablation1_5labels',
       'labels_Proposed_approach_ablation2_5labels']


# Number of labels in every cell. Mapping each column with Series.map replaces
# DataFrame.applymap, which is deprecated in recent pandas releases.
count_df = df_in[columns].apply(lambda label_lists: label_lists.map(len))

# One histogram per column, one bar per label count.
for column in columns:
    fig, ax = plt.subplots()
    # Bins start at 0 so documents with an empty label list are counted too;
    # starting at 1 silently dropped them from the plot.
    ax.hist(count_df[column], bins=range(0, int(count_df[column].max()) + 2), align='left', rwidth=0.8)
    ax.set_title(f'Histogram of Number of Items in {column}')
    ax.set_xlabel('Number of Items')
    ax.set_ylabel('Frequency')
    ax.grid(axis='y', alpha=0.75)
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)
