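This notebook evaluates the top taxonomy labels for each document by querying ChatGPT to determine whether each label is a suitable fit. Labels that pass are then re-checked against their parent label, and the labels that survive both checks are stored in a new DataFrame column, `labels_Proposed_approach`.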

In [0]:
%run ./utils

In [0]:
%run ./config

In [0]:
import pandas as pd
import numpy as np
import openai
import json
from concurrent.futures import ThreadPoolExecutor, as_completed  
import re  
import pickle
import math
from collections import Counter
import random  
import json  
import time

In [0]:
# Active configuration object; `DataBricksDevConfig` is presumably defined by
# the `%run ./config` cell above -- confirm there.
conf = DataBricksDevConfig

# Locations and names read from the configuration's `Data` section.
data_settings = conf.Data
path_results = data_settings.path_results
path_taxonomy = data_settings.path_taxonomy
folder_name = data_settings.folder_name

# Abbreviation -> expansion mapping consumed by `clean_label`.
dic_abbreviation2full = conf.taxonomy_info.dic_abbreviation2full


In [0]:
# Load the input DataFrame; if it cannot be read, report the error and stop
# the notebook run through the Databricks `dbutils` helper.
try:
    shortened_taxonomy_path = path_results + '4-ShortenTaxonomy.pckl'
    df_in = pd.read_pickle(shortened_taxonomy_path)
except Exception as read_error:
    print("Cluster could not read the file (1- wrong cluster 2- no new json file)",read_error)
    dbutils.notebook.exit(str(read_error))

## Read the generated taxonomy dictionary

In [0]:

# Load the taxonomy dictionary, which is keyed by label name.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this path must only ever point at a trusted file.
taxonomy_pickle_path = path_taxonomy + 'dic_taxonomy_embeddings.pickle'
with open(taxonomy_pickle_path, 'rb') as taxonomy_file:
    dic_taxonomy = pickle.load(taxonomy_file)

# Reverse lookup (taxonomy id -> label name) and the flat list of label names.
dic_taxonomy_id2label = {entry['id']: label for label, entry in dic_taxonomy.items()}
labels_all = list(dic_taxonomy)

In [0]:

def clean_label(label, abbreviations=None):
    """Expand a leading abbreviation in a taxonomy label.

    Args:
        label: Label name to clean.
        abbreviations: Optional mapping of abbreviation -> expansion. When
            omitted, the notebook-level ``dic_abbreviation2full`` is used.

    Returns:
        The label with every matching *leading* abbreviation replaced by its
        expansion. Occurrences of the abbreviation later in the label are left
        untouched.
    """
    if abbreviations is None:
        abbreviations = dic_abbreviation2full

    cleaned_label = label
    for abbreviation, expansion in abbreviations.items():
        # An empty key would match every label via startswith(''), so skip it.
        if abbreviation and cleaned_label.startswith(abbreviation):
            # Rewrite only the prefix; str.replace would also rewrite any
            # later occurrence of the abbreviation inside the label.
            cleaned_label = expansion + cleaned_label[len(abbreviation):]
    return cleaned_label

# Store the expanded name on every taxonomy entry under 'cleaned_name'.
for label_name, label_info in dic_taxonomy.items():
    label_info['cleaned_name'] = clean_label(label_name)


# Required functions

# 1- Ask about the label itself

In [0]:
# System prompt for the label-level check. It asks for exactly the two keys
# declared in `json_format_label` ("main_focus" and "label_fit"); the earlier
# wording announced three keys and referred to a scoring range that is not
# part of the requested output.
prompt_system_template_label = (
    "You are an AI trained to evaluate the relevance of a label for a given SSRN pre-print document. "
    "You will receive the document's title, keywords, abstract, and the label's ID, name and description. "
    "Your task is to determine if the label is a good fit for the document. "
    "A label fits well if the document's main focus aligns with the area the label describes. "
    "Your output should be a concise JSON object. "
    "The JSON object should contain two keys: "
    "\"main_focus\", a very short representation of the document's main focus, "
    "and \"label_fit\", representing the fit as a boolean value. "
    "Please do not provide any further information or explanation in addition to the JSON object. "
    "Do not use the slash or backslash characters in your output."
)

def F_prompt_label(row):
    """Build the user message asking whether a label fits a document.

    Args:
        row: Record providing 'label', 'title', 'abstract' and 'keywords'.
            The label must be a key of the notebook-level ``dic_taxonomy``.

    Returns:
        Prompt text listing the document fields followed by the label's id,
        cleaned name and description.
    """
    label = row['label']
    taxonomy_entry = dic_taxonomy[label]
    label_ID = taxonomy_entry['id']
    cleaned_name = taxonomy_entry['cleaned_name']

    # Use 'description_cleaned' unless it is empty, then fall back to
    # 'description_generated'.
    description = taxonomy_entry['description_cleaned']
    if len(description) == 0:
        description = taxonomy_entry['description_generated']

    return f"""given document:
'title'= '{row['title']}',
'abstract'= '{row['abstract']}'
'keywords'= '{row['keywords']}'

Label:
ID= {label_ID}
name= '{cleaned_name}'
description= '{description}'
"""


In [0]:

# Output schema handed to `generate_openai_response` by `F_prompt_GPT_label`.
# Presumably an OpenAI function-calling definition -- confirm in ./utils.
json_format_label = [
    dict(
        name="check_relevancy",
        description="check relevancy",
        parameters=dict(
            type="object",
            properties=dict(
                main_focus=dict(
                    type="string",
                    description="main focus of the paper",
                ),
                label_fit=dict(
                    type="boolean",
                    description="if label is a good match for the paper",
                ),
            ),
            required=["main_focus", "label_fit"],
        ),
    )
]

def F_prompt_GPT_label(row):
    """Ask GPT whether the label in `row` fits the document in `row`.

    Args:
        row: Record accepted by `F_prompt_label`.

    Returns:
        Whatever `generate_openai_response` returns for the label prompt.
    """
    return generate_openai_response(
        F_prompt_label(row),
        prompt_system_template_label,
        json_format_label,
        max_tokens=2000,
    )

In [0]:
# Shared thread pool for the label-level GPT calls (at most two at a time).
executor_label = ThreadPoolExecutor(max_workers=2)


def apply_async_label(df):
    """Run `F_prompt_GPT_label` on every row of `df` through `executor_label`.

    Args:
        df: DataFrame whose rows are submitted one by one.

    Returns:
        dict mapping each row index to the result obtained for that row.
        Entries are inserted as the calls complete, not in row order.

    Raises:
        Any exception raised inside a worker, re-raised by ``Future.result``.
    """
    index_by_future = {}
    for index, row in df.iterrows():
        index_by_future[executor_label.submit(F_prompt_GPT_label, row)] = index

    GPT_answers = {}
    for finished in as_completed(index_by_future):
        row_index = index_by_future[finished]
        print(row_index,end= ',')
        GPT_answers[row_index] = finished.result()
    print(' end of GPT calls A.')
    return GPT_answers
  

# 2- Ask about the label's Parent

In [0]:
# System prompts for the parent-label check. The two variants differ only in
# whether the label's description is announced as part of the input.
# NOTE(review): both prompts ask for a "relevancy_score" key, while
# `json_format_parent` only declares "main_focus" and "label_fit" -- confirm
# which of the two is intended.
_parent_prompt_head = (
    "You are an AI, trained to assess the potential relevance of a label for a given SSRN pre-print document. "
    "You'll be provided with the document's title, keywords, abstract, and the label's name"
)
_parent_prompt_tail = (
    ". Your mission is to gauge if the label could be a reasonable match for the document. "
    "A label can be considered a reasonable match even if it only partially aligns with the document's main theme. "
    "Your response should be a JSON object. "
    "This JSON object should include three keys: "
    "\"main_focus\", a brief summary of the document's main theme, "
    "\"label_fit\", indicating the fit as a boolean value, "
    "and \"relevancy_score\", showing the relevance as a score from 0 to 1. "
    "It's important to use the full scoring range to indicate varying levels of relevance. "
    "Do not use the slash or backslash characters in your output."
)

prompt_system_template_parent_without = _parent_prompt_head + _parent_prompt_tail

prompt_system_template_parent_with = _parent_prompt_head + " and description" + _parent_prompt_tail

def F_prompt_parent(row):
    """Build the user message for the parent-label check.

    Args:
        row: Record providing 'label', 'title', 'abstract' and 'keywords'.
            The label must be a key of the notebook-level ``dic_taxonomy``.

    Returns:
        Prompt text listing the document fields followed by the label's id
        and cleaned name. A final "description= ..." part is appended only
        when a non-empty description is available.
    """
    label = row['label']
    taxonomy_entry = dic_taxonomy[label]
    label_ID = taxonomy_entry['id']
    cleaned_name = taxonomy_entry['cleaned_name']

    # Use 'description_cleaned' unless it is empty, then fall back to
    # 'description_generated' (which may itself be empty).
    description = taxonomy_entry['description_cleaned']
    if len(description) == 0:
        description = taxonomy_entry['description_generated']

    document_and_label = f"""given document:
'title'= '{row['title']}',
'abstract'= '{row['abstract']}'
'keywords'= '{row['keywords']}'

Label:
ID= {label_ID}
name= '{cleaned_name}'
"""
    if len(description) == 0:
        return document_and_label
    return document_and_label + f"description= '{description}'"


In [0]:

# Output schema handed to `generate_openai_response` by `F_prompt_GPT_parent`.
# Presumably an OpenAI function-calling definition -- confirm in ./utils.
json_format_parent = [
    dict(
        name="check_relevancy",
        description="check relevancy",
        parameters=dict(
            type="object",
            properties=dict(
                main_focus=dict(
                    type="string",
                    description="main focus of the paper",
                ),
                label_fit=dict(
                    type="boolean",
                    description="if label is a good match for the paper",
                ),
            ),
            required=["main_focus", "label_fit"],
        ),
    )
]

def F_prompt_GPT_parent(row):
    """Ask GPT whether the (parent) label in `row` could match the document.

    Args:
        row: Record accepted by `F_prompt_parent`.

    Returns:
        Whatever `generate_openai_response` returns for the parent prompt.
    """
    message_user = F_prompt_parent(row)

    # Pick the system prompt that matches whether a description was included.
    # NOTE(review): this is a substring test on the whole message, so it also
    # matches when the document text itself contains "description= ".
    has_description = "description= " in message_user
    message_system = (
        prompt_system_template_parent_with
        if has_description
        else prompt_system_template_parent_without
    )

    return generate_openai_response(message_user, message_system, json_format_parent, max_tokens=2000)

In [0]:
# Shared thread pool for the parent-level GPT calls (at most four at a time).
executor_parent = ThreadPoolExecutor(max_workers=4)


def apply_async_parent(df):
    """Run `F_prompt_GPT_parent` on every row of `df` through `executor_parent`.

    Args:
        df: DataFrame whose rows are submitted one by one.

    Returns:
        dict mapping each row index to the result obtained for that row.
        Entries are inserted as the calls complete, not in row order.

    Raises:
        Any exception raised inside a worker, re-raised by ``Future.result``.
    """
    index_by_future = {}
    for index, row in df.iterrows():
        index_by_future[executor_parent.submit(F_prompt_GPT_parent, row)] = index

    GPT_answers = {}
    for finished in as_completed(index_by_future):
        row_index = index_by_future[finished]
        print(row_index,end= ',')
        GPT_answers[row_index] = finished.result()
    print(' end of GPT calls B.')
    return GPT_answers
  

## Ask GPT in parallel

In [0]:
# Give every document its own empty result list (a separate list object per
# row, so rows never share one list).
df_in['labels_Proposed_approach'] = [list() for _ in df_in.index]

In [0]:
n_initial = 40      # number of leading 'labels_BiEncoder' candidates checked per document
pause_seconds = 10  # pause after each batch of GPT calls (presumably to respect rate limits)


def _build_prompt_frame(labels, doc):
    """Return one row per label, each carrying the document's title, keywords and abstract."""
    frame = pd.DataFrame()
    frame['label'] = labels
    frame['title'] = doc['title']
    frame['keywords'] = doc['keywords']
    frame['abstract'] = doc['abstract']
    return frame


def _labels_judged_fit(labels, answers):
    """Return the labels whose answer has a truthy 'label_fit', in index order.

    `answers` is keyed by the prompt frame's index and is filled in completion
    order by the thread pool. Iterating the sorted keys keeps the result
    deterministic and in the same order as `labels`.
    """
    return [labels[key] for key in sorted(answers) if answers[key]['label_fit']]


for i_doc, doc_tmp in df_in.iloc[:].iterrows():
    print('document' , i_doc, end = ': ')

    # 1 - Label itself: ask GPT about each of the first `n_initial` candidates.
    leave_labels = doc_tmp['labels_BiEncoder'][:n_initial]
    df_temp = _build_prompt_frame(leave_labels, doc_tmp)
    GPT_answer_labels = apply_async_label(df_temp)
    selected_labels = _labels_judged_fit(leave_labels, GPT_answer_labels)
    print('-->' , len(selected_labels), end = ' left : ')

    # 2 - Parent label: keep a label only if its parent is also judged a fit.
    if len(selected_labels) > 0:
        time.sleep(pause_seconds)
        parent_labels = [dic_taxonomy[l]['parent_label'] for l in selected_labels]
        df_temp = _build_prompt_frame(parent_labels, doc_tmp)
        GPT_answer_parents = apply_async_parent(df_temp)
        selected_labels_selected_parent = _labels_judged_fit(selected_labels, GPT_answer_parents)
    else:
        # Nothing survived stage 1, so there is nothing to ask about and no
        # reason to wait before the (skipped) second batch.
        GPT_answer_parents = {}
        selected_labels_selected_parent = []
    print('-->' , len(selected_labels_selected_parent))

    # Store the labels that passed both checks for this document.
    df_in.at[i_doc,'labels_Proposed_approach'] = selected_labels_selected_parent
    print(' . . . ')

    time.sleep(pause_seconds)


# Store


In [0]:
# Persist the DataFrame, including the 'labels_Proposed_approach' column, as a
# pickle file under `path_results`.
# NOTE(review): the file name spells "Prpposed" -- this looks like a typo for
# "Proposed". It is left unchanged because other notebooks may read this exact
# path; confirm before renaming.
df_in.to_pickle(path_results + '5-PrpposedApproach.pckl')