### Initialize Model & Expert-Labeled Datasets

In [97]:
import json
import copy
from typing import List
from dotenv import dotenv_values

from openai import OpenAI
from sklearn.metrics import accuracy_score
import pandas as pd

config = dotenv_values('.keys')
client = OpenAI(api_key=config['OPENAI_API_KEY'])

In [103]:
def process_market_sentiment(data_pth: str | List[str]) -> pd.Series:
    """
    Process FOMC communications according to the 'NEUTRAL', 'HAWKISH', 'DOVISH' classification regime.
    data_pth: str | List[str] - path to the data file or list of paths to the data files.
    client: OpenAI - OpenAI client object
    """
    batch_template = {
        "custom_id": "",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "system",
                    "content": ("You are an expert in market sentiment analysis. Given you are an expert, you have the task of classifying "
                               "any sentence coming from communications shared by the FOMC (Federal Open Market Committee) using three labels, "
                               "NEUTRAL, HAWKISH, or DOVISH. If the sentence is clearly not neutral and has no effect, label NEUTRAL. "
                               "If the sentence corresponds to a tightening of monetary policy, label HAWKISH. " 
                               "If the sentence corresponds to an easing of monetary policy, label DOVISH. "
                               "Stick to only using these three labels, and these labels only, else you will incur a heavy fine from the SEC. "
                               "Deliberate your answer as necessary, reasoning about your choice of label and eventually "
                               "selecting the most appropriate one by its specific name."
                               ),
                }
            ],
            "max_tokens": 40,
        }
    }

    if isinstance(data_pth, str):
        data = pd.read_excel(data_pth)
    else:
        data = pd.concat([pd.read_excel(pth) for pth in data_pth])
        data.reset_index(drop=True, inplace=True)

    with open("market_sentiment_batch.jsonl", "w") as f:
        for idx, row in data.iterrows():
            batch_el = copy.deepcopy(batch_template)
            batch_el["custom_id"] = str(idx)
            batch_el["body"]["messages"].append({"role": "user", "content": row["sentence"]})
            f.write(json.dumps(batch_el) + "\n")
    
                
    batch_file = client.files.create(
        file=open("market_sentiment_batch.jsonl", "rb"),
        purpose="batch"
    )

    batch_id = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    ).id
    

    while client.batches.retrieve(batch_id).status != "completed":
        if client.batches.retrieve(batch_id).status == "failed":
            raise Exception("Batch Failed")
        elif client.batches.retrieve(batch_id).status == "cancelled":
            raise Exception("Batch Cancelled")
    
    output_file_id = client.batches.retrieve(batch_id).output_file_id
    file_response = client.files.content(output_file_id)

    output = pd.Series([])
    for line in file_response.iter_lines():
        resp = json.loads(line)
        label = resp["response"]["body"]["choices"][0]["message"]["content"]
        output = pd.concat([output, pd.Series([label])], ignore_index=True)
    
    return output

### Create DataFrame between Model & Expert-Label with Evaluation Metrics

In [None]:
labels = process_market_sentiment(["training_data/lab-manual-combine-train-944601.xlsx",
                                         "training_data/lab-manual-combine-train-5768.xlsx",
                                         "training_data/lab-manual-combine-train-78516.xlsx"])

In [None]:
def process_labels(label):
    if 'HAWKISH' in label:
        return 1
    elif 'DOVISH' in label:
        return 0
    elif 'NEUTRAL' in label:
        return 2
    else:
        return -1
labels = labels.apply(process_labels)

pths = ["training_data/lab-manual-combine-train-944601.xlsx",
       "training_data/lab-manual-combine-train-5768.xlsx",
       "training_data/lab-manual-combine-train-78516.xlsx"]
data = pd.concat([pd.read_excel(pth) for pth in pths])

labels.reset_index(drop=True, inplace=True)
data.reset_index(drop=True, inplace=True)
data['pred_labels'] = labels
data

Unnamed: 0,index,sentence,year,label,pred_labels
0,554,Most other market interest rates declined furt...,2021,0,2
1,293,"In agriculture, depressed levels of crop price...",2017,0,2
2,900,The real federal funds rate probably was not g...,2011,1,2
3,211,Developments during the Second Period: 1998-20...,1999,2,2
4,638,Productivity and the equilibrium real interest...,2001,2,2
...,...,...,...,...,...
5704,183,Central banks generally appear to have embrace...,1997,2,2
5705,117,"I think the—you know, in a way, the least tigh...",2015,0,2
5706,646,"Put another way, the FOMC could have ""preempti...",2000,2,2
5707,812,The uncertainty about the threshold unemployme...,2021,2,2


In [102]:
data = data[data['label'] != 2]
accuracy = accuracy_score(data['label'], data['pred_labels'])
accuracy

0.24829700272479563

### Perform other labeling based on desired metric, and store in resulting dataframe

In [None]:
def process_market_trust(data_pth: str | List[str]) -> pd.Series:
    """
    Process FOMC communications according to the 'TRUST', 'MISTRUST', 'NEUTRAL' classification regime.
    data_pth: str | List[str] - path to the data file or list of paths to the data files.
    client: OpenAI - OpenAI client object
    """
    batch_template = {
        "custom_id": "",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "system",
                    "content": ("You are an expert in market sentiment analysis. Given you are an expert, you have the task of classifying "
                               "any sentence coming from communications shared by the FOMC (Federal Open Market Committee) using three labels, "
                               "NEUTRAL, TRUST, or MISTRUST. If the sentence is clearly neutral and has no effect, label NEUTRAL. "
                               "If the sentence corresponds to a positive increase in public trust of the FOMC, label TRUST. " 
                               "If the sentence corresponds to negative effect in public trust of the FOMC, label MISTRUST. "
                               "Stick to only using these three labels, and these labels only, else you will incur a heavy fine from the SEC. "
                               "Please provide the labels for any sentences you are given."
                               ),
                }
            ],
            "max_tokens": 3,
        }
    }

    if isinstance(data_pth, str):
        data = pd.read_excel(data_pth)
    else:
        data = pd.concat([pd.read_excel(pth) for pth in data_pth])
        data.reset_index(drop=True, inplace=True)

    with open("market_sentiment_batch.jsonl", "w") as f:
        for idx, row in data.iterrows():
            batch_el = copy.deepcopy(batch_template)
            batch_el["custom_id"] = str(idx)
            batch_el["body"]["messages"].append({"role": "user", "content": row["sentence"]})
            f.write(json.dumps(batch_el) + "\n")
    
                
    batch_file = client.files.create(
        file=open("market_sentiment_batch.jsonl", "rb"),
        purpose="batch"
    )

    batch_id = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    ).id
    

    while client.batches.retrieve(batch_id).status != "completed":
        if client.batches.retrieve(batch_id).status == "failed":
            raise Exception("Batch Failed")
        elif client.batches.retrieve(batch_id).status == "cancelled":
            raise Exception("Batch Cancelled")
    
    output_file_id = client.batches.retrieve(batch_id).output_file_id
    file_response = client.files.content(output_file_id)

    output = pd.Series()
    for line in file_response.iter_lines():
        resp = json.loads(line)
        label = resp["response"]["body"]["choices"][0]["message"]["content"]
        output[resp["custom_id"]] = label
    
    return output

def process_market_stability(data_pth: str | List[str]) -> pd.Series:
    """
    Process FOMC communications according to the 'STABLE', 'VOLATILE', 'NO-EFFECT' classification regime.
    data_pth: str | List[str] - path to the data file or list of paths to the data files.
    client: OpenAI - OpenAI client object
    """
    batch_template = {
        "custom_id": "",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {
                    "role": "system",
                    "content": ("You are an expert in market sentiment analysis. Given you are an expert, you have the task of classifying "
                               "any sentence coming from communications shared by the FOMC (Federal Open Market Committee) using three labels, "
                               "STABLE, VOLATILE, or NO-EFFECT. If the sentence is filler and has no implications, label NO-EFFECT. "
                               "If the sentence corresponds to an implied increase in market volatility in the future, label VOLATILE. " 
                               "If the sentence corresponds to an implied expectation of future market stability, label STABLE. "
                               "Stick to only using these three labels, and these labels only, else you will incur a heavy fine from the SEC. "
                               "Please provide the labels for any sentences you are given."
                               ),
                }
            ],
            "max_tokens": 3,
        }
    }

    if isinstance(data_pth, str):
        data = pd.read_excel(data_pth)
    else:
        data = pd.concat([pd.read_excel(pth) for pth in data_pth])
        data.reset_index(drop=True, inplace=True)

    with open("market_sentiment_batch.jsonl", "w") as f:
        for idx, row in data.iterrows():
            batch_el = copy.deepcopy(batch_template)
            batch_el["custom_id"] = str(idx)
            batch_el["body"]["messages"].append({"role": "user", "content": row["sentence"]})
            f.write(json.dumps(batch_el) + "\n")
    
                
    batch_file = client.files.create(
        file=open("market_sentiment_batch.jsonl", "rb"),
        purpose="batch"
    )

    batch_id = client.batches.create(
        input_file_id=batch_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
    ).id
    

    while client.batches.retrieve(batch_id).status != "completed":
        if client.batches.retrieve(batch_id).status == "failed":
            raise Exception("Batch Failed")
        elif client.batches.retrieve(batch_id).status == "cancelled":
            raise Exception("Batch Cancelled")
    
    output_file_id = client.batches.retrieve(batch_id).output_file_id
    file_response = client.files.content(output_file_id)

    output = pd.Series()
    for line in file_response.iter_lines():
        resp = json.loads(line)
        label = resp["response"]["body"]["choices"][0]["message"]["content"]
        output[resp["custom_id"]] = label
    
    return output