In [1]:
import os
import openai
import json

# Load the FinEntity records and keep only the first 200 for this run.
# The file is opened in a `with` block so the handle is closed as soon as
# the JSON has been parsed instead of lingering until garbage collection.
with open('./data/FinEntity.json') as data_file:
    raw = json.load(data_file)
raw = raw[:200]

# The API key is read from the environment so it never appears in the notebook.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Prompt pieces for the few-shot chat request built in the request loop below.
# sys_prompt is sent as the "system" message of every request.
sys_prompt = "Discard all the previous instructions. Behave like you are an expert entity recognizer and sentiment classifier. "
# First few-shot "user" turn: the task instructions plus one example sentence.
user_prompt = """
Identify the entities which are companies or organizations from the following content and classify the sentiment of the corresponding entities into ‘Neutral’, ‘Positive’, or ‘Negative’ classes. 
Considering every sentence as a String in python, provide the entities with the start and end index to mark the boundaries of it including spaces and punctuation using zero-based indexing.
Do not give explanations for the sentiment. In the output,Tag means sentiment; value means entity name. If no entity is found in the sentence, the response should be empty. 
The sentence: 'NEW YORK - Wall Street ended sharply higher on Thursday, led by Tesla, Nvidia and other megacap growth stocks in a choppy session ahead of a key jobs report due on Friday. '
"""
# Expected "assistant" reply for the first example: one JSON object per line
# (the \n escapes are real newlines because the string is not raw).
# NOTE(review): for "megacap growth stocks" and "Friday" the value of
# end - start differs from len(value). The notebook recomputes start/end from
# the sentence text after the model replies, so this may be harmless -- confirm
# that the mismatch is intended before reusing these examples elsewhere.
assist_prompt = """{"start": 64, "end": 69, "value": "Tesla", "tag": "Positive"}\n{"start": 71, "end": 77, "value": "Nvidia", "tag": "Positive"}\n{"start": 82, "end": 98, "value": "megacap growth stocks", "tag": "Positive"}\n{"start": 127, "end": 132, "value": "Friday", "tag": "Neutral"}
"""
# Second few-shot "user" turn: a bare sentence with no instructions, which is
# the same shape as the real sentences sent as the final user message.
user_prompt2 = """Johnson & Johnson <JNJ.N> shares gained 0.20% after posting results that beat expectations but cut its full-year outlook, citing a stronger dollar. [nL4N2Z028U]
"""
# Expected "assistant" reply for the second example.
assist_prompt2 = """{"start": 0, "end": 17, "value": "Johnson & Johnson", "tag": "Positive"}
"""


In [2]:
def subset(alist, idxs):
    """Return a new list holding ``alist[i]`` for every ``i`` in ``idxs``.

    The result follows the order of ``idxs`` and repeats an element when its
    index is repeated. An index outside ``alist`` raises ``IndexError``.
    """
    return [alist[position] for position in idxs]

import time
# Ask the chat model to annotate every sentence and collect the parsed output.
# result_list holds one {'content', 'annotations'} dict per answered sentence;
# compare_list holds the corresponding original items in the same order, so
# the two lists can later be compared index by index.
MAX_ATTEMPTS = 2            # the initial request plus one retry
RETRY_DELAY_SECONDS = 0.5   # pause between attempts

# The few-shot conversation is identical for every request; only the final
# user turn (the sentence to annotate) changes, so it is built once here.
FEW_SHOT_MESSAGES = [
    {"role": "system", "content": sys_prompt},
    {"role": "user", "content": user_prompt},
    {"role": "assistant", "content": assist_prompt},
    {"role": "user", "content": user_prompt2},
    {"role": "assistant", "content": assist_prompt2},
]


def _request_reply(sentence):
    """Send one annotation request, retrying with the same messages on failure.

    Returns the API response, or None when every attempt raised. The retry
    re-sends the full few-shot conversation; it previously referenced an
    undefined name and its response was never processed.
    """
    messages = FEW_SHOT_MESSAGES + [{"role": "user", "content": sentence}]
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            return openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=messages,
                temperature=0.0,
            )
        except Exception as err:  # not a bare except: Ctrl-C must still interrupt the run
            if attempt == MAX_ATTEMPTS:
                print(f"request failed after {MAX_ATTEMPTS} attempts: {err!r}")
                return None
            print("retry request")
            time.sleep(RETRY_DELAY_SECONDS)
    return None


def _parse_annotations(reply_text):
    """Extract one JSON object per reply line, skipping lines that do not parse.

    Each line is trimmed to the span from its first '{' to its first '}' so
    that any text the model adds around the object is ignored.
    """
    annotations = []
    for line in reply_text.split('\n'):
        left = line.find('{')
        right = line.find('}')
        if left == -1 or right == -1:
            continue
        try:
            annotations.append(json.loads(line[left:right + 1]))
        except json.JSONDecodeError:
            # One malformed line must not abort the whole run.
            continue
    return annotations


result_list = []
compare_list = []
for item in raw:
    sentence = item['content']
    rsp = _request_reply(sentence)
    if rsp is None:
        # No usable response: skip the item in BOTH lists so they stay aligned.
        continue
    choices = rsp['choices']
    # An empty choice list (or empty content) yields an empty annotation list.
    reply_text = (choices[0]['message']['content'] or '') if choices else ''
    result_list.append({
        'content': sentence,
        'annotations': _parse_annotations(reply_text),
    })
    compare_list.append(item)
    
# Correcting start and end tags
# The offsets reported by the model are not used as positions; they only
# decide the order in which repeated mentions of the same value are matched.
# Every annotation's start/end is recomputed by searching for its value in
# the sentence text.
def _realign_annotations(text, annotations):
    """Recompute 'start'/'end' of each annotation from its 'value' in ``text``.

    Annotations are visited in order of their reported 'start' (entries with a
    missing or non-numeric 'start' go last). The n-th annotation carrying a
    given value is matched to the n-th occurrence of that value in ``text``.
    Annotations whose value is missing, empty, or not found are dropped,
    because a failed ``str.find`` returns -1 and would otherwise be written
    back as a bogus span. Kept annotations are updated in place and returned
    in their original list order.
    """
    def _reported_start(annotation):
        start = annotation.get('start')
        return start if isinstance(start, (int, float)) else float('inf')

    next_search_from = {}   # value -> index just after the start of its last match
    kept_ids = set()
    for annotation in sorted(annotations, key=_reported_start):
        value = annotation.get('value')
        if not isinstance(value, str) or not value:
            continue
        start = text.find(value, next_search_from.get(value, 0))
        if start == -1:
            continue
        annotation['start'] = start
        annotation['end'] = start + len(value)
        next_search_from[value] = start + 1
        kept_ids.add(id(annotation))
    return [annotation for annotation in annotations if id(annotation) in kept_ids]


for item in result_list:
    item['annotations'] = _realign_annotations(item['content'], item['annotations'])
        

COMPLETE


In [10]:
# Persist the model annotations as a single JSON line followed by a newline.
with open('./data/open_ai.json', 'wt') as out_file:
    out_file.write(json.dumps(result_list))
    out_file.write("\n")
print("COMPLETE")

COMPLETE


In [11]:
# Print the size of the reference list, then of the model-output list;
# the two numbers are expected to match.
for collected in (compare_list, result_list):
    print(len(collected))

200
200


In [12]:
# Reload the saved model output; the `with` block closes the file handle
# once the JSON has been parsed.
with open('./data/open_ai.json') as saved_file:
    result_list = json.load(saved_file)

for example in result_list:
    for annotation in example['annotations']:
        #We expect the key of label to be label but the data has tag
        annotation['label'] = annotation['tag']

In [13]:
from sequence_aligner.labelset import LabelSet
from sequence_aligner.dataset import TrainingDatasetCRF
from sequence_aligner.containers import TraingingBatch
from transformers import BertTokenizerFast

# Sentiment classes used as entity labels in this dataset.
label_set = LabelSet(labels=["Neutral", "Positive", "Negative"])
tokenizer = BertTokenizerFast.from_pretrained('yiyanghkust/finbert-pretrain')

# Both datasets are built with identical settings so their label sequences
# can be compared position by position.
shared_dataset_kwargs = dict(
    tokenizer=tokenizer,
    label_set=label_set,
    tokens_per_batch=128,
)
# Built from the original items kept in compare_list.
dataset = TrainingDatasetCRF(data=compare_list, **shared_dataset_kwargs)
# Built from the model output in result_list.
dataset_openai = TrainingDatasetCRF(data=result_list, **shared_dataset_kwargs)

In [14]:
from seqeval.metrics import f1_score
from seqeval.metrics import precision_score
from seqeval.metrics import accuracy_score
from seqeval.metrics import recall_score
from seqeval.metrics import classification_report
from process import ids_to_labels,Metrics,Metrics_e
from seqeval.scheme import BILOU

def _ids_to_label_names(label_ids, none_fallback=None):
    """Convert a sequence of label ids to label strings via ``label_set``.

    Ids equal to -1 are skipped (presumably padding / ignored positions --
    TODO confirm against TrainingDatasetCRF). When ``none_fallback`` is given,
    an id of None is replaced by it before the lookup.
    """
    names = []
    for label_id in label_ids:
        if label_id == -1:
            continue
        if label_id is None and none_fallback is not None:
            label_id = none_fallback
        names.append(label_set.ids_to_label[label_id])
    return names


# Build one list of label strings per example for the reference data and for
# the model output, then score them with seqeval.
label_list = []
pred_label_list = []
for i in range(len(dataset)):
    label_list.append(_ids_to_label_names(dataset[i].labels))
    # Only the model-output labels may contain None; those are mapped to id 0
    # (presumably the outside/"O" label -- TODO confirm against LabelSet).
    pred_label_list.append(
        _ids_to_label_names(dataset_openai[i].labels, none_fallback=0)
    )

report = classification_report(label_list, pred_label_list, mode='strict', scheme=BILOU)
print(report)

              precision    recall  f1-score   support

    Negative       0.45      0.81      0.58        63
     Neutral       0.36      0.44      0.39       139
    Positive       0.62      0.83      0.71       225

   micro avg       0.51      0.70      0.59       427
   macro avg       0.48      0.69      0.56       427
weighted avg       0.51      0.70      0.59       427

