In [10]:
from openai import OpenAI
import os
from pydantic import BaseModel
import tiktoken
import instructor
from instructor.batch import BatchJob

In [12]:
import json
import pandas as pd

In [4]:
# Shared OpenAI client. Reading the key from OPENAI_API_KEY is the SDK default,
# so the explicit argument could be omitted.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Prompt Formation

In [3]:
# System message sent with every request. It asks the model to label the word
# wrapped in [TGT] markers with a context (Question1) and with the category of
# the object being managed (Question2), and gives three worked examples.
system_prompt = """
The following sentence contains a management-related word highlighted using [TGT]. This word can be any part of speech (noun, verb, adjective, etc.), such as “manage,” “manager,” “managing,” etc. Please answer the following question based on the context of the sentence:

Question1: What is the context of the management-related word?:
- Business and Professional: for example, work, job, industry etc.
- Personal: for example, self, personal life, personal relationship etc.
- Others: Any context not fitting into the above categories.

Question2: Categorize the object being managed into one of the following categories:
- Emotion and subjective experiences:  feelings like happiness, stress, expectations, etc.
- Human body: physical aspects like health, fitness, etc.
- Time: for example, scheduling, deadlines, etc.
- Family: relationships with family members like parents, siblings, children, spouse, etc.
- Friendship: relationships with friends
- Romantic relationships: for example, love, dating, intimacy, etc
- Household: for example, home, groceries, backyard, etc
- Financials: for example, money, savings, wealth, etc
- Business Operations: for example, sales, marketing, production, employee, etc
- Others: Any object not fitting into the above categories.

Instructions:
- Focus only on the management-related word highlighted using [TGT] in the sentence.
- Only identify instances that strictly fall under the identified categories. Label all other instances as "Others".
- If the highlighted word is intransitive (e.g., “manage .. to do”), assume the context and object being managed are "Others".

Example:
- Sentence: The manager cannot [TGT] manage [TGT] his children when they misbehave.
- Answer: Question1: Personal; Question2: Family
- Sentence: The [TGT] manager [TGT] cannot manage his children when they misbehave.
- Answer: Question1: Business and Professional; Question2: Business Operations
- Sentence: I [TGT] manage [TGT] to write him a letter.
- Answer: Question1: Others; Question2: Others
"""

In [4]:
# Structured-output schema for one classified sentence.
# Documented with `#` comments rather than a docstring on purpose: pydantic copies
# a model's docstring into its JSON schema, which would alter what is sent to the API.
class Category(BaseModel):
    # Answer to the prompt's Question1 (context of the management word).
    Question1: str
    # Answer to the prompt's Question2 (category of the object being managed).
    Question2: str

In [5]:
def send_message(sentence):
    """Classify one [TGT]-tagged sentence with the chat completions parse endpoint.

    Args:
        sentence: The sentence to classify; it is sent as the user message after
            the module-level ``system_prompt``.

    Returns:
        The parsed ``Category`` instance when the model produced one, the refusal
        text when the model refused, or ``None`` when neither is present or when
        the request failed (the error is printed, not raised).
    """
    # The notebook only does `from openai import OpenAI`, so the bare name `openai`
    # is not defined at module level. Import it here so the exception class below
    # resolves instead of raising NameError inside the error handler.
    import openai

    try:
        completion = client.beta.chat.completions.parse(
            model="gpt-4o-2024-08-06",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": sentence},
            ],
            response_format=Category,
            max_tokens=100,
        )
        sentence_category = completion.choices[0].message
        if sentence_category.parsed:
            return sentence_category.parsed
        if sentence_category.refusal:
            # The model declined to answer; hand the refusal text back to the caller.
            return sentence_category.refusal
        return None
    except openai.LengthFinishReasonError as e:
        # The answer was cut off by max_tokens. No retry is performed here.
        print("Too many tokens: ", e)
        return None
    except Exception as e:
        # Best-effort by design: report any other failure and carry on.
        print(e)
        return None

In [18]:
# individual test: classify one hand-written sentence as a smoke test of send_message
test_sentence = "She is the [TGT] managing [TGT] editor of the magazine."
send_message(test_sentence)

Category(Question1='Business and Professional', Question2='Business Operations')

# Batch Inference
Given that it is very cheap to process a large number of sentences, we run the classifier on the entire COHA dataset.

In [20]:
# token estimates: copied from the OpenAI cookbook
def num_tokens_from_messages(messages, model="gpt-4o-2024-08-06"):
    """Estimate the number of prompt tokens used by a list of chat messages.

    Args:
        messages: List of message dicts; every value of every dict is encoded.
        model: Model name. Names outside the pinned set are mapped to a pinned
            snapshot of the same family, with a printed warning.

    Returns:
        Encoded length of all values, plus 3 per message, plus 1 for each message
        that has a "name" key, plus 3 for the reply priming.

    Raises:
        NotImplementedError: If the model matches no known family.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using o200k_base encoding.")
        encoding = tiktoken.get_encoding("o200k_base")

    pinned_models = {
        "gpt-3.5-turbo-0125",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        "gpt-4o-mini-2024-07-18",
        "gpt-4o-2024-08-06",
    }
    # (family substring, warning to print, pinned snapshot to count with).
    # Order matters: "gpt-4o-mini" must be tried before "gpt-4o", and both before "gpt-4".
    family_fallbacks = (
        (
            "gpt-3.5-turbo",
            "Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.",
            "gpt-3.5-turbo-0125",
        ),
        (
            "gpt-4o-mini",
            "Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.",
            "gpt-4o-mini-2024-07-18",
        ),
        (
            "gpt-4o",
            "Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.",
            "gpt-4o-2024-08-06",
        ),
        (
            "gpt-4",
            "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.",
            "gpt-4-0613",
        ),
    )

    if model not in pinned_models:
        for family, warning, snapshot in family_fallbacks:
            if family in model:
                print(warning)
                return num_tokens_from_messages(messages, model=snapshot)
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}."""
        )

    tokens_per_message = 3
    tokens_per_name = 1

    total = 0
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(text)) for text in message.values())
        if "name" in message:
            total += tokens_per_name
    return total + 3  # every reply is primed with <|start|>assistant<|message|>

def calculate_price(num_token, price_per_million=2.5):
    """Convert a token count into an estimated cost.

    Args:
        num_token: Number of tokens to price.
        price_per_million: Price charged per one million tokens. Defaults to 2.5,
            the rate this function previously hard-coded.

    Returns:
        ``num_token * price_per_million / 1000000``.
    """
    return num_token*price_per_million/1000000

In [21]:
# Token count and estimated cost of one example request (system prompt + one tagged sentence).
test_sentence = 'She is a good [TGT] manager [TGT] of the land that she inherited from her father.'
test_messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": test_sentence}]
num_tokens_from_messages(test_messages)  # not displayed: only the cell's last expression is echoed
calculate_price(num_tokens_from_messages(test_messages))

0.00121

## load data

In [22]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

In [23]:
# load df
# NOTE(review): absolute path on a shared filesystem — presumably only valid on the original machine.
data_path = '/zfs/projects/faculty/amirgo-management/COHA_data/processed_data/'
# Unpickling can execute arbitrary code; only load pickle files you produced yourself.
df=pd.read_pickle(data_path+'coha_mgmt_sent_wsd_tagged.pkl')

# Disabled filter, kept for reference:
# remove cases when WSD label with 0 and with confidence >0.95
# absolute intransitive cases
# df = df[~((df['WSD_pred']==0) & (df['WSD_conf']>0.95))]

In [24]:
# add [TGT] label to focal word
def if_is_instrasitive(sentence, mgmt_tag):
    """Flag the "manage to ..." construction: a verb target directly followed by "to".

    Args:
        sentence: Raw sentence text; it is tokenized with ``word_tokenize``.
        mgmt_tag: Indexable whose element 0 is the target's token index and
            element 2 its part-of-speech tag (element 1, the word, is not needed here).

    Returns:
        True when the tag contains "VB" and the token after the target is "to";
        False otherwise, including when the target is the last token.
    """
    target_idx = mgmt_tag[0]
    target_pos = mgmt_tag[2]
    if "VB" not in target_pos:
        return False
    words = word_tokenize(sentence)
    next_idx = target_idx + 1
    # Guard the look-ahead: a sentence ending on the target verb has no next token,
    # and indexing past the end would raise IndexError.
    return next_idx < len(words) and words[next_idx] == "to"

def add_tgt_label(sentence, mgmt_tag):
    """Wrap the focal word of a sentence in ``[TGT]`` markers.

    The sentence is tokenized with ``word_tokenize``. The token at index
    ``mgmt_tag[0]`` is wrapped only if it equals ``mgmt_tag[1]``; otherwise the
    sentence comes back unmarked. Tokens are re-joined with single spaces and the
    result ends with a trailing space.
    """
    tokens = word_tokenize(sentence)
    position, expected = mgmt_tag[0], mgmt_tag[1]
    marked = [
        '[TGT] ' + token + ' [TGT]' if (idx == position and token == expected) else token
        for idx, token in enumerate(tokens)
    ]
    return "".join(piece + " " for piece in marked)

# Row-wise derived columns:
#   tgt_tagged      - the sentence with the focal word wrapped in [TGT] markers (the model input)
#   is_intransitive - True when the focal word is a verb directly followed by "to"
df['tgt_tagged'] = df.apply(lambda x: add_tgt_label(x['mgmt_sents'], x['mgmt_tag']), axis=1)
df['is_intransitive'] = df.apply(lambda x: if_is_instrasitive(x['mgmt_sents'], x['mgmt_tag']), axis=1) # sample is very small, keep it for training purpose

In [26]:
# Keep only the rows not flagged as intransitive and renumber them from 0.
keep_mask = df['is_intransitive'].eq(False)
df = df.loc[keep_mask].reset_index(drop=True)
df.shape

(37885, 11)

## batch formatting

In [27]:
# estimation of cost
def format_message(sent):
    """Build the two-message chat payload (system prompt, then `sent` as the user turn)."""
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": sent}
    return [system_turn, user_turn]
# One chat payload per tagged sentence.
msg_ls = [format_message(sent) for sent in df['tgt_tagged']]

# Sum the per-request token estimates and price the total.
total_tokens = sum(num_tokens_from_messages(msg) for msg in msg_ls)
print("Total tokens: ", total_tokens)
print("Total cost: ", calculate_price(total_tokens))

Total tokens:  19187776
Total cost:  47.96944


In [8]:
# use function from instructor package
# Response schema handed to the batch helpers below.
# NOTE(review): this repeats the Category definition from the "Prompt Formation" section;
# the later definition silently replaces the earlier one when both cells are run.
# `#` comments are used instead of a docstring because pydantic copies a model's
# docstring into its JSON schema.
class Category(BaseModel):
    # Answer to the prompt's Question1 (context of the management word).
    Question1: str
    # Answer to the prompt's Question2 (category of the object being managed).
    Question2: str

def get_messages(df):
    """Lazily yield one chat payload per row of ``df``.

    Each payload is a two-item list: the system prompt, followed by the row's
    ``tgt_tagged`` sentence as the user message. Rows are visited in frame order.
    """
    for sentence in df['tgt_tagged']:
        system_turn = {"role": "system", "content": system_prompt}
        user_turn = {"role": "user", "content": sentence}
        yield [system_turn, user_turn]

In [29]:
# Build the batch request file from every tagged sentence, using the same model,
# token limit and schema as send_message, with temperature 0.
# NOTE(review): uploading the file and downloading the results are not shown in this
# notebook — presumably done outside it; confirm.
BatchJob.create_from_messages(
    messages_batch=get_messages(df),
    model="gpt-4o-2024-08-06",
    file_path=data_path+"full_coha_oct30.jsonl",
    response_model=Category,
    max_tokens=100,
    temperature=0.0,
)

In [33]:
# Read the batch results into Category objects.
# This is a different file from the request file written above
# ("full_coha_classified_oct30.jsonl" vs "full_coha_oct30.jsonl").
# NOTE(review): `unparsed` presumably holds the responses that failed validation — confirm.
parsed, unparsed = BatchJob.parse_from_file(  
    file_path=data_path+"full_coha_classified_oct30.jsonl", response_model=Category
)

In [34]:
# Labels are attached by position, so there must be exactly one parsed result per row.
# Fail with an explicit message (same exception type pandas would raise on a length
# mismatch) instead of an opaque one, and report how many responses were unparsed.
if len(parsed) != len(df):
    raise ValueError(
        f"Cannot attach labels: {len(parsed)} parsed and {len(unparsed)} unparsed "
        f"responses for {len(df)} sentences."
    )
# NOTE(review): this assumes the results file lists responses in the same order as the
# requests. Batch output order is not guaranteed to match input order — verify the
# alignment (e.g. via custom_id) before trusting the labels.
df['Question1'] = [x.Question1 for x in parsed]
df['Question2'] = [x.Question2 for x in parsed]

In [41]:
# correct the labels
# df.loc[df['Question1']=='Friendship', 'Question1'] = 'Personal'
# df.loc[df['Question1']=='Household', 'Question1'] = 'Personal'

In [42]:
# Distribution of the context labels (Question1) over all classified sentences.
df['Question1'].value_counts()

Question1
Business and Professional    31337
Others                        5147
Personal                      1401
Name: count, dtype: int64

In [35]:
# Distribution of the managed-object labels (Question2) over all classified sentences.
df['Question2'].value_counts()

Question2
Business Operations                   29398
Others                                 6098
Financials                             1150
Household                               330
Emotion and subjective experiences      291
Family                                  215
Human body                              206
Time                                     89
Romantic relationships                   83
Friendship                               25
Name: count, dtype: int64

In [54]:
# Manual inspection: every sentence labelled context "Others" with object "Human body".
sampled = df[(df['Question1']=='Others') & (df['Question2']=='Human body')]
for row_id, tagged, context, managed_object in zip(
    sampled['id'], sampled['tgt_tagged'], sampled['Question1'], sampled['Question2']
):
    # One field per line, followed by a blank separator line.
    print(row_id, tagged, context, managed_object, "", sep="\n")

773847
care of the sick horse the horse is the most healthful of farm animals when given his freedom and properly [TGT] managed [TGT] . 
Others
Human body

303485
infants , said dr. rice , may recover and general health may improve under proper [TGT] management [TGT] , although a residual pneumonic process may persist indefinitely . 
Others
Human body

780151
abel had been losing [TGT] management [TGT] of his eyes ; however , he seemed to be satisfied that his brother was better off than he had been . 
Others
Human body

774256
emphasizes that in the army prompt diagnosis and brief psychotherapy in the early stage of the illness were essential to proper [TGT] management [TGT] of functional gastrointestinal disorders @ @ @ @ @ @ @ @ @ @ manifestation , differentiation between peptic ulcer and functional distress was essential . 
Others
Human body

774319
when health is seen as the opposite of dis-ease , when human behavior is understood as symptomatic of underlying needs , when developm

In [42]:
# Manual inspection: the first (up to) ten sentences whose managed object is "Household".
sampled = df[df['Question2']=='Household']
# head(10) rather than a fixed range(10): with fewer than ten matching rows the
# positional lookups would otherwise raise IndexError.
preview = sampled.head(10)
for i in range(len(preview)):
    print(preview['tgt_tagged'].iloc[i])
    print(preview['Question1'].iloc[i])
    print(preview['Question2'].iloc[i])
    print('-----------------')

why not invite your daughter to cooperate with you in [TGT] managing [TGT] your home as her firm runs its business ? 
No
Household
-----------------
why not induce her to help you apply business methods to home [TGT] manage-ment [TGT] ? 
No
Household
-----------------
this is not so simple as applying her newly acquired business experience to the [TGT] management [TGT] of your household , but you will accomplish it more easily if you will start with the firm conviction that you still exert a strong influence over your daughter 's viewpoint on moral and social questions . 
No
Household
-----------------
or , if one is a woman , one determines to marry prince charming ; he 'll be richer than rockefeller , wittier than oscar wilde , smarter than count d'orsay @ @ @ @ @ @ @ @ @ @ one [TGT] manages [TGT] very well with a comfortable house , a husband who in the winter sends one to palm beach with the children , and buys new cerise curtains for one 's bedroom . 
No
Household
----------------

In [55]:
# Persist the labelled frame. This overwrites any existing file at this path.
df.to_pickle(data_path+'coha_mgmt_sent_chatgpt_tagged_oct30.pkl')

In [1]:
# I accidentally overwrote the file, so the parsed data has to be extracted again.
# Read the user sentences back from the batch request JSONL file.
def sents_from_json(file_path):
    """Recover the user sentences from a JSONL batch request file.

    Args:
        file_path: Path to a JSON-lines file in which every record has the user
            message at ``record['body']['messages'][1]['content']``.

    Returns:
        The user-message contents, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError / IndexError: If a record lacks the expected structure.
    """
    sentences = []
    with open(file_path, "r") as f:
        # Stream line by line instead of loading the whole file with readlines().
        for line in f:
            # A blank line (e.g. a trailing newline at end of file) is not a record.
            if not line.strip():
                continue
            sentences.append(json.loads(line)['body']['messages'][1]['content'])
    return sentences

In [19]:
# Rebuild the labelled frame from the two JSONL files alone: sentences from the
# request file, labels from the results file.
# The rebuilt frame has only three columns (tgt_tagged, Question1, Question2).
data_path = '/zfs/projects/faculty/amirgo-management/COHA_data/processed_data/'
parsed, unparsed = BatchJob.parse_from_file(  
    file_path=data_path+"full_coha_classified_oct30.jsonl", response_model=Category
)
mgmt_sents = sents_from_json(data_path+"full_coha_oct30.jsonl")
df = pd.DataFrame({'tgt_tagged': mgmt_sents})
# NOTE(review): sentences and labels are paired purely by position. This assumes the
# results file has one parsed entry per request, in request order — confirm, and check
# that `unparsed` is empty.
df['Question1'] = [x.Question1 for x in parsed]
df['Question2'] = [x.Question2 for x in parsed]

In [16]:
# correct the labels
# df.loc[df['Question1']=='Friendship', 'Question1'] = 'Personal'
# df.loc[df['Question1']=='Household', 'Question1'] = 'Personal'

In [20]:
# Save the rebuilt frame. This is the same path the earlier save cell wrote to,
# so that file is replaced.
df.to_pickle(data_path+'coha_mgmt_sent_chatgpt_tagged_oct30.pkl')