In [91]:
!pip install --upgrade openai wandb python-dotenv



In [2]:
from openai import OpenAI
import wandb
import os
import json
from dotenv import load_dotenv
from tqdm import tqdm



## System prompt

You are a helpful AI assistant who is an expert in language art. You are going to help me with a task described as follows: 




## User prompt

You are a helpful AI assistant who is an expert in language art. You are going to help me with a task described as follows: 

I'm planning to fine-tune an open-source, chat-based large language model to enhance its understanding of sarcasm, specifically to mimic Chandler Bing from the TV sitcom Friends, known for his sarcastic wit. The method involves using direct preference optimization and Reinforcement Learning from Human Feedback (RLHF). I need your assistance in creating a preference dataset for this task based on the following guidelines:

Source Material: Utilize the screenplay from an episode of Friends that I'll provide you later below.
Task Details:
1. Extraction: Identify and extract every line spoken by Chandler Bing.
2. Context Summarization: For each line extracted, provide a brief but useful summary of the surrounding context. If Chandler's line is a response to another character, append that information to the end of the context summary. If Chandler's line is a standalone joke, just provide a brief summary of the scene or context.
3. Sarcasm Analysis: Determine whether each quote is sarcastic. Support your classification with a brief reason.
4. Response Generation: For each quote, generate two brief responses that Chandler Bing is unlikely to say. One response should be sarcastic, and the other should be sincere. Remember that neither response should fit Chandler Bing's style or sound like his original quote at all. These will serve as the rejected responses in the dataset, contrasting with Chandler's original line, which is the preferred response.
5. Output Format: Organize the results into a python-dictionary format with six keys. The final output should be a string that can be evaluated and converted into a real python dictionary and json file later. Please ensure that you skip any description or explanation of the analysis process and your output should be structured exactly as follows:
{
    "Chandler_quote": Original line from Chandler Bing,
    "Context": Brief summary of the context around the quote,
    "Sarcasm": True/False,
    "Reason": Explanation for the sarcasm classification,
    "Unlike_chandler_sarcastic": Hypothetical sarcastic response unlikely to be said by Chandler Bing,
    "Unlike_chandler_sincere": Hypothetical sincere response unlikely to be said by Chandler Bing
}

Below is the screenplay from the episode of Friends that you should use for this task. 


In [3]:
## read system_prompt.txt and user_prompt.txt from the current directory
# An explicit encoding keeps the prompt text identical across platforms;
# without it open() falls back to a locale-dependent default.
# NOTE(review): assumes both files are saved as UTF-8 - confirm.
with open("system_prompt.txt", "r", encoding="utf-8") as f:
    SYSTEM_PROMPT = f.read()

with open("user_prompt.txt", "r", encoding="utf-8") as f:
    USER_PROMPT = f.read()


In [4]:
## read the friends_screenplay.jsonl file and load the data (one JSON object per line)
# NOTE(review): assumes the file is saved as UTF-8 - confirm.
friends_screenplay = []

with open("friends_screenplay.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # tolerate blank lines (e.g. a trailing empty line at the end of the file),
        # which json.loads would otherwise reject
        if line.strip():
            friends_screenplay.append(json.loads(line))

# group the episodes by season; the season key is the first three characters
# of the episode id, e.g. "S01" for "S01E01"
friends_screenplay_by_season = {}
for episode in friends_screenplay:
    season = episode["id"][:3]
    friends_screenplay_by_season.setdefault(season, []).append(episode)

In [5]:
# # create a subset of the friends_screenplay_by_season with only 2 episodes from season 1
# # for testing/debugging purposes 
# friends_screenplay_by_season_subset =  friends_screenplay_by_season.copy()
# friends_screenplay_by_season_subset['S01'] = friends_screenplay_by_season['S01'][:2]

In [8]:
# read the variables defined in the .env file (OPENAI_API_KEY is expected there)
load_dotenv()

# OpenAI client used by every generation call below
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def gpt_generate_data(client: OpenAI, model: str, screenplay_byseason: dict, season: str,
                      user_prompt: str, system_prompt: str,
                      max_tokens: int = 4096, temperature: float = 0, frequency_penalty: float = 0.0):
    """
    Generate the preference data for every episode of one season of Friends.

    For each episode the screenplay is appended to ``user_prompt`` and sent,
    together with ``system_prompt``, as a single chat-completion request.
    Exceptions raised by the API call are not caught here.

    Args:
        client: OpenAI client instance
        model: name of the chat model to use, e.g. "gpt-4-turbo"
        screenplay_byseason: maps a season key (e.g. "S01") to a list of episode
            dicts, each with an "id" and a "screenplay" entry
        season: key of the season to process
        user_prompt: the user prompt; the screenplay is appended to it
        system_prompt: the system prompt
        max_tokens: upper bound on the number of tokens generated per episode
        temperature: sampling temperature; values closer to 0 make the output
            more deterministic
        frequency_penalty: penalty applied to tokens according to how often they
            already appear in the text generated so far; 0.0 means no penalty

    Returns:
        A list with one dict per episode, holding the keys "id", "screenplay",
        "generated_result" (the raw text returned by the model) and
        "finish_reason" (as reported by the API; "length" means the output was
        cut off at ``max_tokens``).

    Raises:
        KeyError: if ``season`` is not a key of ``screenplay_byseason`` or an
            episode lacks the "id" / "screenplay" entry.
    """
    generated_result = []

    for episode in tqdm(screenplay_byseason[season]):
        episode_id = episode['id']  # not named `id`, which would shadow the builtin
        screenplay = episode['screenplay']

        prompt_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt + screenplay},
        ]

        response = client.chat.completions.create(
            model=model,
            messages=prompt_messages,
            temperature=temperature,
            max_tokens=max_tokens,
            frequency_penalty=frequency_penalty,
        )

        choice = response.choices[0]
        # A truncated answer usually ends in the middle of the list literal and
        # cannot be parsed later, so make it visible while the run is in progress.
        if choice.finish_reason == "length":
            tqdm.write(f"Warning: output for episode {episode_id} was truncated at max_tokens={max_tokens}")

        generated_result.append({
            "id": episode_id,
            "screenplay": screenplay,
            "generated_result": choice.message.content,
            "finish_reason": choice.finish_reason,
        })

    return generated_result


## NOTE THE COST
* It takes ~30 minutes and ~$3 to generate the data for each season of Friends (~ 20 episodes)


### Season 1 


In [7]:
# Season 1: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s1_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S01",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.87 µs


100%|██████████| 24/24 [26:31<00:00, 66.33s/it]


### Season 2


In [11]:
# Season 2: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s2_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S02",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

100%|██████████| 23/23 [27:01<00:00, 70.52s/it]


### Season 3

In [13]:
# Season 3: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s3_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S03",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

100%|██████████| 25/25 [23:21<00:00, 56.07s/it]


### Season 4    


In [15]:
# Season 4: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s4_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S04",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

100%|██████████| 23/23 [25:18<00:00, 66.00s/it]


### Season 5    


In [17]:
# Season 5: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s5_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S05",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

100%|██████████| 23/23 [24:41<00:00, 64.41s/it] 


### Season 6

In [18]:
# Season 6: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s6_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S06",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

100%|██████████| 23/23 [22:53<00:00, 59.73s/it]


### Season 7

In [20]:
# Season 7: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s7_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S07",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

100%|██████████| 24/24 [15:10<00:00, 37.96s/it]


### Season 8


In [21]:
# Season 8: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s8_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S08",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

100%|██████████| 23/23 [18:38<00:00, 48.63s/it]


### Season 9

In [22]:
# Season 9: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s9_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S09",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

100%|██████████| 23/23 [21:57<00:00, 57.27s/it]


### Season 10

In [23]:
# Season 10: send every episode to gpt-4-turbo (temperature 0, no frequency penalty)
s10_data_gpt4turbo = gpt_generate_data(
    client=client,
    model="gpt-4-turbo",
    screenplay_byseason=friends_screenplay_by_season,
    season="S10",
    user_prompt=USER_PROMPT,
    system_prompt=SYSTEM_PROMPT,
    max_tokens=4096,
    temperature=0,
    frequency_penalty=0.0,
)

100%|██████████| 17/17 [16:57<00:00, 59.84s/it]


## combine all seasons

In [74]:
def _parse_generated_list(output_string: str) -> list:
    """Parse the bracketed list of dicts embedded in one raw model response."""
    import ast  # local import: `ast` is not among the notebook's top-level imports
    import json

    start = output_string.find('[')
    end = output_string.rfind(']')
    if start == -1 or end < start:
        raise ValueError("no [...] list found in the generated text")
    list_text = output_string[start:end + 1]  # +1 to include the closing bracket

    # SECURITY: the text comes from a language model and must never be passed to
    # eval(). literal_eval only accepts Python literals; json.loads is the
    # fallback for responses written as JSON (true/false/null).
    try:
        parsed = ast.literal_eval(list_text)
    except (ValueError, SyntaxError, TypeError):
        parsed = json.loads(list_text)

    if not isinstance(parsed, list) or not all(isinstance(item, dict) for item in parsed):
        raise TypeError("generated text is not a list of dicts")
    return parsed


def process_generated_str(generted_data: list):
    """
    Turn the raw model output of one season into a flat list of records.

    Each episode's "generated_result" string is searched for the outermost
    ``[...]`` span, which is parsed as a list of dicts. Every dict gets an
    "episode" key holding the episode id. Episodes whose output cannot be
    parsed are reported on stdout and skipped.

    Args:
        generted_data: the generated data list for a specific season; each entry
            is a dict with the keys "id" and "generated_result"

    Returns:
        the processed data: list of dict
    """
    processed_data = []
    for episode in generted_data:
        try:
            data_per_episode = _parse_generated_list(episode['generated_result'])
            # build the tagged records first so a failing episode contributes nothing
            processed_data += [{**data, "episode": episode['id']} for data in data_per_episode]
        except Exception as exc:  # not a bare except: KeyboardInterrupt must still propagate
            print(f"Error processing episode {episode['id']}: {exc!r}")
    return processed_data
   

In [78]:
## parse the raw output of every season and chain the records in season order,
# then write them to chandler_bing_sarcasm_analysis_all_seasons.jsonl (one JSON object per line)
all_seasons_data_gpt4turbo = [
    record
    for season_data in (
        s1_data_gpt4turbo,
        s2_data_gpt4turbo,
        s3_data_gpt4turbo,
        s4_data_gpt4turbo,
        s5_data_gpt4turbo,
        s6_data_gpt4turbo,
        s7_data_gpt4turbo,
        s8_data_gpt4turbo,
        s9_data_gpt4turbo,
        s10_data_gpt4turbo,
    )
    for record in process_generated_str(season_data)
]

with open("chandler_bing_sarcasm_analysis_all_seasons.jsonl", "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in all_seasons_data_gpt4turbo)


Error processing episode S09E13


In [None]:
# ## to read the all_seasons_data_gpt4turbo.jsonl file
# all_seasons_data_gpt4turbo = []
# with open("chandler_bing_sarcasm_analysis_all_seasons.jsonl", "r") as f:
#     for line in f:
#         all_seasons_data_gpt4turbo.append(json.loads(line))

In [85]:
# share of the extracted Chandler lines that were labelled as sarcastic
sarcasm = [data['Sarcasm'] for data in all_seasons_data_gpt4turbo]
sarcasm_count = sum(sarcasm)
total_count = len(sarcasm)
# guard against an empty dataset (e.g. every episode failed to parse), which
# would otherwise raise ZeroDivisionError; the value is a fraction in [0, 1]
sarcasm_percentage = sarcasm_count / total_count if total_count else 0.0
print(f"Total number of sarcastic comments: {sarcasm_count}")
print(f"Total number of comments: {total_count}")
print(f"Percentage of sarcastic comments: {sarcasm_percentage}")


Total number of sarcastic comments: 1162
Total number of comments: 1971
Percentage of sarcastic comments: 0.5895484525621512
