In [None]:
import os
import re
import sys
import time

import numpy as np
import openai
import pandas as pd
import tiktoken
from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Initialize OpenAI API
# Read the key from the OPENAI_API_KEY environment variable so the real secret
# never has to be written into (or committed with) the notebook. The original
# placeholder is kept as the fallback so manual editing still works.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'add-your-key')

In [None]:
# Choose the input file: gender is active; swap the comment to run the age data.
ads_path = '/Data/target_gender_ads.csv'  # gender
#ads_path = '/Data/target_age_ads.csv'  # age

df = pd.read_csv(ads_path, index_col=False)

# Keep only columns whose name does not start with "Unnamed", then renumber rows.
named_columns = ~df.columns.str.contains('^Unnamed')
df = df.loc[:, named_columns].reset_index(drop=True)
print(df.columns, df.shape)

In [None]:
# Encoding used to count tokens in each prompt before it is sent.
TOKENIZER_MODEL = "o1-preview"
tokenizer = tiktoken.encoding_for_model(TOKENIZER_MODEL)

In [None]:
# Initialize an empty list to store the assistant's replies
assistant_replies = []


def make_api_request(prompt, max_retries=None, wait_seconds=60):
    """Send ``prompt`` as a single user message and return the reply text.

    On a rate-limit error the call waits and tries again. Retrying is done in
    a loop rather than by recursion, so a long throttling period cannot hit
    Python's recursion limit.

    Parameters
    ----------
    prompt : str
        Text placed in the user message.
    max_retries : int or None, optional
        Maximum number of retries after rate-limit errors. ``None`` (the
        default) retries indefinitely, matching the previous behaviour.
    wait_seconds : int or float, optional
        Pause between attempts after a rate-limit error (default 60).

    Returns
    -------
    The ``content`` field of the first choice's message.

    Raises
    ------
    openai.error.RateLimitError
        Only when ``max_retries`` is set and has been exhausted.
    """
    failed_attempts = 0
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="o1-preview-2024-09-12",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt,
                            }
                        ],
                    },
                ],
            )
            return response.choices[0].message["content"]
        except openai.error.RateLimitError:
            failed_attempts += 1
            if max_retries is not None and failed_attempts > max_retries:
                # Retry budget exhausted: surface the error to the caller.
                raise
            print(f"Rate limit exceeded. Waiting for {wait_seconds} seconds...")
            time.sleep(wait_seconds)


In [14]:
def remove_links(text):
    """Return ``text`` with every substring matching the URL pattern deleted."""
    # "http://" or "https://" followed by a run of characters from the
    # allowed classes (letters, digits, punctuation ranges, %XX escapes).
    link_regex = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    return link_regex.sub('', text)

def remove_line_breaks_and_links(text):
    """Normalise newlines in ``text`` and then strip links from it.

    Runs of two or more newlines are collapsed to one, whitespace directly
    after a newline is dropped, and the result is passed to ``remove_links``.
    """
    cleanup_steps = (
        (r'\n{2,}', '\n'),  # collapse consecutive line breaks
        (r'\n\s+', '\n'),   # drop leading indentation after a line break
    )
    for pattern, replacement in cleanup_steps:
        text = re.sub(pattern, replacement, text)

    return remove_links(text)

In [None]:
# Iterate through the DataFrame and send each ad as a user message.
#
# Token budget for a whole prompt (ad text + instruction): prompts whose token
# count plus the headroom reaches the cap get their AD TEXT shortened. The
# instruction is always appended in full, so the model still sees the question.
MAX_PROMPT_TOKENS = 4000
TOKEN_HEADROOM = 40

#gender {uncomment for gender}
instruction = "\n Report who can be targeted based on gender: male vs female by the provided ad. Choose the most probable option from the gender (either male or female). Don't generate neutral or both gender answers. Provide a brief explanation with reasoning in 20 words. \n Gender: ? \n Explanation: ?"
# age group {uncomment for age}
#instruction = "\n Report who can be targeted based on the following four age groups: Young adults (18-24), Early working age (25-44), Late working age (45-64), and Senior citizens (65+) by the provided ad. Choose the most probable option from the age groups. Don't generate neutral or multiple age group answers. Provide a brief explanation with the reasoning in 20 words.  \n Age group: ? \n Explanation: ?"

# Tokens consumed by everything except the ad text itself.
fixed_token_count = len(tokenizer.encode("Ad: " + instruction))

# Start from an empty list so re-running this cell cannot leave stale replies
# behind (which would make the list longer than the DataFrame).
assistant_replies = []

for index, row in df.iterrows():
    text = row['ad_creative_body']
    if not isinstance(text, str):
        # Missing bodies are read as NaN; treat them as empty text instead of
        # letting the regex helpers fail on a non-string.
        text = "" if pd.isna(text) else str(text)
    cleaned_text = remove_line_breaks_and_links(text).strip()

    user_message = "Ad: " + cleaned_text + instruction
    token_count = len(tokenizer.encode(user_message))
    #print("Number of tokens:", token_count)

    if (token_count + TOKEN_HEADROOM) >= MAX_PROMPT_TOKENS:
        # Shorten the ad by tokens (not characters); one spare token absorbs
        # small differences when the pieces are re-joined and re-encoded.
        ad_budget = max(MAX_PROMPT_TOKENS - TOKEN_HEADROOM - fixed_token_count - 1, 0)
        truncated_ad = tokenizer.decode(tokenizer.encode(cleaned_text)[:ad_budget])
        user_message = "Ad: " + truncated_ad + instruction

    # Call the function to make the API request and handle rate limits
    assistant_replies.append(make_api_request(user_message))

In [16]:
# Attach the collected replies as a new 'assistant_reply' column and show the frame
df = df.assign(assistant_reply=assistant_replies)
df


Unnamed: 0,id,ad_creative_body,funding_entity,region_distribution,demographic_distribution,impressions,spend,ad_delivery_start_time,assistant_reply
0,3006923696230883,The Hebrew University Center for Climate Scien...,American Friends of Hebrew University,"[{'percentage': '0.350935', 'region': 'Califor...","[{'percentage': '0.474886', 'age': '65+', 'gen...","{'lower_bound': '10000', 'upper_bound': '14999'}","{'lower_bound': '0', 'upper_bound': '99'}",2021-11-19,Age group: Young adults (18-24)\n\nExplanation...
1,439044277591656,Tell Rep. Schrader: Now is the time to go big ...,Oregon League of Conservation Voters (OLCV),"[{'percentage': '1', 'region': 'Oregon'}]","[{'percentage': '0.012706', 'age': '65+', 'gen...","{'lower_bound': '6000', 'upper_bound': '6999'}","{'lower_bound': '400', 'upper_bound': '499'}",2021-11-03,Age group: Early working age (25-44)\n\nExplan...
2,3024511481135625,Get clean water and pollution-free electricity...,David Stauffer,"[{'percentage': '0.02351', 'region': 'Alabama'...","[{'percentage': '1', 'age': '65+', 'gender': '...","{'lower_bound': '1000', 'upper_bound': '1999'}","{'lower_bound': '0', 'upper_bound': '99'}",2022-01-19,**Age group:** Young adults (18-24)\n\n**Expla...
3,402192854968279,Let’s make one thing clear: Pennsylvania will ...,MALCOLM PAC,"[{'percentage': '0.010189', 'region': 'Alabama...","[{'percentage': '1', 'age': '65+', 'gender': '...","{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",2021-12-06,Age group: Late working age (45-64)\n\nExplana...
4,187057706873873,More Exclusive Guest events here: https://bit...,GetSetUp Inc,"[{'percentage': '0.001942', 'region': 'Idaho'}...","[{'percentage': '0.013487', 'age': '65+', 'gen...","{'lower_bound': '0', 'upper_bound': '999'}","{'lower_bound': '0', 'upper_bound': '99'}",2021-09-26,Age group: Senior citizens (65+)\n\nExplanatio...
5,1382947215432991,"Let's keep it simple: If the recall succeeds, ...",Social Security Works,"[{'percentage': '1', 'region': 'California'}]","[{'percentage': '0.439515', 'age': '65+', 'gen...","{'lower_bound': '3000', 'upper_bound': '3999'}","{'lower_bound': '0', 'upper_bound': '99'}",2021-08-26,Age group: Senior citizens (65+)\n\nExplanatio...


In [None]:
# Write the DataFrame, including the assistant's replies, to disk as CSV
predictions_path = '/Data/gender_pred_o1_prev.csv'
df.to_csv(predictions_path)