In [None]:
import openai
import pandas as pd
from tqdm.notebook import tqdm
import os
import sys
import warnings

# Suppress every warning category for the whole process (keeps notebook output quiet).
warnings.filterwarnings("ignore")

# SECURITY: never commit a secret key to source control. The key is read from
# the OPENAI_API_KEY environment variable instead; any key that was previously
# hard-coded in this file must be treated as leaked and revoked.
# API_KEY is None when the variable is unset.
API_KEY = os.environ.get('OPENAI_API_KEY')
openai.api_key = API_KEY
# Chat model used for every classification request.
model_id = 'gpt-3.5-turbo'

def classify_text(prompt, max_tokens=2000):
    """Send ``prompt`` to the chat model and return its reply as a label.

    The request carries a fixed system message that defines homophobia,
    biphobia and transphobia and instructs the model to answer with a single
    label, followed by ``prompt`` as the user message.

    Args:
        prompt: Text of the user message to classify.
        max_tokens: Value passed as ``max_tokens`` in the completion request.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    # Adjacent literals are concatenated into one system message.
    system_instructions = (
        "The homophobia definition is the fear, hatred, discomfort with, or mistrust of people who are lesbian, gay, or bisexual. "
        "Biphobia is fear, hatred, discomfort, or mistrust, specifically of people who are bisexual. "
        "Similarly, transphobia is fear, hatred, discomfort with, or mistrust of people who are transgender, genderqueer. "
        "You are a text annotation classifier of text as homophobic, transphobic, anti-lgbqt or neither. "
        "Just label the data, it must be homophobic, transphobic, anti-lgbqt or neither (only 1 word)"
    )
    conversation = [
        {"role": "system", "content": system_instructions},
        {"role": "user", "content": prompt},
    ]

    completion = openai.ChatCompletion.create(
        model=model_id,
        messages=conversation,
        max_tokens=max_tokens,
        n=1,
        temperature=0.7,
    )

    # Only one choice is requested (n=1), so the first one is the answer.
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def process_sentences(data, output_file='labeled_sentences.csv', batch_size=1, input_file='updated_input.csv'):
    """Label every sentence in ``data['text']`` and checkpoint progress to CSV.

    Sentences are processed in batches of ``batch_size`` rows. After each
    batch the labels are merged into ``output_file`` (one row per distinct
    text, the earliest label wins) and the rows still waiting are written to
    ``input_file``, so an interrupted run can be resumed by passing the
    contents of ``input_file`` back in as ``data``.

    Args:
        data: DataFrame with a ``text`` column. It is not modified.
        output_file: CSV with ``text`` and ``label`` columns; created with a
            header row if it does not exist, otherwise extended.
        batch_size: Number of pending rows taken per batch; must be >= 1.
        input_file: CSV that receives the not-yet-processed rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the loop could never
            make progress).
        KeyError: If ``data`` has no ``text`` column.
        AssertionError: If the files on disk do not match the expected row
            counts once processing has finished.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if 'text' not in data.columns:
        raise KeyError("input data must contain a 'text' column")

    if not os.path.exists(output_file):
        pd.DataFrame(columns=['text', 'label']).to_csv(output_file, index=False)

    # Work on a copy so the caller's DataFrame is left untouched.
    pending = data.copy()

    # Rows without text cannot be classified, and NaN never compares equal to
    # itself, so they could never be matched for removal from the queue.
    missing_text = pending['text'].isna()
    if missing_text.any():
        print(f"Skipping {int(missing_text.sum())} row(s) with missing text.")
        pending = pending[~missing_text].reset_index(drop=True)

    # Checkpoint the queue up front so the file exists even when there is
    # nothing to process or the very first request fails.
    pending.to_csv(input_file, index=False)
    expected_saved_rows = len(pd.read_csv(output_file))

    while not pending.empty:
        # Each distinct text in the batch is sent to the model only once.
        batch_texts = pending['text'].head(batch_size).drop_duplicates()

        labeled_sentences = []
        for sentence in tqdm(batch_texts, file=sys.stdout):
            print(f"Processing sentence: {sentence}")
            prompt = f"Classify the following sentence: \"{sentence}\""
            label = classify_text(prompt, max_tokens=200)
            labeled_sentences.append({'text': sentence, 'label': label})

        # Persist the labels BEFORE shrinking the queue on disk: a crash in
        # between then costs at most a repeated batch, never a lost label.
        df_existing = pd.read_csv(output_file)
        df_new = pd.DataFrame(labeled_sentences)
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        df_combined = df_combined.drop_duplicates(subset=['text']).reset_index(drop=True)
        df_combined.to_csv(output_file, index=False)
        expected_saved_rows = len(df_combined)

        # Drop every pending row (including later duplicates) whose text was
        # just labeled, then checkpoint what is left.
        pending = pending[~pending['text'].isin(batch_texts)].reset_index(drop=True)
        pending.to_csv(input_file, index=False)

    # Verify the files on disk against the in-memory bookkeeping. The queue
    # must be empty and the output must hold exactly the merged rows; this
    # holds for resumed runs and inputs with duplicate texts as well.
    remaining_row_count = len(pd.read_csv(input_file))
    saved_output_row_count = len(pd.read_csv(output_file))

    # Explicit raise instead of `assert` so the check survives `python -O`.
    if remaining_row_count != len(pending) or saved_output_row_count != expected_saved_rows:
        raise AssertionError("Row count mismatch detected.")

def main(input_dataframe, output_file='labeled_sentences.csv', batch_size=1):
    """Run the labeling pipeline on ``input_dataframe`` with progress messages.

    Args:
        input_dataframe: DataFrame handed to ``process_sentences``.
        output_file: Forwarded as the labeled-output CSV path.
        batch_size: Forwarded as the number of rows handled per batch.
    """
    print("Starting to process sentences...")
    process_sentences(
        input_dataframe,
        batch_size=batch_size,
        output_file=output_file,
    )
    print("Finished processing sentences.")

if __name__ == "__main__":
    # 'updated_input.csv' is the checkpoint of still-unlabeled rows written by
    # process_sentences; when it exists it takes precedence over the original
    # 'sentences.csv'.
    input_file = 'updated_input.csv'

    if not os.path.exists(input_file):
        # Fresh start: load the full data set and turn the row index into a
        # regular column.
        input_dataframe = pd.read_csv('sentences.csv')
        input_dataframe.reset_index(inplace=True)
    else:
        # Resume from the rows left over by a previous run.
        input_dataframe = pd.read_csv(input_file)

    try:
        main(input_dataframe)
        print("Sentences have been labeled and saved to labeled_sentences.csv.")
    except Exception as e:
        # Top-level boundary: report the failure and stop without a traceback.
        print(f"Error occurred: {e}. Process stopped.")