# Baseline and Tree of Thoughts prompt classification

In [2]:
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_mistralai import ChatMistralAI
from langchain_ollama.llms import OllamaLLM
from langchain_openai import ChatOpenAI
from langchain.load.dump import dumps
from pydantic import BaseModel, Field
from pydantic.config import ConfigDict
from typing import Annotated, Literal
from enum import Enum
from typing import Literal
import pandas as pd
import json
import os
import time
from dotenv import load_dotenv

In [2]:
# Load environment variables via python-dotenv; the API keys are read later with os.getenv
load_dotenv()

True

### LLM

In [4]:
# Active backend. `model_name` is also the prefix of every result file name,
# so it must be kept in sync with the model configured here.
model_name = "mistral"
model = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0.0,
    api_key=os.getenv("MISTRAL_API_KEY"),
)

In [38]:
# ChatGPT API
# model_name = 'chatgpt'
# model = ChatOpenAI(
#     model = 'gpt-4o-mini',
#     openai_api_key = os.getenv('OPENAI_API_KEY'), 
#     temperature = 0.
# )

In [13]:
# Ollama local
# model_name = 'qwen'
# model = OllamaLLM(model="qwen2.5")

In [4]:
# Qwen API
# model_name = 'qwen'
# model = ChatOpenAI(
#     model = 'qwen2.5-72b-instruct',
#     openai_api_key = os.getenv('DASHSCOPE_API_KEY'), 
#     openai_api_base = 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1',
#     max_tokens = 1024,
#     temperature = 0.
# )

In [5]:
# model_name = 'deepseek'
# model = ChatOpenAI(
#     model = 'deepseek-chat', 
#     openai_api_key= os.getenv('DEEPSEEK_API_KEY'), 
#     openai_api_base = 'https://api.deepseek.com',
#     max_tokens = 1024,
#     temperature = 0.
# )

In [5]:
# Smoke test: send a trivial prompt to confirm the configured model answers
model.invoke("hello")

AIMessage(content="Hello! How can I assist you today? Let's have a friendly conversation. 😊 How are you doing?", additional_kwargs={}, response_metadata={'token_usage': {'prompt_tokens': 5, 'total_tokens': 29, 'completion_tokens': 24}, 'model': 'mistral-large-latest', 'finish_reason': 'stop'}, id='run-c6948115-a1ec-4cd6-8270-0fd54db1f2c2-0', usage_metadata={'input_tokens': 5, 'output_tokens': 24, 'total_tokens': 29})

### Settings

In [None]:
# True: quick test run on only 2 examples per dataset
debug_mode = True

In [6]:
wait_time = 1.1 # seconds slept before each model call; set it when the API has rate limits, otherwise leave it at 0

In [17]:
# Relative folders used below: result CSVs are written to the first,
# prompt text files are read from the second, input datasets from the third.
result_folder = "./results/"
prompt_folder = "./prompts/"
data_folder = "./data/"

In [8]:
# Output CSVs of the baseline prompt, one per stage, prefixed with the active model name
result_baseline_stg1_file = f"{result_folder}{model_name}_result_baseline_stg1.csv"
result_baseline_stg2_file = f"{result_folder}{model_name}_result_baseline_stg2.csv"
result_baseline_stg3_file = f"{result_folder}{model_name}_result_baseline_stg3.csv"

In [9]:
# Output CSVs of the Tree-of-Thoughts prompt, one per stage, prefixed with the active model name
result_prompt_tot_stg1_file = f"{result_folder}{model_name}_result_prompt_tot_stg1.csv"
result_prompt_tot_stg2_file = f"{result_folder}{model_name}_result_prompt_tot_stg2.csv"
result_prompt_tot_stg3_file = f"{result_folder}{model_name}_result_prompt_tot_stg3.csv"

In [10]:
# Read the Tree-of-Thoughts instructions that are later substituted into the stage prompts.
# The encoding is explicit so the text decodes the same way on every platform
# instead of depending on the locale default.
with open(prompt_folder + 'prompt_tot.txt', encoding="utf-8") as f:
    tree_of_thoughts = f.read()

In [11]:
# Show the loaded Tree-of-Thoughts text so it can be checked before use
print(tree_of_thoughts)

Follow this procedure:

Imagine three different experts are answering this question.
They will brainstorm the answer step by step reasoning carefully and taking all facts into consideration.
All experts will write down 1 step of their thinking, then share it with the group.
They will each critique their response, and the all the responses of others.
They will check their answer on based on the nature of the language and intent.
Then all experts will go on to the next step and write down this step of their thinking.
They will keep going through steps until they reach their conclusion taking into account the thoughts of the other experts.
If at any time they realise that there uncertainty in their logic they will backtrack to where that uncertainty occurred. 
If any expert realises they're wrong at any point then they acknowledges this and start another train of thought.
Each expert will assign a likelihood of their current assertion being correct.
Continue until the experts agree on the

In [12]:
def get_prompt(base_prompt, parser):
    """Build the prompt template shared by every classification stage.

    The template has three newline-separated parts: the stage instructions,
    the parser's output-format instructions, and the text to classify wrapped
    in single quotes.

    Args:
        base_prompt: Stage instructions placed at the top of the template.
        parser: Output parser; the text returned by its
            ``get_format_instructions()`` is bound to the
            ``format_instructions`` placeholder when the template is built.

    Returns:
        A ``PromptTemplate`` whose only remaining input variable is ``post``.
    """
    # The format-instructions placeholder stands alone on its line: a quote
    # character after it would leak an unmatched "'" into every prompt.
    template = "\n".join([base_prompt, "{format_instructions}", "Text : '{post}'"])

    return PromptTemplate(
        template=template,
        input_variables=["post"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

## Classification stage 1: High Level Categorization

In [13]:
class HateClassification(BaseModel):
    # Structured answer expected from the model in stage 1.
    # Plain comments are used (no docstring) so the schema derived from this
    # class, which feeds the parser's format instructions, is left untouched.

    # Binary label: implicit hate speech or not
    hate_class: Literal['implicit_hate', 'not_hate']
    # Confidence value reported by the model (no range is enforced here)
    confidence: float
    # Free-text justification returned by the model
    explanation: str
    

In [14]:
def classify_stage_1(model, df_hate, base_prompt, result_file, initialize = False):
    """Run the stage-1 classification (implicit hate vs. not hate) over a dataset.

    Every post is sent through ``prompt | model | parser`` and the parsed
    answer is appended to ``result_file`` right away, so rows already written
    survive an interrupted run.

    Args:
        model: Runnable placed between the prompt and the output parser.
        df_hate: DataFrame providing the ``post_id`` and ``post`` columns.
        base_prompt: Stage-1 instructions handed to ``get_prompt``.
        result_file: Path of the CSV that receives one row per classified post.
        initialize: If True, ``result_file`` is overwritten with an empty table
            and every post is processed. If False, posts whose ``post_id`` is
            already present in ``result_file`` are skipped; when the file does
            not exist yet it is created and every post is processed.

    Returns:
        None. Results are written to ``result_file``.

    Notes:
        Sleeps for the notebook-level ``wait_time`` seconds before each call.
        A failure on a single post is reported and the loop moves on;
        ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted, so the
        run can still be stopped manually.
    """
    columns = ['post_id', 'class', 'confidence', 'explanation']

    if initialize or not os.path.exists(result_file):
        # (Re)create the output file containing only the header row
        pd.DataFrame(columns=columns).to_csv(result_file, encoding="utf-8", index=False)
    else:
        # Resume: drop the posts that already have a row in the output file
        df_elab = pd.read_csv(result_file, encoding="utf-8")
        df_hate = df_hate[~df_hate['post_id'].isin(df_elab['post_id'])]

    parser = PydanticOutputParser(pydantic_object=HateClassification)
    chain = get_prompt(base_prompt, parser) | model | parser

    for _, row in df_hate.iterrows():
        try:
            time.sleep(wait_time)
            result = chain.invoke({"post": row['post']})

            new_row = {
                'post_id': row['post_id'],
                'class': result.hate_class,
                'confidence': result.confidence,
                'explanation': result.explanation
            }
            # Append immediately (no header) so progress is never lost
            pd.DataFrame([new_row], columns=columns).to_csv(
                result_file, encoding="utf-8", mode='a', index=False, header=False
            )
        except Exception as exc:
            # Report which post failed and why, then continue with the next one
            print("An exception occurred " + str(row['post_id']) + ": " + repr(exc))


In [15]:
# Read the stage-1 prompt. The encoding is explicit so the text decodes the
# same way on every platform instead of depending on the locale default.
with open(prompt_folder + 'prompt_stg1.txt', encoding="utf-8") as f:
    prompt_stg1 = f.read()

In [16]:
# Show the loaded stage-1 prompt for inspection
print(prompt_stg1)

Your task is to decide whether the following text can be classified an implicit hate speech or not implicit hate speech.

Consider the following definition of hate speech:

Hate speech is content that targets individuals or groups with abuse based on their perceived membership in protected categories, including but not limited to race, ethnicity, national origin, caste, sexual orientation, gender, gender identity, religious affiliation, age, disability, or serious disease. 
Specifically, hate speech can contain:

- Hateful References: Content referencing forms of violence or violent events where a protected category was the primary target, intended to harass (e.g., genocides like the Holocaust, lynchings).
- Incitement: Content inciting fear, spreading fearful stereotypes, or encouraging others to harass or discriminate against members of protected categories (e.g., asserting that members of a religious group are terrorists, urging others to harass individuals wearing religious symbols

### Test dataset

In [19]:
# Stage-1 test set; classify_stage_1 reads its 'post_id' and 'post' columns
df_hate_stg1 = pd.read_csv(data_folder + "implicit_hate_test_stg1.csv", encoding="utf-8")

In [20]:
# Debug run: keep only the first two rows
if debug_mode:
    df_hate_stg1 = df_hate_stg1[0:2]

In [21]:
# Full (non-debug) deepseek runs are limited to the first 500 rows
if model_name == 'deepseek' and not debug_mode:
    df_hate_stg1 = df_hate_stg1[0:500]

In [22]:
# Number of rows that will be classified in stage 1
len(df_hate_stg1)

2

### Baseline with simple prompt

In [23]:
# Baseline: replace the {tree_of_thoughts} placeholder with an empty string.
# The final True is `initialize`, so the result file is regenerated from scratch.
prompt_baseline = prompt_stg1.replace("{tree_of_thoughts}", "")
classify_stage_1(model, df_hate_stg1, prompt_baseline, result_baseline_stg1_file, True)

### Prompt with tree of thoughts

In [24]:
# Tree of Thoughts: substitute the loaded instructions for the {tree_of_thoughts} placeholder.
# The final True is `initialize`, so the result file is regenerated from scratch.
prompt_tot = prompt_stg1.replace("{tree_of_thoughts}", tree_of_thoughts)
classify_stage_1(model, df_hate_stg1, prompt_tot, result_prompt_tot_stg1_file, True)

## Classification Stage 2: Fine-Grained Implicit Hate

In [25]:
class ImplicitHateClassification(BaseModel):
    # Structured answer expected from the model in stage 2.
    # Plain comments are used (no docstring) so the schema derived from this
    # class, which feeds the parser's format instructions, is left untouched.

    # Fine-grained implicit-hate label; every allowed value is listed exactly once
    implicit_class: Literal['white_grievance', 'irony', 'stereotypical', 'incitement', 'other', 'threatening', 'inferiority']
    # Confidence value reported by the model (no range is enforced here)
    confidence: float
    # Free-text justification returned by the model
    explanation: str


In [26]:
def classify_stage_2(model, df_hate, base_prompt, result_file, initialize = False):
    """Run the stage-2 classification (fine-grained implicit-hate class) over a dataset.

    Every post is sent through ``prompt | model | parser`` and the parsed
    answer is appended to ``result_file`` right away, so rows already written
    survive an interrupted run.

    Args:
        model: Runnable placed between the prompt and the output parser.
        df_hate: DataFrame providing the ``post_id`` and ``post`` columns.
        base_prompt: Stage-2 instructions handed to ``get_prompt``.
        result_file: Path of the CSV that receives one row per classified post.
        initialize: If True, ``result_file`` is overwritten with an empty table
            and every post is processed. If False, posts whose ``post_id`` is
            already present in ``result_file`` are skipped; when the file does
            not exist yet it is created and every post is processed.

    Returns:
        None. Results are written to ``result_file``.

    Notes:
        Sleeps for the notebook-level ``wait_time`` seconds before each call.
        A failure on a single post is reported and the loop moves on;
        ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted, so the
        run can still be stopped manually.
    """
    columns = ['post_id', 'implicit_class', 'confidence', 'explanation']

    if initialize or not os.path.exists(result_file):
        # (Re)create the output file containing only the header row
        pd.DataFrame(columns=columns).to_csv(result_file, encoding="utf-8", index=False)
    else:
        # Resume: drop the posts that already have a row in the output file
        df_elab = pd.read_csv(result_file, encoding="utf-8")
        df_hate = df_hate[~df_hate['post_id'].isin(df_elab['post_id'])]

    parser = PydanticOutputParser(pydantic_object=ImplicitHateClassification)
    chain = get_prompt(base_prompt, parser) | model | parser

    for _, row in df_hate.iterrows():
        try:
            time.sleep(wait_time)
            result = chain.invoke({"post": row['post']})

            new_row = {
                'post_id': row['post_id'],
                'implicit_class': result.implicit_class,
                'confidence': result.confidence,
                'explanation': result.explanation
            }
            # Append immediately (no header) so progress is never lost
            pd.DataFrame([new_row], columns=columns).to_csv(
                result_file, encoding="utf-8", mode='a', index=False, header=False
            )
        except Exception as exc:
            # Report which post failed and why, then continue with the next one
            print("An exception occurred " + str(row['post_id']) + ": " + repr(exc))

In [27]:
# Read the stage-2 prompt. The encoding is explicit so non-ASCII characters
# (such as dashes) are decoded as UTF-8 rather than with the locale default.
with open(prompt_folder + 'prompt_stg2.txt', encoding="utf-8") as f:
    prompt_stg2 = f.read()

In [28]:
# Show the loaded stage-2 prompt for inspection
print(prompt_stg2)

The following text has been categorized as "implicit hate speech", a more complex and subtle attack on a person or a group of people based on their protected characteristics without relying on explicit keywords.

Evaluate the text against the following taxonomy to determine the most likely class.
Class taxonomy of implicit hate speech:

1. White Grievance: includes frustration over a minority group's perceived privilege and casting majority groups as the real victims of racism. This language is linked to extremist behavior and support for violence. An example is 'Black lives matter and white lives don't? Sounds racist.'
2. Incitement to Violence: includes flaunting ingroup unity and power or elevating known hate groups and ideologies. Phrases like 'white brotherhood' operate in the former manner, while statements like 'Hitler was Germany â€“ Germans shall rise again!' operate in the latter, elevating nationalism and Nazism. Article 20 of the UN International Covenant on Civil and Polit

In [29]:
# Stage-2 test set; classify_stage_2 reads its 'post_id' and 'post' columns
df_hate_stg2 = pd.read_csv(data_folder + "implicit_hate_test_stg2.csv", encoding="utf-8")

In [30]:
# Debug run: keep only the first two rows
if debug_mode:
    df_hate_stg2 = df_hate_stg2[0:2]

In [31]:
# Full (non-debug) deepseek runs use a random subset of at most 500 rows.
# The seed is fixed so re-running the notebook selects the same posts, and the
# size is capped at the frame length so a smaller dataset does not raise.
if model_name == 'deepseek' and not debug_mode:
    df_hate_stg2 = df_hate_stg2.sample(n=min(500, len(df_hate_stg2)), random_state=42)

### Baseline with simple prompt

In [32]:
# Baseline: replace the {tree_of_thoughts} placeholder with an empty string.
# The final True is `initialize`, so the result file is regenerated from scratch.
prompt_baseline = prompt_stg2.replace("{tree_of_thoughts}", "")
classify_stage_2(model, df_hate_stg2, prompt_baseline, result_baseline_stg2_file, True)

### Prompt with tree of thoughts

In [33]:
# Tree of Thoughts: substitute the loaded instructions for the {tree_of_thoughts} placeholder.
# The final True is `initialize`, so the result file is regenerated from scratch.
prompt_tot = prompt_stg2.replace("{tree_of_thoughts}", tree_of_thoughts)
classify_stage_2(model, df_hate_stg2, prompt_tot, result_prompt_tot_stg2_file, True)

## Classification Stage 3: Hate Targets and Implied Statement

In [34]:
class ImpliedMeaning(BaseModel):
    # Structured answer expected from the model in stage 3.
    # Plain comments are used (no docstring) so the schema derived from this
    # class, which feeds the parser's format instructions, is left untouched.

    # Group the post is aimed at; stored in the 'target' column of the results
    targeted_group: str
    # Statement implied about that group; stored in the 'implied_statement' column
    implied_statement: str

In [35]:
def classify_stage_3(model, df_hate, base_prompt, result_file, initialize = False):
    """Run stage 3 (targeted group and implied statement) over a dataset.

    Every post is sent through ``prompt | model | parser`` and the parsed
    answer is appended to ``result_file`` right away, so rows already written
    survive an interrupted run.

    Args:
        model: Runnable placed between the prompt and the output parser.
        df_hate: DataFrame providing the ``post_id`` and ``post`` columns.
        base_prompt: Stage-3 instructions handed to ``get_prompt``.
        result_file: Path of the CSV that receives one row per processed post.
        initialize: If True, ``result_file`` is overwritten with an empty table
            and every post is processed. If False, posts whose ``post_id`` is
            already present in ``result_file`` are skipped; when the file does
            not exist yet it is created and every post is processed.

    Returns:
        None. Results are written to ``result_file``.

    Notes:
        Sleeps for the notebook-level ``wait_time`` seconds before each call.
        A failure on a single post is reported and the loop moves on;
        ``KeyboardInterrupt`` and ``SystemExit`` are not intercepted, so the
        run can still be stopped manually.
    """
    columns = ['post_id', 'target', 'implied_statement']

    if initialize or not os.path.exists(result_file):
        # (Re)create the output file containing only the header row
        pd.DataFrame(columns=columns).to_csv(result_file, encoding="utf-8", index=False)
    else:
        # Resume: drop the posts that already have a row in the output file
        df_elab = pd.read_csv(result_file, encoding="utf-8")
        df_hate = df_hate[~df_hate['post_id'].isin(df_elab['post_id'])]

    parser = PydanticOutputParser(pydantic_object=ImpliedMeaning)
    chain = get_prompt(base_prompt, parser) | model | parser

    for _, row in df_hate.iterrows():
        try:
            time.sleep(wait_time)
            result = chain.invoke({"post": row['post']})

            # Stage 3 output: targeted group and implied statement
            new_row = {
                'post_id': row['post_id'],
                'target': result.targeted_group,
                'implied_statement': result.implied_statement
            }
            # Append immediately (no header) so progress is never lost
            pd.DataFrame([new_row], columns=columns).to_csv(
                result_file, encoding="utf-8", mode='a', index=False, header=False
            )
        except Exception as exc:
            # Report which post failed and why, then continue with the next one
            print("An exception occurred " + str(row['post_id']) + ": " + repr(exc))

In [36]:
# Read the stage-3 prompt. The encoding is explicit so the text decodes the
# same way on every platform instead of depending on the locale default.
with open(prompt_folder + 'prompt_stg3.txt', encoding="utf-8") as f:
    prompt_stg3 = f.read()

In [37]:
# Show the loaded stage-3 prompt for inspection
print(prompt_stg3)

The following text has been categorized as "implicit hate speech", a more complex and subtle attack on a person or a group of people based on their protected characteristics without relying on explicit keywords.

Your job is to infer both the targeted group (GROUP) and the stereotype, characteristic, or implication about that group expressed in the text. 
You have two tasks:

Task 1: Identify the targeted group (GROUP).
The group might be defined by characteristics such as ethnicity, religion, class, sexual orientation, immigration status, or similar traits. Examples of groups include:

-Black folks
-Asian folks
-Muslims
-Jews
-Latino/Latina folks
-Immigrants


Task 2: Determine what stereotype, characteristic, or action is implied about the identified group.
Using the group identified in Task 1, describe the implication using a simple phrase. Avoid copying text directly from the text. Examples of such phrases include:

-[GROUP] do/does [action]
-[GROUP] are [characteristic]
-[GROUP] k

### Test dataset

In [38]:
# Stage-3 test set
df_hate_stg3 = pd.read_csv(data_folder + "implicit_hate_test_stg3.csv", encoding="utf-8")
# Keep only the id and text columns and drop repeated (post_id, post) pairs
df_hate_stg3 = df_hate_stg3[['post_id', 'post']].drop_duplicates()

In [39]:
# Debug run: keep only the first two rows
if debug_mode:
    df_hate_stg3 = df_hate_stg3[0:2]

### Baseline with simple prompt

In [40]:
# Baseline: replace the {tree_of_thoughts} placeholder with an empty string.
# The final True is `initialize`, so the result file is regenerated from scratch.
prompt_baseline = prompt_stg3.replace("{tree_of_thoughts}", "")
classify_stage_3(model, df_hate_stg3, prompt_baseline, result_baseline_stg3_file, True)

### Prompt with tree of thoughts

In [41]:
# Tree of Thoughts: substitute the loaded instructions for the {tree_of_thoughts} placeholder.
# The final True is `initialize`, so the result file is regenerated from scratch.
prompt_tot = prompt_stg3.replace("{tree_of_thoughts}", tree_of_thoughts)
classify_stage_3(model, df_hate_stg3, prompt_tot, result_prompt_tot_stg3_file, True)