In [1]:
import os

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline
import torch, datasets, pandas as pd
from trl import SFTTrainer
from transformers.pipelines.pt_utils import KeyDataset
from peft import LoraConfig
from openai import OpenAI

In [None]:
# System prompt for the thread-level stance task. It is sent as the "system"
# message in the GPT-4o zero-shot JSON cell. It describes the input (a post, one
# comment, up to five replies), the three stance labels, and the exact JSON
# structure the model must return. The text is part of the experiment: any
# change to it changes what the model is asked to do.
system_prompt = """You will receive JSON inputs representing discussion threads from social media during the 2016 US Presidential election. Each thread includes a post, one comment about the post, and up to five replies to the comment. Your task is to identify the stance expressed towards two politicians, Donald Trump and Hillary Clinton, in the comment and each reply. Each text may express a stance towards one, both, or none of the politicians. You will always provide a stance towards each politician separately.

Stance Options:

    Support: Positive attitude towards the politician.
    Oppose: Negative attitude towards the politician.
    Neither: No clear stance or irrelevant content.

Instructions:

    - Identify the stance for Trump and Clinton in the comment and each reply using the stance options provided.
    - Always provide a stance even if the content is offensive or ambiguous.
    - There will be between zero and five replies to each comment. If there are fewer than five replies, provide stances for the available replies only.
    
Output Format: Strictly follow this JSON format. Replace the STANCE placeholder with the actual stance. Do not add any other tokens:


{
  "comment": {
    "stanceTrump": "STANCE",
    "stanceClinton": "STANCE"
  },
  "replies": [
    {
      "reply_id": 1,
      "stanceTrump": "STANCE",
      "stanceClinton": "STANCE"
    },
    {
      "reply_id": 2,
      "stanceTrump": "STANCE",
      "stanceClinton": "STANCE"
    },
    ...
  ]
}"""

# Load thread data

In [None]:
# Read the four input tables in one pass; the order of the paths matches the
# order of the names on the left-hand side.
test_thread, test_cleaned, train_texts, test_texts = map(
    pd.read_csv,
    (
        "data/thread_test.csv",             # GPT-4o JSON zero-shot
        "data/thread_test_cleaned.csv",     # GPT-4o baseline zero-shot
        "data/thread_train_chat_texts.csv", # Llama 3 (fine-tuning)
        "data/thread_test_chat_texts.csv",  # Llama 3 (evaluation)
    ),
)

# Llama 3

In [None]:
# Model list -- candidate values for `target`:
#   "meta-llama/Meta-Llama-3-8B-Instruct"
#   "meta-llama/Meta-Llama-3-70B-Instruct"
target = "meta-llama/Meta-Llama-3-8B-Instruct"
device = "cuda"

# Permit TF32 kernels for both matmul and cuDNN operations.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

# The same bfloat16 dtype is used for compute and for quantized-weight storage.
torch_dtype = quant_storage_dtype = torch.bfloat16

# 4-bit NF4 quantization with double quantization enabled.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_quant_storage=quant_storage_dtype,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(target)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def _render_chat(example):
    """Render one row's `messages` into a single prompt string.

    The tokenizer's chat template is applied without tokenizing, and the
    generation prompt is appended so the model continues with its own turn.
    The result is returned under the new `chat_text` column.
    """
    # NOTE(review): `test_texts` is read from CSV, so `messages` may arrive as a
    # raw string rather than a list of role/content dicts -- confirm the column
    # has the structure `apply_chat_template` expects.
    rendered = tokenizer.apply_chat_template(
        example['messages'],
        tokenize=False,
        add_generation_prompt=True,
    )
    return {"chat_text": rendered}


# Training rows are passed to the trainer as-is; test rows get a rendered prompt.
train_ds = datasets.Dataset.from_pandas(train_texts)
test_ds = datasets.Dataset.from_pandas(test_texts).map(_render_chat)

## Load model

In [None]:
# Loading options for the 4-bit quantized checkpoint, placed on `device`.
load_kwargs = dict(
    quantization_config=quantization_config,
    torch_dtype=quant_storage_dtype,
    device_map=device,
    # KV cache is switched off at load time.
    # NOTE(review): presumably because gradient checkpointing is enabled for the
    # fine-tuning stage below -- confirm; it also applies to generation.
    use_cache=False,
)
model = AutoModelForCausalLM.from_pretrained(target, **load_kwargs)

## Zero-shot JSON

In [None]:
# Text-generation pipeline around the already-loaded model and tokenizer,
# used for the zero-shot run.
pipe = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

In [None]:
# Zero-shot inference: generate one completion per rendered test prompt.
# Sampling is enabled at temperature 0.1, so repeated runs are not guaranteed
# to produce identical text.
# NOTE(review): `chat_text` was produced by the chat template; if that template
# already emits the beginning-of-sequence token, add_special_tokens=True may
# add a second one -- confirm this is intended.
output_labels = []
for out in pipe(KeyDataset(test_ds, "chat_text"),
                add_special_tokens=True,
                return_full_text=False,  # keep only the newly generated text
                do_sample=True,
                temperature=0.1,
                max_new_tokens=8192,
                batch_size=1):
    # Each `out` is indexed for its first element, i.e. one record per prompt.
    output_labels.append(out[0])

# Put the generations next to the inputs; both sides share the default
# positional index, so the column-wise concat lines rows up one-to-one.
preds = pd.DataFrame(output_labels)
test_preds = pd.concat([test_texts, preds], axis=1)

# Compute the short model name outside the f-string: reusing double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
model_name = target.split("/")[-1]

# Make sure the output folder exists so a long generation run is not lost at
# the final write.
os.makedirs("predicted_labels", exist_ok=True)
test_preds.to_csv(f"predicted_labels/thread_{model_name}_zero.csv", index=False)

## Instruction fine-tuning

In [None]:
# Checkpoints and logs are written to the same per-model directory.
run_dir = f"open_weights/thread/{target}"

training_args = TrainingArguments(
    # Where to write
    output_dir=run_dir,
    logging_dir=run_dir,
    # Optimisation schedule
    num_train_epochs=4,
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    # One sample per step, accumulated over four steps
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Gradient checkpointing
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant":True},
    # Logging / checkpoint cadence
    # NOTE(review): fractional step values are presumably read as a share of
    # the total training steps -- confirm against the installed transformers.
    logging_strategy="steps",
    logging_steps=0.5,
    save_strategy="steps",
    save_steps=0.5,
)

# The 70B checkpoint uses a higher LoRA dropout than the other model.
is_70b = target == "meta-llama/Meta-Llama-3-70B-Instruct"
lora_dropout = 0.1 if is_70b else 0.05

# LoRA adapters on every linear layer, no bias terms trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=16,
    lora_alpha=64,
    lora_dropout=lora_dropout,
    bias="none",
)

In [None]:
# Supervised fine-tuning trainer: the quantized base model, the LoRA settings
# from `peft_config`, and the training split, with sequences capped at 8192
# tokens.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    peft_config=peft_config,
    tokenizer=tokenizer,
    max_seq_length=8192,
)

In [None]:
# Run fine-tuning with the trainer configured above. Checkpoints and logs go to
# the directories set in `training_args`.
trainer.train()

### Evaluation

In [None]:
# Training leaves the module in train mode, which keeps dropout (including the
# LoRA dropout configured for fine-tuning) active. Switch to eval mode so the
# evaluation generations are not perturbed by dropout.
model.eval()

# Text-generation pipeline for the fine-tuned model.
# NOTE(review): this relies on `model` being the same object the trainer
# adapted in place -- confirm, or pass `trainer.model` instead.
pipe_sft = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer)

In [None]:
# Inference with the fine-tuned model: generate one completion per rendered
# test prompt, using the same decoding settings as the zero-shot run.
# Sampling is enabled at temperature 0.1, so repeated runs are not guaranteed
# to produce identical text.
# NOTE(review): `chat_text` was produced by the chat template; if that template
# already emits the beginning-of-sequence token, add_special_tokens=True may
# add a second one -- confirm this is intended.
output_labels = []
for out in pipe_sft(KeyDataset(test_ds, "chat_text"),
                add_special_tokens=True,
                return_full_text=False,  # keep only the newly generated text
                do_sample=True,
                temperature=0.1,
                max_new_tokens=8192,
                batch_size=1):
    # Each `out` is indexed for its first element, i.e. one record per prompt.
    output_labels.append(out[0])

# Put the generations next to the inputs; both sides share the default
# positional index, so the column-wise concat lines rows up one-to-one.
preds = pd.DataFrame(output_labels)
test_preds = pd.concat([test_texts, preds], axis=1)

# Compute the short model name outside the f-string: reusing double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
model_name = target.split("/")[-1]

# Make sure the output folder exists so a long generation run is not lost at
# the final write.
os.makedirs("predicted_labels", exist_ok=True)
test_preds.to_csv(f"predicted_labels/thread_{model_name}_instruction_tuned.csv", index=False)

# GPT-4o

## Zero-shot JSON

In [None]:
# Create the API client in this cell so it does not depend on a cell further
# down the notebook having been run first. With no explicit key the client
# reads OPENAI_API_KEY from the environment.
client = OpenAI()

# GPT-4o zero-shot JSON: send each test thread together with the system prompt
# and collect the raw text of the first choice.
generated_output = []

for _, row in test_thread.iterrows():
    message = row['thread']
    completion = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        messages=[
            {"role":"system", "content":system_prompt},
            {"role": "user", "content": message}
            ]
        )
    # Echo each response as it arrives so progress is visible during the run.
    print(completion.choices[0].message)
    generated_output.append(completion.choices[0].message.content)

# Attach the generations to the input rows; both share the default positional
# index, so the column-wise concat lines rows up one-to-one.
preds = pd.DataFrame({"generated_text":generated_output})
test_thread_preds = pd.concat([test_thread, preds], axis=1)

# Make sure the output folder exists before writing.
os.makedirs("predicted_labels", exist_ok=True)
test_thread_preds.to_csv("predicted_labels/thread_gpt4o_zero.csv", index=False)

## Zero-shot Baseline

In [None]:
# The API key is deliberately not stored in the notebook. With no explicit key
# the client reads OPENAI_API_KEY from the environment, so export that variable
# before starting the kernel. (The previous code passed an `api_key` variable
# that was never defined, which raised NameError on a fresh kernel.)
client = OpenAI()

# Instruction prefix for the baseline run: it is prepended to each individual
# text and asks for one stance per politician in a fixed "{Trump: ..., Clinton: ...}" form.
prompt3 = "This statement may express a STANCE towards two politicians, Trump and Clinton. Stance represents the attitude expressed towards them. The stance options are Support, Oppose or Neither. Provide the answer in the following format: {Trump: STANCE, Clinton: STANCE}\n\n"

In [None]:
# GPT-4o baseline: each text is classified on its own, with the instruction
# prefix prepended and no system message.
generated_output = []

for _, row in test_cleaned.iterrows():
    message = prompt3 + row['text']
    completion = client.chat.completions.create(
        model="gpt-4o",
        temperature=0,
        messages=[
            {"role": "user", "content": message}
            ]
        )
    generated_output.append(completion.choices[0].message.content)

# Attach the generations to the input rows; both share the default positional
# index, so the column-wise concat lines rows up one-to-one.
test_cleaned_preds = pd.concat([test_cleaned, pd.DataFrame({"preds":generated_output})], axis=1)

# Make sure the output folder exists before writing.
os.makedirs("predicted_labels", exist_ok=True)
test_cleaned_preds.to_csv("predicted_labels/thread_gpt4o_baseline.csv", index=False)