# Preparing JSON Facebook comment threads for training

In [1]:
import pandas as pd
import glob
import json

# Root directory of the raw thread/label JSON files. The trailing slash is
# required: later cells build file paths by plain string concatenation
# (PATH + "train/threads/...") and also pass PATH as glob's root_dir.
PATH = "data/original/facebook_thread/"

## System prompt

In [2]:
# Instruction text used as the "system" message of every chat example built
# in the cells below (both train and test). The JSON skeleton at the end of
# the prompt is the output schema the model is asked to emit.
# NOTE(review): the label files are used verbatim as the assistant turn, so
# they presumably follow this same schema -- confirm against the data.
system_prompt = """You will receive JSON inputs representing discussion threads from social media during the 2016 US Presidential election. Each thread includes a post, one comment about the post, and up to five replies to the comment. Your task is to identify the stance expressed towards two politicians, Donald Trump and Hillary Clinton, in the comment and each reply. Each text may express a stance towards one, both, or none of the politicians. You will always provide a stance towards each politician separately.

Stance Options:

    Support: Positive attitude towards the politician.
    Oppose: Negative attitude towards the politician.
    Neither: No clear stance or irrelevant content.

Instructions:

    - Identify the stance for Trump and Clinton in the comment and each reply using the stance options provided.
    - Always provide a stance even if the content is offensive or ambiguous.
    - There will be between zero and five replies to each comment. If there are fewer than five replies, provide stances for the available replies only.
    
Output Format: Strictly follow this JSON format. Replace the STANCE placeholder with the actual stance. Do not add any other tokens:


{
  "comment": {
    "stanceTrump": "STANCE",
    "stanceClinton": "STANCE"
  },
  "replies": [
    {
      "reply_id": 1,
      "stanceTrump": "STANCE",
      "stanceClinton": "STANCE"
    },
    {
      "reply_id": 2,
      "stanceTrump": "STANCE",
      "stanceClinton": "STANCE"
    },
    ...
  ]
}"""

In [3]:
def _load_numbered_json_texts(subdir: str, column: str) -> pd.DataFrame:
    """Read every ``comment_thread_<i>.json`` under ``PATH + subdir`` as raw text.

    The number of files N is taken from a ``*.json`` glob of the directory,
    and the files are then opened by name in numeric order 1..N. Reading by
    index (rather than in glob order) keeps row k of a thread frame and row k
    of the matching label frame pointing at the same thread.

    Parameters
    ----------
    subdir : str
        Directory relative to ``PATH``, without leading or trailing slash
        (e.g. ``"train/threads"``).
    column : str
        Name of the single column in the returned frame.

    Returns
    -------
    pd.DataFrame
        One row per file, default integer index, with the unparsed file
        contents in ``column``.

    Raises
    ------
    FileNotFoundError
        If the directory contains N ``*.json`` files but one of
        ``comment_thread_1.json`` .. ``comment_thread_N.json`` is missing.
    """
    # glob's root_dir parameter requires Python 3.10+.
    n_files = len(glob.glob(f"{subdir}/*.json", root_dir=PATH))

    texts = []
    for i in range(1, n_files + 1):
        # Explicit UTF-8: JSON is UTF-8 by specification, and social-media text
        # with emoji/accents must not be decoded with the platform default.
        with open(PATH + f"{subdir}/comment_thread_{i}.json", encoding="utf-8") as f:
            texts.append(f.read())

    # Build the frame once instead of concatenating row by row (quadratic).
    return pd.DataFrame({column: texts})


## Load train data
# Thread
train_thread = _load_numbered_json_texts("train/threads", "thread")

# Stance label
train_label = _load_numbered_json_texts("train/labels", "label")

## Load test data
# Thread
test_thread = _load_numbered_json_texts("test/threads", "thread")
test_thread.to_csv("data/thread_test.csv", index=False)

# Stance label
test_label = _load_numbered_json_texts("test/labels", "label")

In [4]:
# Build one chat-format training example per thread: the shared system prompt,
# the raw thread JSON as the user turn, and the raw label JSON as the
# assistant turn. Threads and labels are paired by row position.
# Note: to_csv stores each message list via its Python string representation.
train_texts = pd.DataFrame(
    {
        "messages": [
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": train_thread.loc[row, "thread"]},
                {"role": "assistant", "content": train_label.loc[row, "label"]},
            ]
            for row in range(len(train_thread))
        ]
    }
)
train_texts.to_csv("data/thread_train_chat_texts.csv", index=False)

In [5]:
# Build one chat-format inference prompt per test thread: the shared system
# prompt followed by the raw thread JSON as the user turn. No assistant turn
# is included.
# Note: to_csv stores each message list via its Python string representation.
test_texts = pd.DataFrame(
    {
        "messages": [
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": test_thread.loc[row, "thread"]},
            ]
            for row in range(len(test_thread))
        ]
    }
)
test_texts.to_csv("data/thread_test_chat_texts.csv", index=False)

## Parse JSON and break out each comment and reply

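The file produced by this step (`data/thread_test_cleaned.csv`) serves two purposes: it is used to calculate the evaluation metrics, and it is the input for the GPT-4o zero-shot baseline predictions.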

In [6]:
def _flatten_thread_file(path, id_column, post_id, reply_renames=None):
    """Load one thread-shaped JSON file and return it as one row per text.

    The file is expected to hold a ``"comment"`` object and, optionally, a
    ``"replies"`` list of objects. The comment becomes row 0 and each reply
    becomes one following row.

    Parameters
    ----------
    path : str
        Path of the JSON file to read.
    id_column : str
        Name of the column that receives ``post_id`` on every row.
    post_id : int
        Thread number written into ``id_column``.
    reply_renames : dict, optional
        Column renames applied to the reply rows only, before they are
        stacked under the comment row.

    Returns
    -------
    pd.DataFrame
        Comment row followed by reply rows, default integer index, with
        ``id_column`` as the last column.

    Raises
    ------
    KeyError
        If the file has no ``"comment"`` key.
    """
    # Explicit UTF-8: JSON is UTF-8 by specification; do not rely on the
    # platform default encoding for social-media text.
    with open(path, encoding="utf-8") as f:
        thread = json.load(f)

    rows = pd.DataFrame([thread["comment"]])

    # A missing, null or empty "replies" entry all mean "comment only".
    replies = thread.get("replies")
    if replies:
        reply_rows = pd.DataFrame(replies)
        if reply_renames:
            reply_rows = reply_rows.rename(columns=reply_renames)
        rows = pd.concat([rows, reply_rows], axis=0)

    rows[id_column] = post_id
    return rows.reset_index(drop=True)


# The number of test threads is taken from the label directory; thread and
# label files are then opened by matching index.
n_test_threads = len(glob.glob("test/labels/*.json", root_dir=PATH))
if n_test_threads == 0:
    raise FileNotFoundError(f"No label files found under {PATH}test/labels/")

per_thread_frames = []
for i in range(1, n_test_threads + 1):
    thread_output = _flatten_thread_file(
        PATH + f"test/threads/comment_thread_{i}.json",
        id_column="post_id",
        post_id=i,
    )
    # Label replies carry their own reply_id; rename it so it can sit next to
    # the thread's reply_id after the side-by-side concat.
    label_output = _flatten_thread_file(
        PATH + f"test/labels/comment_thread_{i}.json",
        id_column="post_id_label",
        post_id=i,
        reply_renames={"reply_id": "reply_id_label"},
    )
    # Side by side: row k of the thread next to row k of its label.
    per_thread_frames.append(pd.concat([thread_output, label_output], axis=1))

# Stack all threads once instead of growing the frame inside the loop.
test_cleaned = pd.concat(per_thread_frames, axis=0).reset_index(drop=True)

# Gold answer as a single "<Trump stance>, <Clinton stance>" string.
test_cleaned["completion"] = test_cleaned["stanceTrump"] + ", " + test_cleaned["stanceClinton"]

# Rows without a reply id get id 0. If no thread had any replies the column
# does not exist yet, so create it instead of failing with a KeyError.
for id_col in ("reply_id", "reply_id_label"):
    if id_col in test_cleaned.columns:
        test_cleaned[id_col] = test_cleaned[id_col].fillna(0)
    else:
        test_cleaned[id_col] = 0

test_cleaned.to_csv("data/thread_test_cleaned.csv", index=False)