In [34]:
import pandas as pd
import json
from sklearn.preprocessing  import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer , TrainingArguments 
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

In [35]:
train_json_path='semeval2024_dev_release/subtask1/train.json'
f=open(train_json_path,'r')
train_json_data=json.loads(f.read())

In [36]:
val_json_path='semeval2024_dev_release/subtask1/validation.json'
f=open(val_json_path,'r')
val_data = json.load(f)
df_val = pd.DataFrame(val_data)

In [37]:
df_train=pd.DataFrame(train_json_data)

In [39]:
df_train['len']=df_train['labels'].apply(lambda x: len(x))
df_train=df_train[df_train['len']>0].drop('len',axis=1)
df_train.head()

Unnamed: 0,id,text,labels,link
0,65635,THIS IS WHY YOU NEED\n\nA SHARPIE WITH YOU AT ...,[Black-and-white Fallacy/Dictatorship],https://www.facebook.com/photo/?fbid=402355213...
1,67927,GOOD NEWS!\n\nNAZANIN ZAGHARI-RATCLIFFE AND AN...,"[Loaded Language, Glittering generalities (Vir...",https://www.facebook.com/amnesty/photos/531198...
4,67641,"WHEN YOU'RE THE FBI, THEY LET YOU DO IT.",[Thought-terminating cliché],https://www.facebook.com/AddictingInfoOrg/phot...
6,79204,JANUARY 2021\n\nYOU DON'T BUY A 12 MILLION DOL...,[Whataboutism],https://www.facebook.com/ResistanceFeed/photos...
7,79372,Term Limits Are Everywhere & Politicians Can't...,[Slogans],


In [51]:

def split_combined_labels(labels):
    new_labels = []
    for label in labels:
        if '/' in label:
            new_labels.extend(label.split('/'))
        else:
            new_labels.append(label)
    return new_labels

In [52]:
# Apply the function to the labels column
df_train['labels'] = df_train['labels'].apply(split_combined_labels)
df_train.head()

Unnamed: 0,id,text,labels,link,hierarchy
0,65635,THIS IS WHY YOU NEED\n\nA SHARPIE WITH YOU AT ...,"[Black-and-white Fallacy, Dictatorship]",https://www.facebook.com/photo/?fbid=402355213...,[]
1,67927,GOOD NEWS!\n\nNAZANIN ZAGHARI-RATCLIFFE AND AN...,"[Loaded Language, Glittering generalities (Vir...",https://www.facebook.com/amnesty/photos/531198...,[Pathos]
4,67641,"WHEN YOU'RE THE FBI, THEY LET YOU DO IT.",[Thought-terminating cliché],https://www.facebook.com/AddictingInfoOrg/phot...,[]
6,79204,JANUARY 2021\n\nYOU DON'T BUY A 12 MILLION DOL...,[Whataboutism],https://www.facebook.com/ResistanceFeed/photos...,"[Ad Hominem, Distraction]"
7,79372,Term Limits Are Everywhere & Politicians Can't...,[Slogans],,[Justification]


In [59]:
all_labels={}

for label in df_train['labels']:
    for item in label:
        if item not in all_labels:
            all_labels[item]=1
        else:
            all_labels[item]+=1
            

all_labels
        

{'Black-and-white Fallacy': 780,
 'Dictatorship': 780,
 'Loaded Language': 1750,
 'Glittering generalities (Virtue)': 488,
 'Thought-terminating cliché': 528,
 'Whataboutism': 258,
 'Slogans': 667,
 'Causal Oversimplification': 240,
 'Smears': 1990,
 'Name calling': 1518,
 'Labeling': 1518,
 'Appeal to authority': 850,
 'Exaggeration': 356,
 'Minimisation': 356,
 'Repetition': 305,
 'Flag-waving': 571,
 'Appeal to fear': 337,
 'prejudice': 337,
 'Reductio ad hitlerum': 63,
 'Doubt': 350,
 "Misrepresentation of Someone's Position (Straw Man)": 62,
 'Obfuscation, Intentional vagueness, Confusion': 21,
 'Bandwagon': 97,
 'Presenting Irrelevant Data (Red Herring)': 59}

In [60]:
# Defining the hierarchy of persuasion techniques based on the provided image
ph = {
    'Persuasion': ['Ethos', 'Pathos', 'Logos'],
    'Ethos': ['Ad Hominem', 'Bandwagon', 'Appeal to Authority', 'Glittering generalities (Virtue)', 'Transfer'],
    'Pathos': ['Appeal to Emotion', 'Exaggeration', 'Loaded Language', 'Flag Waving', 'Appeal to fear', 'Transfer'],
    'Logos': ['Justification', 'Reasoning', 'Repetition', 'Intentional vagueness'],
    'Ad Hominem': ['Name calling', 'Doubt', 'Smears', 'Reductio ad hitlerum', 'Whataboutism'],
    'Justification': ['Bandwagon', 'Appeal to Authority', 'Flag Waving', 'Appeal to fear', 'Slogans'],
    'Reasoning': ['Distraction', 'Simplification'],
    'Distraction': ['Straw Man', 'Red Herring', 'Whataboutism','Presenting Irrelevant Data (Red Herring)'],
    'Simplification': ['Causal Oversimplification', 'Black-and-white Fallacy', 'Thought-terminating cliché'],
    'Others' : ['Minimisation','Labeling','prejudice',"Misrepresentation of Someone's Position (Straw Man)"]
}

In [65]:
# Reverse lookup function
def find_hierarchy_labels(labels, ph):
    hierarchy_labels = set()
    for label in labels:
        for key, values in ph.items():
            if label in values:
                hierarchy_labels.add(key)
    return list(hierarchy_labels)

# Apply the function to create the hierarchy column
df_train['hierarchy'] = df_train['labels'].apply(lambda x: find_hierarchy_labels(x, ph))
df_val['hierarchy'] = df_val['labels'].apply(lambda x: find_hierarchy_labels(x, ph))


In [66]:
df_train.head()

Unnamed: 0,id,text,labels,link,hierarchy
0,65635,THIS IS WHY YOU NEED\n\nA SHARPIE WITH YOU AT ...,"[Black-and-white Fallacy, Dictatorship]",https://www.facebook.com/photo/?fbid=402355213...,[Simplification]
1,67927,GOOD NEWS!\n\nNAZANIN ZAGHARI-RATCLIFFE AND AN...,"[Loaded Language, Glittering generalities (Vir...",https://www.facebook.com/amnesty/photos/531198...,"[Pathos, Ethos]"
4,67641,"WHEN YOU'RE THE FBI, THEY LET YOU DO IT.",[Thought-terminating cliché],https://www.facebook.com/AddictingInfoOrg/phot...,[Simplification]
6,79204,JANUARY 2021\n\nYOU DON'T BUY A 12 MILLION DOL...,[Whataboutism],https://www.facebook.com/ResistanceFeed/photos...,"[Ad Hominem, Distraction]"
7,79372,Term Limits Are Everywhere & Politicians Can't...,[Slogans],,[Justification]


In [67]:
df_val.head()

Unnamed: 0,id,text,labels,link,hierarchy
0,63135,Critical Thinking Essentials\n\Are my biases a...,"[Doubt, Slogans]",https://www.facebook.com/SkepticalMemeSociety/...,"[Justification, Ad Hominem]"
1,78590,Trying to think of a single accomplishment...,"[Exaggeration/Minimisation, Smears]",https://www.facebook.com/unitedstatesherald/ph...,[Ad Hominem]
2,65834,Corporate needs you to find the difference bet...,[Thought-terminating cliché],https://www.facebook.com/photo/?fbid=356249413...,[Simplification]
3,67394,KYLE RITTENHOUSE\nALL CHARGES\nNOT GUILTY,[Glittering generalities (Virtue)],https://www.facebook.com/TheControversia/photo...,[Ethos]
4,67709,Al Franken explains why America should tax the...,"[Appeal to authority, Slogans, Thought-termina...",https://www.facebook.com/AddictingInfoOrg/phot...,"[Justification, Simplification]"


In [68]:
df_train['hierarchy'].value_counts()

hierarchy
[Ad Hominem]                                848
[Ad Hominem, Others]                        570
[Simplification]                            491
[Pathos, Ad Hominem, Others]                369
[Pathos]                                    317
                                           ... 
[Pathos, Logos, Ethos]                        1
[Ethos, Ad Hominem, Pathos, Distraction]      1
[Ethos, Ad Hominem, Pathos, Others]           1
[Logos, Distraction]                          1
[Justification, Logos, Pathos]                1
Name: count, Length: 145, dtype: int64

In [5]:
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
# encode the labels 

mlb=MultiLabelBinarizer()
train_labels_encoded=mlb.fit_transform(df_train['labels'])
val_labels_encoded=mlb.fit_transform(df_val['labels'])



In [69]:
## for hie

tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
# encode the labels 

mlb=MultiLabelBinarizer()
train_labels_encoded=mlb.fit_transform(df_train['hierarchy'])
val_labels_encoded=mlb.fit_transform(df_val['hierarchy'])



In [70]:
train_labels_encoded.shape

(5736, 8)

In [71]:
val_labels_encoded.shape

(500, 8)

In [72]:
# tokenize the text
train_encodings = tokenizer(df_train['text'].tolist(), truncation=True, padding=True, max_length=512)
val_encodings=tokenizer(df_val['text'].tolist(), truncation=True, padding=True, max_length=512)


In [73]:
# labels to tensor
train_labels=torch.tensor(train_labels_encoded)
val_labels=torch.tensor(val_labels_encoded)

In [74]:
train_labels.shape,val_labels.shape

(torch.Size([5736, 8]), torch.Size([500, 8]))

In [75]:
# Define a custom dataset class
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)

In [76]:
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

In [77]:
# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(mlb.classes_))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [78]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./sub1_results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

In [15]:
# # Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset
# )

In [79]:
# # Train the model
# trainer.train()

# # Evaluate the model
# eval_result = trainer.evaluate()

# print("Evaluation results:", eval_result)

In [80]:
predicted_labels = []
actual_labels = []

def compute_metrics(pred):
    labels = pred.label_ids
    preds = (pred.predictions > 0.5).astype(int)
    
    # Store the labels
    predicted_labels.extend(preds)
    actual_labels.extend(labels)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [81]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

In [82]:
# Trainer with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()

print("Evaluation results:", eval_result)

# Convert predicted and actual labels back to their original form
predicted_labels = mlb.inverse_transform(np.array(predicted_labels))
actual_labels = mlb.inverse_transform(np.array(actual_labels))

  item['labels'] = torch.tensor(self.labels[idx]).float()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4069,0.384754,0.308,0.291515,0.326244,0.305769


  item['labels'] = torch.tensor(self.labels[idx]).float()
  _warn_prf(average, modifier, msg_start, len(result))
  item['labels'] = torch.tensor(self.labels[idx]).float()


Evaluation results: {'eval_loss': 0.38475385308265686, 'eval_accuracy': 0.308, 'eval_f1': 0.2915154703802221, 'eval_precision': 0.3262441359809781, 'eval_recall': 0.3057692307692308, 'eval_runtime': 103.0449, 'eval_samples_per_second': 4.852, 'eval_steps_per_second': 0.611, 'epoch': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Print or save the predicted and actual labels for comparison
for i, (pred, actual) in enumerate(zip(predicted_labels, actual_labels)):
    print(f"Sample {i}:")
    print(f"  Predicted: {pred}")
    print(f"  Actual: {actual}")

# Optionally, save to a file
with open('predictions_vs_actuals.txt', 'w') as f:
    for i, (pred, actual) in enumerate(zip(predicted_labels, actual_labels)):
        f.write(f"Sample {i}:\n")
        f.write(f"  Predicted: {pred}\n")
        f.write(f"  Actual: {actual}\n")