Installations required for execution

In [1]:
!pip install datasets
# Install the latest version of pip
!pip install --upgrade pip
!pip install sentencepiece==0.1.91
!pip install transformers==4.27.1
!pip install datsets transformers[sentencepiece]
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0

All the required imports

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
#transformers for T5 hugging model
from transformers import T5Tokenizer
from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer
#textblob to calculate the polarity of sentences
from textblob import TextBlob
import numpy as np

In [4]:
# Read the CSV file
data = pd.read_csv('/content/sample_data/Multitarget-CONAN.csv')
data.head()

Unnamed: 0,INDEX,HATE_SPEECH,COUNTER_NARRATIVE,TARGET,VERSION
0,0,Maybe the UN could talk to those asian and afr...,"You're right, ocean pollution is one of the ma...",other,V1
1,1,‘Juice (Jews) are worse that nukes.’ Caption t...,You'd rather be nuked then?,JEWS,V1
2,2,Foreigners on UK benefits leaps 41% in 5 years...,You seem to be worried about government expend...,MIGRANTS,V1
3,3,"Jews cannot be patriots, since their allegianc...",You can have parents and grandparents born els...,JEWS,V1
4,4,"Israel, Zionism and Jewish Supremacist control...",Wouldn't you say its colonialism from many dif...,JEWS,V1


In [6]:
# Drop unnecessary columns as we are not concered with index, target and version columns
data = data[['HATE_SPEECH', 'COUNTER_NARRATIVE']]

# Renaming columns to 'input_text' and 'target_text'
data.columns = ['input_text', 'target_text']

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Save the train and val data as CSV files
train_data.to_csv('train_data.csv', index=False)
val_data.to_csv('val_data.csv', index=False)

In [7]:
dataset = load_dataset('csv', data_files={'train': 'train_data.csv', 'validation': 'val_data.csv'})

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ac20f27f8f8428b0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ac20f27f8f8428b0/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
#using pretrained tokenizer with t5-base model
tokenizer = T5Tokenizer.from_pretrained('t5-base',use_fast=False)

# Test the tokenizer by tokenizing a sample sentence
# input_sentence = "This is a test sentence."
# input_tokens = tokenizer(input_sentence, padding=True, truncation=True, return_tensors='pt')
# print(input_tokens)

# def tokenize(batch):
#     input_text = batch['input_text']
#     target_text = batch['target_text']
#     input_tokenized = tokenizer(input_text, padding=True, truncation=True, return_tensors="np")
#     target_tokenized = tokenizer(target_text, padding=True, truncation=True, return_tensors="np")
#     return {"input_ids": input_tokenized["input_ids"], "attention_mask": input_tokenized["attention_mask"],
#             "labels": target_tokenized["input_ids"]}

# tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=512)


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [9]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [10]:
def tokenize(batch):
    input_text = batch['input_text']
    target_text = batch['target_text']
    input_tokenized = tokenizer(input_text, padding=True, truncation=True, max_length=48, return_tensors="pt")
    target_tokenized = tokenizer(target_text, padding=True, truncation=True, max_length=48, return_tensors="pt")
    return {"input_ids": input_tokenized["input_ids"], "attention_mask": input_tokenized["attention_mask"],
            "labels": target_tokenized["input_ids"]}

#tokenizing the dataset
tokenized_dataset = dataset.map(tokenize, batched=True, batch_size=128)

Map:   0%|          | 0/4002 [00:00<?, ? examples/s]

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [13]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4002
    })
    validation: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1001
    })
})

In [14]:
#configuring training arguments to be used in the model
training_args = TrainingArguments(
    optim='adamw_torch',
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='steps', # save at every step
    save_strategy='steps', # save at every step
    save_total_limit=3, # Only last 3 models are saved. Older ones are deleted.
    load_best_model_at_end=True,
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
)

In [16]:
trainer.train()

Step,Training Loss,Validation Loss
100,2.9969,1.87315
200,1.9252,1.802232
300,1.8495,1.76795
400,1.8049,1.746628
500,1.7989,1.733015
600,1.7235,1.721706
700,1.7666,1.713196
800,1.7407,1.705538
900,1.7088,1.700607
1000,1.6903,1.69498


TrainOutput(global_step=2505, training_loss=1.7316145419122693, metrics={'train_runtime': 876.3267, 'train_samples_per_second': 22.834, 'train_steps_per_second': 2.859, 'total_flos': 1142366942822400.0, 'train_loss': 1.7316145419122693, 'epoch': 5.0})

In [None]:
#trainer.save_model('fine_tuned_t5_multitarget_conan2')
# input_text = "Race is a social construct, and if you see any difference between these two women, it’s because you have a mental illness. [image showing a blonde, European female athlete, juxtaposed against an image of an African woman wearing a lip plate]"
# input_tokenized = tokenizer(input_text, truncation=True,max_length=100, return_tensors="pt")
# input_tokenized = {k: v.to(model.device) for k, v in input_tokenized.items()}  
# generated_tokens = model.generate(**input_tokenized)
# counter_narrative = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
# print(counter_narrative)



The image of a blonde, European female athlete juxtaposed against an image of an African


In [17]:
val_data

Unnamed: 0,input_text,target_text
414,migrants steal our jobs,When we give refugees and asylum seekers the o...
1815,Persons with learning difficulties should be d...,Even if individuals with learning difficulties...
642,Many philosophers argue that humans are not bo...,This is a so-called 'argument from authority' ...
1056,LGBTs are a drain on society.,"Sexual abuse, physical abuse and even the lack..."
4507,Mentally disabled folks are gullible and don't...,A person can have a diagnosable mental disorde...
...,...,...
1623,There are millions of illegal immigrants in th...,There are thousands of legal immigrants who ha...
2005,Jews are addicted to their religion and take t...,It's not true that Jews are addicted to Judais...
4174,Migrants from the Middle East and Africa are r...,Why do you say this and include people that ha...
3467,Gypsies are not like us.,Many of them have been persecuted their whole ...


In [19]:
#generating counter speech for the input hate speech, and appending it to val_data
predicted=[]
for text in val_data['input_text']:
  input_text = text
  input_tokenized = tokenizer(input_text, truncation=True,max_length=100, return_tensors="pt")
  input_tokenized = {key: value.to(model.device) for key, value in input_tokenized.items()}  
  generated_tokens = model.generate(**input_tokenized)
  counter_speech = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
  predicted.append(counter_speech)
val_data["predicted_text"]=predicted



In [20]:
val_data

Unnamed: 0,input_text,target_text,predicted_text
414,migrants steal our jobs,When we give refugees and asylum seekers the o...,"The migrants who work here steal our jobs, and..."
1815,Persons with learning difficulties should be d...,Even if individuals with learning difficulties...,The education system is designed to help peopl...
642,Many philosophers argue that humans are not bo...,This is a so-called 'argument from authority' ...,"Human beings are born equal, and they are born..."
1056,LGBTs are a drain on society.,"Sexual abuse, physical abuse and even the lack...",I don't think that LGBT people are a 'drain'. ...
4507,Mentally disabled folks are gullible and don't...,A person can have a diagnosable mental disorde...,"Mentally disabled people are not gullible, the..."
...,...,...,...
1623,There are millions of illegal immigrants in th...,There are thousands of legal immigrants who ha...,"Immigration is a part of our country, not a cr..."
2005,Jews are addicted to their religion and take t...,It's not true that Jews are addicted to Judais...,The idea that a person is addicted to their re...
4174,Migrants from the Middle East and Africa are r...,Why do you say this and include people that ha...,"The rise in terrorism, extremism and violence ..."
3467,Gypsies are not like us.,Many of them have been persecuted their whole ...,Gypsies are people who have lived in different...


In [21]:
val_data.to_csv(index=False)

'input_text,target_text,predicted_text\nmigrants steal our jobs,"When we give refugees and asylum seekers the opportunity to work, giving them legal access to the labor market, many of them agree to do works that Italians do not intend to do anymore. Isn\'t this an advantage for our country?","The migrants who work here steal our jobs, and we are not allowed to do this."\nPersons with learning difficulties should be discouraged from attending secondary school and they should be excluded from education.,"Even if individuals with learning difficulties would be more likely to attend school with more efforts than others, it does not justify denying them education in the first place.",The education system is designed to help people with learning difficulties to succeed in life.\n"Many philosophers argue that humans are not born equal, especially when it comes to race.",This is a so-called \'argument from authority\' and thus a logical fallacy. I would like to see some real arguments to supp

In [None]:
#type(val_data)

pandas.core.frame.DataFrame

In [22]:
target_np = np.array(val_data['target_text'])
predicted_np = np.array(val_data['predicted_text'])

polarity_target=[]
polarity_predicted=[]
k=0
for x in range(0,len(target_np)):
  text1=TextBlob(target_np[x])
  text2=TextBlob(predicted_np[x])
  polarity_target.append(text1.sentiment)
  polarity_predicted.append(text2.sentiment)   

In [24]:
polarity_mismatch = 0
for i in range(0,len(polarity_target)):
  if (polarity_target[i].polarity>0 and polarity_predicted[i].polarity<0) or (polarity_target[i].polarity<0 and polarity_predicted[i].polarity>0):
    polarity_mismatch = polarity_mismatch+1
print("Accuracy Score:",(len(polarity_target)-polarity_mismatch)/len(polarity_target)*100) 

Accuracy Score: 83.51648351648352
