In [1]:
# !pip install transformers[torch]
# !pip install accelerate -U
# !pip install nlpaug

In [2]:
import numpy as np
import pandas as pd
import torch
import os
import random

from transformers import DebertaForSequenceClassification, DebertaTokenizer, Trainer, TrainingArguments
import nltk
import nlpaug.augmenter.word as naw
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

cuda_available = torch.cuda.is_available()
device = 'cuda:0' if cuda_available else 'cpu'

torch.manual_seed(0)

data = pd.read_csv("dontpatronizeme_pcl.tsv",
                       sep="\t",
                       names=['par_id', 'art_id', 'keyword', 'country', 'text', 'label'],
                       skiprows=4)

data['label'] = data['label'].apply(lambda x: 0 if x in [0, 1] else 1)

trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')

trids.par_id = trids.par_id
teids.par_id = teids.par_id

data

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,par_id,art_id,keyword,country,text,label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0
2,3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0
4,5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0
...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,Sri Lankan norms and culture inhibit women fro...,0
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0
10466,10467,@@20282330,in-need,ng,""" She has one huge platform , and information ...",1
10467,10468,@@16753236,hopeless,in,""" Anja Ringgren Loven I ca n't find a word to ...",1


In [3]:
# Rebuild training set

rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

train_set = pd.DataFrame(rows)
# Split train into train and internal validation set 80:20

val_size = int(len(train_set) * 0.2)

train_set = train_set.sample(frac=1)
val_set = train_set.iloc[0:val_size].reset_index(drop=True).copy()
train_set = train_set.iloc[val_size:].reset_index(drop=True).copy()

In [4]:
# Assuming 'train_set' is your training dataset and it's already prepared
# Make sure to filter only positive examples as before
train_data_positive = train_set[train_set["label"] == 1]

# Initialize the contextual word embeddings augmenter
augmenter = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased',  # Using DistilBERT for augmentation
    action='substitute',  # We'll substitute words
    top_k=20,  # Consider top 20 candidates for substitution
    device=device
)

# Collect augmented data
augmented_data_all = []

# Loop through positive examples and augment
for idx, row in train_data_positive.iterrows():
    original_sentence = row["text"]
    # Augment each sentence 3 times
    for _ in range(3):
        augmented_sentence = augmenter.augment(original_sentence)[0]
        # Create a new record for each augmented sentence
        augmented_data = {
            "par_id": row["par_id"],  # Optionally, adjust if you want to track augmented data back to original
            "community": row["community"],  # Assuming 'keyword' is actually 'community' as per your dataset
            "text": augmented_sentence,  # No need to select the first element, as each call returns a single augmentation
            "label": 1  # Ensuring the label is correctly set for augmented data
        }
        augmented_data_all.append(augmented_data)

# Convert augmented data into a DataFrame
augmented_data_df = pd.DataFrame(augmented_data_all)

# Concatenate original training set with augmented data
train_set_augmented = pd.concat([train_set, augmented_data_df])

# Shuffle the combined dataset
train_set_augmented = train_set_augmented.sample(frac=1, random_state=1).reset_index(drop=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
train_set_augmented

Unnamed: 0,par_id,community,text,label
0,6781,immigrant,Peter Dutton is a post Fitzgerald ex-police of...,0
1,7506,homeless,it reveals the tragic fate of the arts movemen...,1
2,6686,vulnerable,"Director of the RSU , Kenute Hare , cautioned ...",0
3,5820,hopeless,Coming from any other body this might seem hop...,0
4,9410,vulnerable,She was only 26 at the time and looks like a t...,1
...,...,...,...,...
8567,3196,women,While Warriors ' spectators are expected to be...,0
8568,3721,in-need,"children'em hospice, a program from child foun...",1
8569,4503,in-need,"10 . Rather , it was friends of the deceased w...",0
8570,2932,hopeless,""" It dashes the hopes of the young people who ...",0


In [6]:
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
train_encodings = tokenizer(list(train_set_augmented["text"]), return_tensors="pt", truncation=True, padding=True).to(device)
val_encodings = tokenizer(list(val_set["text"]), return_tensors="pt", truncation=True, padding=True).to(device)
train_labels = list(train_set_augmented["label"])
val_labels = list(val_set["label"])

In [7]:
# Rebuild test set

rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

test_set = pd.DataFrame(rows)
test_set = test_set.sample(frac=1)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

# define model

def compute_metrics(input):
    y_pred = np.argmax(input.predictions, axis=1)
    y_true = input.label_ids
    accuracy = accuracy_score(y_true, y_pred)
    f1score = f1_score(y_true, y_pred)
    return {'accuracy': accuracy, 'f1 score': f1score}

model = DebertaForSequenceClassification.from_pretrained("microsoft/deberta-base", num_labels=2).to(device)

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
training_args = TrainingArguments(
    output_dir="Deberta/",
    learning_rate=1e-5,
    weight_decay=0.05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1 score",
    greater_is_better=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1 score
1,0.2266,0.274198,0.908657,0.37551
2,0.1727,0.274941,0.91403,0.466667
3,0.0996,0.433143,0.91403,0.414634
4,0.0349,0.605049,0.912239,0.441065
5,0.0206,0.717788,0.896119,0.502857
6,0.0047,0.776876,0.907463,0.501608
7,0.005,0.888838,0.907463,0.474576
8,0.0019,0.875992,0.909254,0.5
9,0.0008,0.923103,0.906866,0.486842
10,0.0016,0.937857,0.906269,0.478405


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=10720, training_loss=0.06232314574160377, metrics={'train_runtime': 4484.6914, 'train_samples_per_second': 19.114, 'train_steps_per_second': 2.39, 'total_flos': 2.628154620616704e+16, 'train_loss': 0.06232314574160377, 'epoch': 10.0})

In [9]:
trainer.save_model(f"deberta_contextual_embedding")

In [10]:
predictions = trainer.predict(val_dataset)

# Step 1: Extract predictions and true labels
raw_predictions = predictions.predictions
true_labels = predictions.label_ids

# Step 2: Convert logits to predicted labels
predicted_labels = np.argmax(raw_predictions, axis=1)

# Step 3: Compute the F1 score
f1 = f1_score(true_labels, predicted_labels, average='binary')

print(f"F1 Score: {f1}")
# run predictions

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


F1 Score: 0.5028571428571429


In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
