<a href="https://colab.research.google.com/github/zhpinkman/hugging-face-projects/blob/master/Hate_speech_Classification_pre_train_on_white_supremacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets



In [None]:
from datasets import load_dataset
from google.colab import drive
# Mount Google Drive (Colab only) so files under /content/drive are readable.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the cleaned CSV from the mounted Drive and expose it as a single 'train' split.
train_csv_path = '/content/drive/MyDrive/datasets/White Supremacists/cleaned_data.csv'
dataset = load_dataset('csv', data_files={'train': train_csv_path})

Using custom data configuration default-ca90ededb9f099c5
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-ca90ededb9f099c5/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Hold out 10% of the rows for evaluation. The fixed seed keeps the split, and
# therefore every metric computed on it, reproducible across kernel restarts.
# NOTE: this rebinds `dataset`; run it once per kernel (re-running would split
# the already-reduced train split again).
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)

In [None]:
# Display the train/test splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 9706
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1079
    })
})

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Hub identifier of the pre-trained checkpoint that is fine-tuned below.
checkpoint = "mrm8488/distilroberta-finetuned-tweets-hate-speech"

# Tokenizer and model are loaded from the same checkpoint so they agree with each other.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Sequence-classification model; reused later as the starting point for training.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Pick one random training example to sanity-check the pre-trained model on.
shuffled_train = dataset['train'].shuffle()
sample = shuffled_train.select(range(1))[0]
sample

{'label': 0,
 'sentence': 'There are no dumb questions and there are plenty in this thread who will answer to the best of our abilities .'}

In [None]:
# Encode the sampled sentence as PyTorch tensors and run one forward pass
# through the pre-trained model to inspect its raw output.
inputs = tokenizer(sample['sentence'], return_tensors='pt')
outputs = model(**inputs)
outputs

SequenceClassifierOutput([('logits',
                           tensor([[ 2.4463, -2.3534]], grad_fn=<AddmmBackward0>))])

In [None]:
# Show the checkpoint's mapping from class index to label name.
model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [None]:
def tokenize_dataset(examples):
  """Tokenize the 'sentence' entry of a batch of examples.

  Args:
    examples: Mapping with a 'sentence' entry holding the texts to encode.

  Returns:
    The result of calling the notebook-level ``tokenizer`` on those texts
    with truncation enabled. No padding is requested here.
  """
  return tokenizer(examples['sentence'], truncation=True)

In [None]:
# Tokenize every split in batches; the raw 'sentence' column is removed from the result.
tokenized_dataset = dataset.map(tokenize_dataset, batched = True, remove_columns=['sentence'])

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Take three random tokenized training rows to try the padding collator on.
# NOTE: this rebinds `sample`, which previously held a single raw example.
sample = tokenized_dataset['train'].shuffle().select(range(3))[:3]
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 9706
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 1079
    })
})

In [None]:
from transformers import DataCollatorWithPadding
# Collator that pads each batch using the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Print the unpadded length of every sampled sequence, then collate them to
# compare against the shape of the padded batch.
for token_ids in sample['input_ids']:
  print(len(token_ids))
batch = data_collator(sample)
batch['input_ids'].shape

42
12
12


torch.Size([3, 42])

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Returned tuple is (precision, recall, f1, support); support is unused.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    accuracy = accuracy_score(y_true, y_pred)
    return dict(accuracy=accuracy, f1=prf[2], precision=prf[0], recall=prf[1])

In [None]:
from collections import Counter
# Count how often each label occurs in the training split.
counter = Counter(tokenized_dataset['train']['label'])
num_train_rows = tokenized_dataset['train'].num_rows

# Each class is weighted by the relative frequency of the *other* class, so the
# rarer label receives the larger weight in the loss.
loss_weights = {
    0: counter[1] / num_train_rows,
    1: counter[0] / num_train_rows,
}

loss_weights, counter

({0: 0.11724706367195549, 1: 0.8827529363280445}, Counter({0: 8568, 1: 1138}))

In [None]:
import torch
from torch import nn
from transformers import Trainer

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


class CustomTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The per-class weights are read from the notebook-level ``loss_weights``
    mapping (class index -> weight).
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the class-weighted cross-entropy loss.

        Args:
            model: The model to run.
            inputs: Batch mapping passed to the model; must contain "labels".
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted (and ignored) so the override keeps
                working with Trainer versions that pass this keyword.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.get("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Order the weights by class index rather than dict insertion order, and
        # create them on the logits' own device/dtype so the loss does not
        # depend on a notebook-global device that may differ from the model's.
        class_weights = torch.tensor(
            [loss_weights[class_id] for class_id in sorted(loss_weights)],
            dtype=logits.dtype,
            device=logits.device,
        )
        loss_fct = nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 8

# Evaluate and log the running training loss on the same cadence. Previously
# logging happened once per epoch while evaluation ran every 500 steps, so the
# evaluation table showed "No log" or a stale training loss on most rows.
eval_steps = 500
logging_steps = eval_steps


args = TrainingArguments(
    output_dir='white_supremacy_roberta',  # checkpoints are written below this directory
    do_train=True,
    do_eval=True,
    eval_steps=eval_steps,
    evaluation_strategy="steps",
    learning_rate=1e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=logging_steps,
)

In [None]:
# Build the trainer from the CustomTrainer subclass: it trains on the shuffled
# train split and evaluates on the held-out test split with compute_metrics.
trainer = CustomTrainer(
    model = model,
    args = args, 
    train_dataset = tokenized_dataset["train"].shuffle(),
    eval_dataset = tokenized_dataset["test"],
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

In [None]:
# Baseline: evaluate the checkpoint on the held-out split before any fine-tuning.
trainer.evaluate(tokenized_dataset['test'])

***** Running Evaluation *****
  Num examples = 1079
  Batch size = 8


{'eval_accuracy': 0.7275254865616312,
 'eval_f1': 0.30331753554502366,
 'eval_loss': 1.545031189918518,
 'eval_precision': 0.22695035460992907,
 'eval_recall': 0.45714285714285713,
 'eval_runtime': 5.382,
 'eval_samples_per_second': 200.483,
 'eval_steps_per_second': 25.084}

In [None]:
# Fine-tune the model; evaluation runs periodically as configured in `args`.
trainer.train()

***** Running training *****
  Num examples = 9706
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6070


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,No log,0.776489,0.892493,0.462963,0.657895,0.357143
1000,No log,0.598658,0.911956,0.621514,0.702703,0.557143
1500,0.668800,0.614602,0.904541,0.625455,0.637037,0.614286
2000,0.668800,0.481269,0.875811,0.623596,0.513889,0.792857
2500,0.569200,0.708842,0.918443,0.658915,0.720339,0.607143
3000,0.569200,0.74697,0.92215,0.688889,0.715385,0.664286
3500,0.569200,0.939282,0.911956,0.605809,0.722772,0.521429
4000,0.458900,0.911507,0.908248,0.657439,0.637584,0.678571
4500,0.458900,1.05687,0.906395,0.587755,0.685714,0.514286
5000,0.365900,0.939674,0.910102,0.666667,0.642384,0.692857


***** Running Evaluation *****
  Num examples = 1079
  Batch size = 8


Saving model checkpoint to white_supremacy_roberta/checkpoint-500
Configuration saved in white_supremacy_roberta/checkpoint-500/config.json
Model weights saved in white_supremacy_roberta/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1079
  Batch size = 8
Saving model checkpoint to white_supremacy_roberta/checkpoint-1000
Configuration saved in white_supremacy_roberta/checkpoint-1000/config.json
Model weights saved in white_supremacy_roberta/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1079
  Batch size = 8
Saving model checkpoint to white_supremacy_roberta/checkpoint-1500
Configuration saved in white_supremacy_roberta/checkpoint-1500/config.json
Model weights saved in white_supremacy_roberta/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1079
  Batch size = 8
Saving model checkpoint to white_supremacy_roberta/checkpoint-2000
Configuration saved in white_supremacy_roberta/checkpoint

TrainOutput(global_step=6070, training_loss=0.4643807547950973, metrics={'train_runtime': 982.867, 'train_samples_per_second': 49.376, 'train_steps_per_second': 6.176, 'total_flos': 640398632729928.0, 'train_loss': 0.4643807547950973, 'epoch': 5.0})

In [None]:
# Reload the weights saved at step 3000 and re-evaluate them on the held-out split.
# NOTE(review): the checkpoint path is hard-coded; presumably it was picked by
# hand from the training-time evaluation table — confirm. If so, the same split
# is used both to select the checkpoint and to report its metrics; consider a
# separate validation split for selection.
model = AutoModelForSequenceClassification.from_pretrained('white_supremacy_roberta/checkpoint-3000/')

# Rebinds `trainer` so that evaluation runs against the reloaded model.
trainer = CustomTrainer(
    model = model,
    args = args, 
    train_dataset = tokenized_dataset["train"].shuffle(),
    eval_dataset = tokenized_dataset["test"],
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

trainer.evaluate(tokenized_dataset['test'])

loading configuration file white_supremacy_roberta/checkpoint-3000/config.json
Model config RobertaConfig {
  "_name_or_path": "white_supremacy_roberta/checkpoint-3000/",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file white_supremacy_roberta/checkpoint-3000/pytorch_model.bin
All model checkpo

{'eval_accuracy': 0.9221501390176089,
 'eval_f1': 0.6888888888888889,
 'eval_loss': 0.7469704151153564,
 'eval_precision': 0.7153846153846154,
 'eval_recall': 0.6642857142857143,
 'eval_runtime': 5.4104,
 'eval_samples_per_second': 199.429,
 'eval_steps_per_second': 24.952}