In [1]:
from transformers import Trainer, AutoModel, AutoTokenizer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch


import numpy as np
import pandas as pd

from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

In [3]:
data = pd.read_csv('train1.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,comment_text,toxic
0,0,Explanation\nWhy the edits made under my usern...,0
1,1,D'aww! He matches this background colour I'm s...,0
2,2,"Hey man, I'm really not trying to edit war. It...",0
3,3,"""\nMore\nI can't make any real suggestions on ...",0
4,4,"You, sir, are my hero. Any chance you remember...",0


In [5]:
positive_data = data[data.toxic == 0].index
negative_data = data[data.toxic == 1].index

In [6]:
positive_data = data.loc[positive_data][:750]
negative_data = data.loc[negative_data][:750]

In [8]:
df = positive_data._append(negative_data)

In [9]:
len(df)

1500

In [10]:
np.random.seed(42)

df = df.sample(frac=1).reset_index(drop=True)

In [11]:
checkpoint = 'distilbert/distilbert-base-uncased'

In [12]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [44]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels = 2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# model.save_pretrained('distilbert-base-uncased')
# tokenizer.save_pretrained('distilbert-base-uncased')

('distilbert-base-uncased/tokenizer_config.json',
 'distilbert-base-uncased/special_tokens_map.json',
 'distilbert-base-uncased/vocab.txt',
 'distilbert-base-uncased/added_tokens.json')

In [46]:
class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, comment_text, toxic, tokenizer, max_length):
        self.comment_text = comment_text
        self.toxic = toxic
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, idx):

        encoding = self.tokenizer(self.comment_text[idx],
                                  padding = 'max_length',
                                 truncation = True,
                                 max_length = self.max_length,
                                 return_tensors = 'pt')

        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()

        return {'input_ids': input_ids, 'attention_mask':attention_mask,
               'label': self.toxic[idx]}

In [47]:
train_data = {'comment_text': df['comment_text'][:1000].tolist(), 'toxic': df['toxic'][:1000].tolist()}

val_data = {'comment_text': df['comment_text'][1000:].tolist(), 'toxic': df['toxic'][1000:].tolist()}

In [48]:
max_length = 512
train_custom_dataset = CustomDataset(train_data['comment_text'], train_data['toxic'], tokenizer, max_length)

val_custom_dataset = CustomDataset(val_data['comment_text'], val_data['toxic'], tokenizer, max_length)

In [49]:
# tokenized_dataset = tokenizer([example['input_ids'] for example in custom_dataset],
# padding=True, truncation=True, return_tensors='pt', max_length=max_length)

In [50]:
# sample_data = ["I am eating"]
# tokenizer(sample_data, padding=True, truncation=True, max_length=256)

In [51]:
optimizer = AdamW(model.parameters(), lr = 0.0003)

In [52]:
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 1, gamma = 0.1)

In [53]:
batch_size = 16

In [54]:
train_dataloader = DataLoader(train_custom_dataset, batch_size=batch_size, shuffle=True)

val_dataloader = DataLoader(val_custom_dataset, batch_size=batch_size, shuffle=True)

In [55]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [60]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [61]:
epochs = 3
for epoch in range(epochs):
    
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Training Epoch {epoch + 1}/{epochs}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)

    
    model.eval()
    total_val_loss = 0
    val_preds = []
    val_labels = []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f'Validation Epoch {epoch + 1}/{epochs}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_val_loss += loss.item()
            val_preds.append(outputs.logits.detach().cpu().numpy())
            val_labels.append(labels.detach().cpu().numpy())

    avg_val_loss = total_val_loss / len(val_dataloader)
    val_preds = np.concatenate(val_preds, axis=0)
    val_labels = np.concatenate(val_labels, axis=0)
    metrics = compute_metrics((val_preds, val_labels))

    
    print(f'Epoch {epoch + 1}/{epochs}:')
    print(f'Average training loss: {avg_train_loss:.4f}')
    print(f'Average validation loss: {avg_val_loss:.4f}')
    print(f'Validation Metrics: {metrics}')

    scheduler.step()


Training Epoch 1/3: 100%|██████████| 63/63 [00:47<00:00,  1.32it/s]
Validation Epoch 1/3: 100%|██████████| 32/32 [00:08<00:00,  3.73it/s]


Epoch 1/3:
Average training loss: 0.4420
Average validation loss: 0.2735
Validation Metrics: {'accuracy': 0.902, 'precision': 0.8785714285714286, 'recall': 0.9425287356321839, 'f1': 0.9094269870609981}


Training Epoch 2/3: 100%|██████████| 63/63 [00:46<00:00,  1.36it/s]
Validation Epoch 2/3: 100%|██████████| 32/32 [00:08<00:00,  3.69it/s]


Epoch 2/3:
Average training loss: 0.1794
Average validation loss: 0.2151
Validation Metrics: {'accuracy': 0.918, 'precision': 0.9230769230769231, 'recall': 0.9195402298850575, 'f1': 0.9213051823416507}


Training Epoch 3/3: 100%|██████████| 63/63 [00:46<00:00,  1.36it/s]
Validation Epoch 3/3: 100%|██████████| 32/32 [00:08<00:00,  3.70it/s]

Epoch 3/3:
Average training loss: 0.1156
Average validation loss: 0.1973
Validation Metrics: {'accuracy': 0.918, 'precision': 0.9230769230769231, 'recall': 0.9195402298850575, 'f1': 0.9213051823416507}





In [62]:
# saving model state using pytorch .

torch.save(model.state_dict(), 'final_model_for_inference')

In [64]:
# model saving

model.save_pretrained('fm-sa')
tokenizer.save_pretrained('fm-sa')

('fm-sa/tokenizer_config.json',
 'fm-sa/special_tokens_map.json',
 'fm-sa/vocab.txt',
 'fm-sa/added_tokens.json')

In [25]:
# model loading
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model3 = DistilBertForSequenceClassification.from_pretrained('fm-sa', num_labels=2)
tokenizer2 = DistilBertTokenizer.from_pretrained('fm-sa')

In [7]:
# testing

text = "go to hell"
inputs = tokenizer2(text,padding = True, truncation = True, return_tensors='pt')
outputs = model3(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predictions = predictions.cpu().detach().numpy()
predictions

array([[0.02688701, 0.97311306]], dtype=float32)

In [26]:

text = ["You're so stupid, you can't even tie your own shoes!",
"I hope you fail miserably, you don't deserve any success.",
"You're worthless, just go crawl back into your hole.",
"You're a valuable member of the team, keep up the good work!",
"Congratulations on your promotion, you've earned it!",
"You're always so kind and helpful, it's a pleasure to work with you."]

In [27]:
for i in text:
    inputs = tokenizer2(i,padding = True, truncation = True, return_tensors='pt', max_length = 512)
    outputs = model3(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predictions = predictions.cpu().detach().numpy()
    print(predictions)
    
    max_index = np.argmax(predictions, axis = 1)
    if max_index == 1:
         print('Toxic')
    else:
        print('Non-toxic')

[[0.02659355 0.9734065 ]]
Toxic
[[0.48947707 0.5105229 ]]
Toxic
[[0.02659394 0.9734061 ]]
Toxic
[[0.81340986 0.18659009]]
Non-toxic
[[0.6715507  0.32844925]]
Non-toxic
[[0.49707934 0.50292075]]
Toxic


In [17]:
import joblib

In [24]:
joblib.dump(model3, 'model.joblib', compress=())

['model.joblib']

In [22]:
model4 = joblib.load('model.joblib')

In [23]:
model4 = DistilBertForSequenceClassification.from_pretrained(model4, num_labels=2)

OSError: fm-sa/model.safetensors is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`