<a href="https://colab.research.google.com/github/tangxsusan/fine_tune_bert/blob/main/3_Manning_1_HateSpeechDetection_submit2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers==4.18.0 --quiet

[K     |████████████████████████████████| 4.0 MB 16.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 63.6 MB/s 
[K     |████████████████████████████████| 880 kB 62.3 MB/s 
[K     |████████████████████████████████| 84 kB 3.0 MB/s 
[K     |████████████████████████████████| 596 kB 50.3 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
!pip install SentencePiece

Collecting SentencePiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 31.9 MB/s eta 0:00:01[K     |▌                               | 20 kB 39.6 MB/s eta 0:00:01[K     |▉                               | 30 kB 41.2 MB/s eta 0:00:01[K     |█                               | 40 kB 29.8 MB/s eta 0:00:01[K     |█▍                              | 51 kB 22.4 MB/s eta 0:00:01[K     |█▋                              | 61 kB 25.6 MB/s eta 0:00:01[K     |██                              | 71 kB 25.9 MB/s eta 0:00:01[K     |██▏                             | 81 kB 27.3 MB/s eta 0:00:01[K     |██▍                             | 92 kB 29.3 MB/s eta 0:00:01[K     |██▊                             | 102 kB 31.2 MB/s eta 0:00:01[K     |███                             | 112 kB 31.2 MB/s eta 0:00:01[K     |███▎                            | 122 kB 31.2 MB/s eta 0:00:01[K     |██

In [None]:
# NOTE(review): this installs whatever is on the `master` branch at run time, so
# two runs of the notebook can get different Lightning versions. Consider
# pinning to a tagged release once the required version is confirmed.
!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade 

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import transformers
from transformers import AlbertTokenizer, AlbertModel

import torch
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the tweet dataset from the mounted Drive folder; the first CSV column
# is used as the DataFrame index rather than as a data column.
df=pd.read_csv('./drive/MyDrive/Colab Notebooks/Khilnani_LP_hate_speech_data.csv', index_col=0)
df.head()

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,""" momma said no pussy cats inside my doghouse """,0
2,"""@Addicted2Guys: -SimplyAddictedToGuys http://...",0
3,"""@AllAboutManFeet: http://t.co/3gzUpfuMev"" woo...",0
4,"""@Allyhaaaaa: Lemmie eat a Oreo &amp; do these...",0


In [None]:
df.shape

(5593, 2)

In [None]:
def remove_username(text):
  """Return `text` with every @-mention removed.

  A mention is an ``@`` followed by one or more word characters or any of
  ``? . ! / ; :``, so punctuation glued to the handle (for example the colon
  in ``@user:``) is removed together with it. Surrounding whitespace is kept.
  """
  mention_pattern = re.compile(r'@[\w?.!/;:]+')
  return mention_pattern.sub('', text)

In [None]:
df['tweet']=df['tweet'].apply(remove_username)

In [None]:
df.head()

Unnamed: 0,tweet,class
0,!!! RT As a woman you shouldn't complain abou...,0
1,""" momma said no pussy cats inside my doghouse """,0
2,""" -SimplyAddictedToGuys http://t.co/1jL4hi8ZMF...",0
3,""" http://t.co/3gzUpfuMev"" woof woof and hot soles",0
4,""" Lemmie eat a Oreo &amp; do these dishes."" On...",0


In [None]:
# Hold out 30% of the rows as the test frame; random_state is fixed so the
# same rows land in each split on every run.
train_df,test_df=train_test_split(df, test_size=0.3, random_state=42)
print(train_df.shape, test_df.shape)

(3915, 2) (1678, 2)


In [None]:
max_token_len=64

In [None]:
# Pretrained ALBERT tokenizer; HateSpeechDataset uses this module-level object
# as its default `tokenizer` argument, so this cell must run before that class
# is defined.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
# NOTE(review): this bare encoder is not referenced again in the visible cells,
# and the name `model` is later rebound to a HateSpeechTagger instance (which
# loads its own copy of the weights) — confirm whether this load is still needed.
model = AlbertModel.from_pretrained("albert-base-v2")

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.decoder.bias', 'predictions.dense.weight', 'predictions.bias', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Create tensor datasets
class HateSpeechDataset(Dataset):
  """Map-style dataset that tokenizes one tweet per item.

  Rows are read positionally from `data`, which must provide a ``tweet``
  column (the text) and a ``class`` column (the label). Each item is a dict
  with ``input_ids``, ``attention_mask`` and ``token_type_ids`` (each
  flattened to 1-D) plus ``labels``, a one-element float tensor holding the
  row's ``class`` value.
  """

  # Tokenizer outputs copied into every item, in this order.
  _ENCODED_FIELDS = ('input_ids', 'attention_mask', 'token_type_ids')

  def __init__(self, data:pd.DataFrame, tokenizer=tokenizer, max_token_len=64):
    self.data=data
    self.tokenizer=tokenizer
    self.max_token_len=max_token_len

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self, index:int):
    """Tokenize the row at position `index` and return it as a tensor dict."""
    row=self.data.iloc[index]
    tweet_text=row['tweet']
    label_value=row['class']

    # Every tweet is padded or truncated to exactly `max_token_len` tokens so
    # that items can be stacked into a batch.
    encoded=self.tokenizer.encode_plus(
        tweet_text,
        add_special_tokens=True,
        max_length=self.max_token_len,
        return_token_type_ids=True,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt')

    # return_tensors='pt' adds a leading batch axis; flatten() drops it.
    item={field: encoded[field].flatten() for field in self._ENCODED_FIELDS}
    item['labels']=torch.FloatTensor([label_value])
    return item

In [None]:
# Create dataloaders in torch and Lightning.    
class HateSpeechDataModule(pl.LightningDataModule):
  """Lightning data module that serves the train/test frames as DataLoaders.

  `setup` wraps each frame in a HateSpeechDataset. The validation loader and
  the test loader both iterate over the dataset built from `test_df`; only
  the training loader shuffles.
  """

  def __init__(self, train_df, test_df, tokenizer, batch_size=32,max_token_len=64):
    super().__init__()
    self.train_df=train_df
    self.test_df =test_df
    self.tokenizer=tokenizer
    self.batch_size=batch_size
    self.max_token_len=max_token_len

  def _build_dataset(self, frame):
    """Wrap `frame` in a HateSpeechDataset using this module's tokenizer settings."""
    return HateSpeechDataset(frame, self.tokenizer, max_token_len=self.max_token_len)

  def _build_loader(self, dataset, **loader_kwargs):
    """Create a DataLoader over `dataset` with the shared batch size and worker count."""
    return DataLoader(dataset, batch_size=self.batch_size, num_workers=2, **loader_kwargs)

  def setup(self, stage=None):
    """Create the train and test datasets; `stage` is accepted but not used."""
    self.train_dataset=self._build_dataset(self.train_df)
    self.test_dataset =self._build_dataset(self.test_df)

  def train_dataloader(self):
    return self._build_loader(self.train_dataset, shuffle=True)

  def val_dataloader(self):
    # Validation reuses the test dataset; there is no separate validation split.
    return self._build_loader(self.test_dataset)

  def test_dataloader(self):
    return self._build_loader(self.test_dataset)

In [None]:
# validate the dataloader
# These values equal the defaults declared on HateSpeechDataModule; they are
# passed explicitly so the batch shapes checked in the following cells are
# visible here.
max_token_len=64
batch_size=32
dm=HateSpeechDataModule(train_df, test_df,tokenizer=tokenizer,max_token_len=max_token_len,batch_size=batch_size)

In [None]:
dm.setup()
input_data=next(iter(dm.train_dataloader()))

In [None]:
input_data.keys()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

In [None]:
input_data['input_ids'].shape

torch.Size([32, 64])

In [None]:
input_data['attention_mask'].shape

torch.Size([32, 64])

In [None]:
input_data['token_type_ids'].shape

torch.Size([32, 64])

In [None]:
input_data['labels'].shape

torch.Size([32, 1])

In [None]:
class HateSpeechTagger(pl.LightningModule):
  """ALBERT encoder with a two-layer classification head.

  The first-token hidden state of ``albert-base-v2`` is passed through
  Linear -> ReLU -> Dropout(0.5) -> Linear, and `forward` returns
  log-probabilities over `n_classes`. Training and validation both minimise
  the negative log-likelihood of the integer class label.

  Args:
    n_classes: number of output classes (width of the final linear layer).
    learning_rate, batch_size, num_workers, **kwargs: accepted so existing
      call sites keep working, but not read by this class. The optimizer
      uses a fixed learning rate (see `configure_optimizers`).
  """

  def __init__(self,n_classes: int=2, learning_rate:float=0.0001*8,
               batch_size: int=16,
               num_workers: int=4,
               **kwargs):
    super().__init__()
    self.n_classes=n_classes

    self.bert=AlbertModel.from_pretrained('albert-base-v2')
    # Size the head from the encoder's own config instead of a literal 768.
    hidden_size=self.bert.config.hidden_size
    self.linear1=torch.nn.Linear(hidden_size, hidden_size)
    self.linear2=torch.nn.Linear(hidden_size, n_classes)
    self.relu=torch.nn.ReLU()
    self.dropout=torch.nn.Dropout(0.5)

  def forward(self,input_ids, attention_mask, labels=None):
    """Return per-class log-probabilities for a batch of token sequences.

    Args:
      input_ids: token ids forwarded to the encoder.
      attention_mask: padding mask forwarded to the encoder.
      labels: unused; kept (now optional) so existing three-argument calls
        still work and inference can omit it.

    Returns:
      Tensor of log-probabilities, normalised over the last (class) axis.
    """
    encoder_output=self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Element 0 is the sequence of hidden states; position 0 is the first token.
    first_token_state=encoder_output[0][:,0]
    hidden=self.linear1(first_token_state)
    hidden=self.relu(hidden)
    hidden=self.dropout(hidden)
    logits=self.linear2(hidden)
    # An explicit dim avoids the deprecated implicit-dimension behaviour.
    return torch.nn.functional.log_softmax(logits, dim=-1)

  def _nll_loss(self, batch):
    """Run the model on `batch` and return the NLL loss against its labels."""
    labels=batch['labels']
    log_probs=self(batch['input_ids'], batch['attention_mask'], labels)
    # Labels arrive as a float column vector; NLL needs a flat vector of class indices.
    targets=labels.view(-1).long()
    return torch.nn.functional.nll_loss(log_probs.view(-1,self.n_classes), targets)

  def training_step(self, batch, batch_nb):
    """Compute, log and return the training loss for one batch."""
    loss=self._nll_loss(batch)
    self.log('train_loss', loss)
    return loss

  def validation_step(self, batch, batch_nb):
    """Compute, log and return the validation loss for one batch."""
    loss=self._nll_loss(batch)
    # Without logging, the value returned here is never reported anywhere.
    self.log('val_loss', loss, prog_bar=True)
    return loss

  def configure_optimizers(self):
    """Adam over all parameters (encoder and head) with a fixed 1e-5 learning rate."""
    # NOTE(review): the constructor's `learning_rate` argument is not used here;
    # wiring it in would change the effective rate from 1e-5 to the 8e-4 default.
    optimizer=torch.optim.Adam(self.parameters(), lr=1e-5)
    return [optimizer]

In [None]:
model=HateSpeechTagger()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.decoder.bias', 'predictions.dense.weight', 'predictions.bias', 'predictions.decoder.weight', 'predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Train on a single GPU for one epoch; this cell requires a GPU runtime.
trainer=pl.Trainer(accelerator='gpu', devices=1, max_epochs=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(model,dm)

Missing logger folder: /content/lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type        | Params
----------------------------------------
0 | bert    | AlbertModel | 11.7 M
1 | linear1 | Linear      | 590 K 
2 | linear2 | Linear      | 1.5 K 
3 | relu    | ReLU        | 0     
4 | dropout | Dropout     | 0     
----------------------------------------
12.3 M    Trainable params
0         Non-trainable params
12.3 M    Total params
49.103    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:
model

HateSpeechTagger(
  (bert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
     