In [1]:
# Install the Hugging Face transformers release this notebook was run with (pinned to 4.18.0).
!pip install transformers==4.18.0 --quiet

[K     |████████████████████████████████| 4.0 MB 8.0 MB/s 
[K     |████████████████████████████████| 596 kB 6.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 44.2 MB/s 
[K     |████████████████████████████████| 77 kB 2.1 MB/s 
[K     |████████████████████████████████| 880 kB 54.7 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
!pip install SentencePiece

Collecting SentencePiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 7.3 MB/s 
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.96


In [None]:
# NOTE(review): this installs whatever is on the moving `master` branch at run time,
# so the Lightning version differs between runs. Pin a release tag or version once
# the version the notebook was developed against is confirmed.
!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade 

In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import transformers
from transformers import AlbertTokenizer, AlbertModel

import torch
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl

In [5]:
# Mount Google Drive into the Colab runtime; the dataset CSV is read from MyDrive below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Load the labelled tweets; the first CSV column becomes the row index.
# The path is relative to the working directory (presumably /content, where
# the drive was mounted - confirm if the notebook is run elsewhere).
df = pd.read_csv(
    './drive/MyDrive/Colab Notebooks/Khilnani_LP_hate_speech_data.csv',
    index_col=0,
)
df.head()

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,""" momma said no pussy cats inside my doghouse """,0
2,"""@Addicted2Guys: -SimplyAddictedToGuys http://...",0
3,"""@AllAboutManFeet: http://t.co/3gzUpfuMev"" woo...",0
4,"""@Allyhaaaaa: Lemmie eat a Oreo &amp; do these...",0


In [7]:
df.shape

(5593, 2)

In [8]:
def remove_username(text):
  """Return `text` with every @mention removed.

  A mention is an '@' followed by one or more word characters or any of
  `? . ! / ; :`, so punctuation glued to the handle (e.g. the ':' in
  '@user:') is removed together with it. Surrounding whitespace is kept.
  """
  mention_pattern = re.compile(r'@[\w?.!/;:]+')
  return mention_pattern.sub('', text)

In [9]:
# Strip @mentions from every tweet, overwriting the 'tweet' column in place.
df['tweet']=df['tweet'].apply(remove_username)

In [10]:
df.head()

Unnamed: 0,tweet,class
0,!!! RT As a woman you shouldn't complain abou...,0
1,""" momma said no pussy cats inside my doghouse """,0
2,""" -SimplyAddictedToGuys http://t.co/1jL4hi8ZMF...",0
3,""" http://t.co/3gzUpfuMev"" woof woof and hot soles",0
4,""" Lemmie eat a Oreo &amp; do these dishes."" On...",0


In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on 'class' - consider stratify=df['class']
# if the label distribution is imbalanced.
train_df,test_df=train_test_split(df, test_size=0.3, random_state=42)
print(train_df.shape, test_df.shape)

(3915, 2) (1678, 2)


In [12]:
# Sequence length (in tokens) used when tokenizing tweets.
max_token_len=64

In [13]:
# Load the pretrained 'albert-base-v2' tokenizer and base model (fetched on first use).
# AlbertModel is loaded without the checkpoint's `predictions.*` head weights,
# which is why the loader reports them as unused.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained("albert-base-v2")

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.decoder.weight', 'predictions.dense.weight', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.dense.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Create tensor datasets
class HateSpeechDataset(Dataset):
  """Map-style dataset that tokenizes one tweet per item on demand.

  Each item is a dict with the keys `input_ids`, `attention_mask`,
  `token_type_ids` (1-D tensors padded/truncated to `max_token_len`) and
  `labels` (a one-element float tensor built from the row's 'class' value).
  """

  def __init__(self, data:pd.DataFrame, tokenizer=tokenizer, max_token_len=64):
    """Store the frame, tokenizer and sequence length; no tokenization happens here."""
    self.data = data
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def __len__(self):
    """Number of rows in the backing DataFrame."""
    return len(self.data)

  def __getitem__(self, index:int):
    """Tokenize the tweet at positional `index` and return its tensors and label."""
    row = self.data.iloc[index]
    text, label = row['tweet'], row['class']

    encoded = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_token_len,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Flatten each encoded tensor to 1-D so the DataLoader can stack items into a batch.
    item = {
        key: encoded[key].flatten()
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    item['labels'] = torch.FloatTensor([label])
    return item

In [15]:
# Create dataloaders in torch and Lightning.    
class HateSpeechDataModule(pl.LightningDataModule):
  """Lightning data module that serves the train/test tweet frames as DataLoaders.

  Args:
    train_df: DataFrame of training rows, wrapped in a HateSpeechDataset.
    test_df: DataFrame of held-out rows, wrapped in a HateSpeechDataset and
      served by both the validation and the test loader.
    tokenizer: tokenizer forwarded to HateSpeechDataset.
    batch_size: number of samples per batch for every loader.
    max_token_len: sequence length forwarded to HateSpeechDataset.
  """

  def __init__(self, train_df, test_df, tokenizer, batch_size=32,max_token_len=64):
    super().__init__()
    self.train_df = train_df
    self.test_df = test_df
    self.batch_size = batch_size
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def setup(self, stage=None):
    """Build the train and test datasets.

    Args:
      stage: accepted for compatibility with the Lightning hook signature and
        otherwise ignored. The Trainer invokes this hook as `setup(stage=...)`;
        without the parameter that call raises TypeError. Calling `setup()`
        by hand with no argument keeps working.
    """
    self.train_dataset = HateSpeechDataset(
        self.train_df, self.tokenizer, max_token_len=self.max_token_len)
    self.test_dataset = HateSpeechDataset(
        self.test_df, self.tokenizer, max_token_len=self.max_token_len)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(self.train_dataset, shuffle=True, batch_size=self.batch_size, num_workers=2)

  def val_dataloader(self):
    """Unshuffled loader over the test dataset.

    NOTE(review): validation reuses the test split, so validation and test
    metrics are computed on the same rows.
    """
    return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=2)

  def test_dataloader(self):
    """Unshuffled loader over the test dataset."""
    return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=2)

In [16]:
# validate the dataloader
max_token_len = 64
batch_size = 32
dm = HateSpeechDataModule(
    train_df,
    test_df,
    tokenizer=tokenizer,
    batch_size=batch_size,
    max_token_len=max_token_len,
)

In [17]:
# Build the datasets, then pull a single training batch so its keys and tensor
# shapes can be inspected in the cells below.
dm.setup()
input_data=next(iter(dm.train_dataloader()))

In [18]:
input_data.keys()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

In [19]:
input_data['input_ids'].shape

torch.Size([32, 64])

In [20]:
input_data['attention_mask'].shape

torch.Size([32, 64])

In [21]:
input_data['token_type_ids'].shape

torch.Size([32, 64])

In [22]:
input_data['labels'].shape

torch.Size([32, 1])