In [1]:
!pip install transformers==4.18.0 --quiet

[K     |████████████████████████████████| 4.0 MB 653 kB/s 
[K     |████████████████████████████████| 596 kB 40.5 MB/s 
[K     |████████████████████████████████| 77 kB 6.6 MB/s 
[K     |████████████████████████████████| 880 kB 43.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 41.2 MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
!pip install SentencePiece

Collecting SentencePiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 660 kB/s 
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.96


In [None]:
!pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@master --upgrade 

In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import transformers
from transformers import AlbertTokenizer, AlbertModel

import torch
from torch.utils.data import DataLoader, Dataset

import pytorch_lightning as pl

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
df=pd.read_csv('./drive/MyDrive/Colab Notebooks/Khilnani_LP_hate_speech_data.csv', index_col=0)
df.head()

Unnamed: 0,tweet,class
0,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,""" momma said no pussy cats inside my doghouse """,0
2,"""@Addicted2Guys: -SimplyAddictedToGuys http://...",0
3,"""@AllAboutManFeet: http://t.co/3gzUpfuMev"" woo...",0
4,"""@Allyhaaaaa: Lemmie eat a Oreo &amp; do these...",0


In [7]:
df.shape

(5593, 2)

In [8]:
def remove_username(text):
  """Return `text` with every @-mention deleted.

  A mention is an ``@`` followed by one or more word characters or any of
  ``? . ! / ; :``. Only the matched run is removed; surrounding whitespace
  is left exactly as it was.
  """
  mention_pattern = re.compile(r'@[\w?.!/;:]+')
  return mention_pattern.sub('', text)

In [9]:
df['tweet']=df['tweet'].apply(remove_username)

In [10]:
df.head()

Unnamed: 0,tweet,class
0,!!! RT As a woman you shouldn't complain abou...,0
1,""" momma said no pussy cats inside my doghouse """,0
2,""" -SimplyAddictedToGuys http://t.co/1jL4hi8ZMF...",0
3,""" http://t.co/3gzUpfuMev"" woof woof and hot soles",0
4,""" Lemmie eat a Oreo &amp; do these dishes."" On...",0


In [11]:
train_df,test_df=train_test_split(df, test_size=0.3, random_state=42)
print(train_df.shape, test_df.shape)

(3915, 2) (1678, 2)


In [12]:
max_token_len=64

In [13]:
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained("albert-base-v2")

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Create tensor datasets
class HateSpeechDataset(Dataset):
  """Dataset that tokenizes one tweet per item.

  Args:
    data: frame indexed by position; each row supplies a 'tweet' text and a
      'class' label.
    tokenizer: object exposing ``encode_plus``; defaults to the notebook-level
      ``tokenizer`` that exists when this class is defined.
    max_token_len: length every encoded sequence is padded or truncated to.
  """

  def __init__(self, data:pd.DataFrame, tokenizer=tokenizer, max_token_len=64):
    self.data=data
    self.tokenizer=tokenizer
    self.max_token_len=max_token_len

  def __len__(self):
    """Number of rows in the wrapped frame."""
    return len(self.data)

  def __getitem__(self, index:int):
    """Encode the row at position `index`.

    Returns:
      dict with 'input_ids', 'attention_mask' and 'token_type_ids' (each
      flattened) plus 'labels', a float tensor holding the single class value.
    """
    row=self.data.iloc[index]
    tweet, label=row['tweet'], row['class']

    encoded=self.tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=self.max_token_len,
        return_token_type_ids=True,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # return_tensors='pt' yields a leading batch axis; flatten drops it per item.
    item={
        field: encoded[field].flatten()
        for field in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    item['labels']=torch.FloatTensor([label])
    return item

In [16]:
# Create dataloaders in torch and Lightning.    
class HateSpeechDataModule(pl.LightningDataModule):
  """Lightning data module serving the train and test frames as DataLoaders.

  Args:
    train_df: frame wrapped for training.
    test_df: frame wrapped for both validation and testing.
    tokenizer: tokenizer forwarded to every HateSpeechDataset.
    batch_size: batch size used by all three loaders.
    max_token_len: sequence length forwarded to every HateSpeechDataset.
  """

  def __init__(self, train_df, test_df, tokenizer, batch_size=32,max_token_len=64):
    super().__init__()
    self.train_df=train_df
    self.test_df =test_df
    self.batch_size=batch_size
    self.tokenizer=tokenizer
    self.max_token_len=max_token_len

  def setup(self, stage=None):
    """Build the train and test datasets.

    `stage` is accepted because the Lightning Trainer passes it when it
    invokes this hook; it is not used here, so both datasets are always
    built. Calling `setup()` manually with no argument keeps working.
    """
    self.train_dataset=HateSpeechDataset(self.train_df, self.tokenizer,max_token_len=self.max_token_len)
    self.test_dataset =HateSpeechDataset(self.test_df, self.tokenizer, max_token_len=self.max_token_len)

  def train_dataloader(self):
    """Shuffled loader over the training dataset."""
    return DataLoader(self.train_dataset, shuffle=True, batch_size=self.batch_size, num_workers=2)

  def val_dataloader(self):
    """Loader used for validation.

    No separate validation frame is passed in, so this reuses the test dataset.
    """
    return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=2)

  def test_dataloader(self):
    """Unshuffled loader over the test dataset."""
    return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=2)

In [17]:
# validate the dataloader
max_token_len=64
batch_size=32
dm=HateSpeechDataModule(train_df, test_df,tokenizer=tokenizer,max_token_len=max_token_len,batch_size=batch_size)

In [19]:
dm.setup()
input_data=next(iter(dm.train_dataloader()))

In [20]:
input_data.keys()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids', 'labels'])

In [21]:
input_data['input_ids'].shape

torch.Size([32, 64])

In [22]:
input_data['attention_mask'].shape

torch.Size([32, 64])

In [23]:
input_data['token_type_ids'].shape

torch.Size([32, 64])

In [25]:
input_data['labels'].shape

torch.Size([32, 1])

In [None]:
### sample
from transformers import pipeline
# Plain statements: the pasted ">>>" REPL prompts were removed so the cell is
# valid Python even when the notebook is exported to a script.
unmasker = pipeline('fill-mask', model='albert-base-v2')
unmasker("Hello I'm a [MASK] model.")

In [None]:
from transformers import pipeline

In [None]:
unmarker=pipeline('fill-mask', model='albert-base-v2')

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/742k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

In [None]:
unmarker("hello I'm a [MASK] model")

[{'score': 0.030899707227945328,
  'sequence': "hello i'm a joyah model",
  'token': 28153,
  'token_str': 'joyah'},
 {'score': 0.010693277232348919,
  'sequence': "hello i'm a klu model",
  'token': 15744,
  'token_str': 'klu'},
 {'score': 0.007535903714597225,
  'sequence': "hello i'm a surfer model",
  'token': 22668,
  'token_str': 'surfer'},
 {'score': 0.0063470955938100815,
  'sequence': "hello i'm a gecko model",
  'token': 28456,
  'token_str': 'gecko'},
 {'score': 0.005700427107512951,
  'sequence': "hello i'm a lady model",
  'token': 1402,
  'token_str': 'lady'}]

In [None]:
## sample

from transformers import AlbertTokenizer, AlbertModel
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained("albert-base-v2")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [None]:
text = "Replace me by any text you'd like."
encoded_input=tokenizer(text, return_tensors='pt')


In [None]:
# Dedicated loop names: reusing `text` here would overwrite the notebook-level
# `text` string with the last tensor of the encoding.
for field_name, field_tensor in encoded_input.items():
  print(field_name,':', field_tensor)

input_ids : tensor([[   2, 3934,   55,   34,  186, 1854,   42,   22,   43,  101,    9,    3]])
token_type_ids : tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask : tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [None]:
from transformers import AlbertTokenizer, AlbertModel

In [None]:
tokenizer=AlbertTokenizer.from_pretrained('albert-base-v2')
model=AlbertModel.from_pretrained('albert-base-v2')

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.dense.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# dataset.map sample
# The BERT tokenizer works on a string, a list/tuple of strings, or a list/tuple of integers, so check whether your data is actually being converted to a string.
# To apply the tokenizer to the whole dataset I used Dataset.map, but this runs in graph mode, so the call needs to be wrapped in a tf.py_function.
# tf.py_function passes regular tensors (with a value and a .numpy() method to access it) to the wrapped Python function.
# My data was converted to bytes after going through py_function, hence I applied tf.compat.as_str to convert the bytes back to a string.
# https://stackoverflow.com/questions/61555097/mapping-text-data-through-huggingface-tokenizer

In [None]:
from transformers import BertTokenizer
import tensorflow as tf

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def encode(lang1, lang2):
    """Tokenize a pair of eager string tensors with the notebook-level `tokenizer`.

    Each argument is read through ``.numpy()`` and ``tf.compat.as_str`` and
    then encoded with special tokens added. The two results are returned in
    the same order as the arguments.
    """
    def _to_token_ids(text_tensor):
        raw_text = tf.compat.as_str(text_tensor.numpy())
        return tokenizer.encode(raw_text, add_special_tokens=True)

    return _to_token_ids(lang1), _to_token_ids(lang2)
def tf_encode(pt, en):
    """Run `encode` on a pair of tensors through ``tf.py_function``.

    Both outputs are declared as int64 and given a one-dimensional shape of
    unknown length before being returned as ``(result_pt, result_en)``.
    """
    encoded_pair = tf.py_function(func=encode, inp=[pt, en], Tout=[tf.int64, tf.int64])
    result_pt, result_en = encoded_pair
    # py_function outputs carry no static shape; declare rank 1 explicitly.
    for token_ids in (result_pt, result_en):
        token_ids.set_shape([None])
    return result_pt, result_en

df_train = pd.DataFrame({'comment_text': ['Today was a good day']*5})

BUFFER_SIZE = 200
BATCH_SIZE = 64

# shuffle / padded_batch are tf.data.Dataset methods, not pandas ones, and
# tf_encode takes two string tensors. Build a tf.data.Dataset of
# (text, text) pairs first; the column is paired with itself only because
# this toy frame has a single text column.
comment_texts = df_train['comment_text'].astype(str).values
text_pairs = tf.data.Dataset.from_tensor_slices((comment_texts, comment_texts))

train_dataset = (
    text_pairs
    .map(tf_encode)
    .shuffle(BUFFER_SIZE)
    # Pad both id sequences of every pair to length 60.
    .padded_batch(BATCH_SIZE, padded_shapes=(60, 60))
)
a,p = next(iter(train_dataset))

https://stackoverflow.com/questions/61555097/mapping-text-data-through-huggingface-tokenizer

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
!gdown --id 1VuQ-U7TtggShMeuRSA_hzC8qGDl2LRkr

Downloading...
From: https://drive.google.com/uc?id=1VuQ-U7TtggShMeuRSA_hzC8qGDl2LRkr
To: /content/toxic_comments.csv
100% 68.8M/68.8M [00:00<00:00, 74.9MB/s]


In [None]:
df=pd.read_csv('toxic_comments.csv')

In [None]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
MODEL = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL)

In [None]:
def encode(texts, tokenizer=tokenizer, maxlen=10):
    """Tokenize one text into fixed-length TensorFlow tensors.

    Args:
        texts: the text handed to ``tokenizer.encode_plus``.
        tokenizer: tokenizer to use; defaults to the notebook-level
            ``tokenizer`` bound when this function is defined.
        maxlen: target sequence length; shorter inputs are padded up to it
            and longer inputs are truncated down to it.

    Returns:
        Tuple ``(input_ids, token_type_ids, attention_mask)`` taken from the
        encoding produced with ``return_tensors='tf'``.
    """
    inputs = tokenizer.encode_plus(
        texts,
        return_tensors='tf',
        # Singular keyword: the plural `return_attention_masks` is not an
        # encode_plus parameter.
        return_attention_mask=True,
        return_token_type_ids=True,
        # Replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        # Explicit, so the length cap does not rely on a backward-compat fallback.
        truncation=True,
        max_length=maxlen
    )

    return inputs['input_ids'], inputs["token_type_ids"], inputs["attention_mask"]

In [None]:
def _encode_eager(text):
    """Decode an eager scalar string tensor and run it through `encode`."""
    encoded = encode(tf.compat.as_str(text.numpy()))
    # Cast so the outputs always match the dtypes declared to tf.py_function.
    return tuple(tf.cast(part, tf.int32) for part in encoded)


def _tf_encode_text(text):
    """Wrap `_encode_eager` in tf.py_function so Dataset.map can call it.

    Dataset.map traces its function in graph mode, where the tokenizer cannot
    read the string value; tf.py_function hands it an eager tensor instead.
    """
    outputs = tf.py_function(func=_encode_eager, inp=[text],
                             Tout=[tf.int32, tf.int32, tf.int32])
    # Return a tuple: a list would be packed into a single tensor by tf.data.
    return tuple(outputs)


# `df` is the frame loaded from toxic_comments.csv; `df_dataset` is only
# created further down and is not a DataFrame.
x_train = (tf.data.Dataset.from_tensor_slices(df['comment_text'].astype(str).values)
           .map(_tf_encode_text))

In [None]:
!pip install --quiet 'datasets'

In [None]:

from datasets import load_dataset

In [None]:
import os

In [None]:
os.getcwd()

'/content'

In [None]:
os.listdir()

['.config', 'toxic_comments.csv', 'drive', 'sample_data']

In [None]:
df_dataset=load_dataset('csv', data_files='toxic_comments.csv')

In [None]:
df_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'],
        num_rows: 159571
    })
})