In [1]:
import torch
from torchtext import data
from torchtext import datasets
import random
import numpy as np
import time
from sklearn.metrics import classification_report,accuracy_score,f1_score


SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
#reading data

import pandas as pd
from glob import glob

file_train = "/content/tamil_train_transliterated.csv"
file_dev = "/content/tamil_off_dev_transliterated.csv"
file_test = "/content/tamil_off_test_transliterated.csv"


import csv
df_train = pd.read_csv(file_train,sep="\t",encoding='utf-8',quoting=csv.QUOTE_NONE,usecols=[0,1])
df_dev = pd.read_csv(file_dev,sep="\t",encoding='utf-8',quoting=csv.QUOTE_NONE,usecols=[0,1])
df_test = pd.read_csv(file_test,sep=",",encoding='utf-8',quoting=csv.QUOTE_NONE,usecols=[0,1])

df_train = df_train.dropna()
df_dev = df_dev.dropna()
df_test = df_test.dropna()


#train_sentences = df_train.values

train_sentences = list(df_train['text'].values)
train_labels = list(df_train['label'].values)

#dev_sentences, dev_labels = df_dev.values

dev_sentences = list(df_dev['text'].values)
dev_labels = list(df_dev['label'].values)

test = df_test.values
test_sentences = list(test[:,0])
test_labels = test[:,1]

def clear_labels(labels_list):
  new_labels_list = []
  for item in labels_list:
    item = item.replace("\n","").replace("\"","")
    item = item.strip()
    if item == 'not-Kannada':
      item = 'not-kannada'
    new_labels_list.append(item)
  return new_labels_list

train_labels = clear_labels(train_labels)
dev_labels = clear_labels(dev_labels)
test_labels = clear_labels(test_labels)

print(set(train_labels))

{'Offensive_Targeted_Insult_Individual', 'Offensive_Targeted_Insult_Other', 'Offensive_Targeted_Insult_Group', 'not-Tamil', 'Offensive_Untargetede', 'Not_offensive'}


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 520 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting 

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model_mbert = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=6)
model_mbert = model_mbert.to(device)

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [5]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.3 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [6]:
from transformers import XLMRobertaTokenizerFast, XLMRobertaForSequenceClassification
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
#model_cm_xlmr = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=6)
model_cm_xlmr = XLMRobertaForSequenceClassification.from_pretrained('../../CM_bert/', num_labels=6)
model_cm_xlmr = model_cm_xlmr.to(device)

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

In [7]:
#building tokenizer models

from transformers import AutoTokenizer, AutoModel

models = []
tokenizers = []

model_names = [
    'bert-base-multilingual-cased',
    'xlm-roberta-base',
]
tokenizers = [
    BertTokenizer.from_pretrained('bert-base-multilingual-cased'),
    XLMRobertaTokenizerFast.from_pretrained('../../CM_bert/'),
]

for name in model_names:
    model = AutoModel.from_pretrained(name)
    model.eval()
    models.append(model)

for model in models:
  for param in model.parameters():
      param.requires_grad = False

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.bi

In [8]:
n_models = len(models)

In [9]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(train_labels)
encoded_train_labels = le.transform(train_labels)
encoded_dev_labels = le.transform(dev_labels)
encoded_test_labels = le.transform(test_labels)

print(set(encoded_train_labels))
print(le.classes_)

{0, 1, 2, 3, 4, 5}
['Not_offensive' 'Offensive_Targeted_Insult_Group'
 'Offensive_Targeted_Insult_Individual' 'Offensive_Targeted_Insult_Other'
 'Offensive_Untargetede' 'not-Tamil']


In [10]:
print(train_sentences[:10])

['திரைப்படம் இரு நிலை தி எரிகா பொகுது', 'I love Ajith Kumar Vivegam movie inki mjy bht achi lgi', 'படம் நல்ல நகைச்சுவை படாம இருகும் போலை', 'கார்த்திக் சுப்பராஜ் அன்னி இந்த படம் வெற்றி அடைய உணகளுக்கு என்னுடய வழ்துக்கள்', 'கவுண்டர் தேவர்சார்பாக வெற்றி பெற வாழ்த்துக்கள் :lion_face:', 'இப்போ இந்த டிரெய்லர் ஆ பற்குறவன ஒரு போன்ற பொடுங்க', 'இல் தலைவன் யோகி பாபு இருக்கார் படம் வேரா நிலை லா இருக்கும்', 'நெற்கொண்டா பர்வை சேமா சேமா சேமா டிரெய்லர்', 'ஏய் இது 96 யார் ஏமாத்தறீங்க செம பின்னிட்டீங்க', 'படம் கண்டிப்பாக வெற்றி பெற வேண்டும் செம்ம இரு நிலை']


In [11]:
for tokenizer in tokenizers:
  print(tokenizer)

PreTrainedTokenizer(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
PreTrainedTokenizerFast(name_or_path='xlm-roberta-base', vocab_size=250002, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})


In [12]:
train_tokenized = [tokenizer(train_sentences, padding='max_length', truncation=True, max_length=64, return_tensors="pt") for tokenizer in tokenizers]
train_labels = torch.tensor(encoded_train_labels)
dev_tokenized = [tokenizer(dev_sentences, padding='max_length', truncation=True, max_length=64, return_tensors="pt") for tokenizer in tokenizers]
dev_labels = torch.tensor(encoded_dev_labels)
test_tokenized = [tokenizer(test_sentences, padding='max_length', truncation=True, max_length=64, return_tensors="pt") for tokenizer in tokenizers]
test_labels = torch.tensor(encoded_test_labels)

In [13]:
from torch.utils.data import Dataset

class fusion_Dataset(Dataset):
    def __init__(self, data, labels = None):
        self.data = data
        self.labels = labels
        self.n_models = 2 #2 models

    def __getitem__(self, idx):
        item = {}
        for i in range(self.n_models):
            item.update({key+'_'+str(i): torch.tensor(val[idx]) for key, val in self.data[i].items()})
        item['index'] = idx
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.data[0]['input_ids'])

# Defining Datasets
train_dataset = fusion_Dataset(train_tokenized, train_labels)
dev_dataset = fusion_Dataset(dev_tokenized, dev_labels)
test_dataset = fusion_Dataset(test_tokenized,test_labels)

In [14]:
print(train_dataset[0])

{'input_ids_0': tensor([   101,  98498,  12520,  61233,  87886,  55993,   1146,  21426, 105076,
          1162, 111313,  48305,  14124,    102,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0]), 'token_type_ids_0': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask_0': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

  if sys.path[0] == '':
  


In [15]:
import torch.nn.functional as F
import torch.nn as nn

# Basic Fully-Connected (Linear => BatchNorm => ReLU)
class BasicFC(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicFC, self).__init__()
        self.fc = nn.Linear(in_channels, out_channels, **kwargs)
        self.bn = nn.BatchNorm1d(out_channels, eps=0.001)

    def forward(self, x):
        x = self.fc(x)
        x = self.bn(x)
        return F.relu(x, inplace=True)

class FusionNet(torch.nn.Module):
    def __init__(self, D_in, H1, H2, H3, D_out):
        super(FusionNet, self).__init__()
        self.linear1_1 = BasicFC(D_in, H1)
        self.linear1_2 = BasicFC(H1, H2)
        self.linear1_3 = BasicFC(H2, H3)
        self.dp = nn.Dropout(0.1)
        self.linear2 = torch.nn.Linear(H3, D_out, bias = False)

    def forward(self, x):
        h_relu_1 = self.linear1_1(x)
        h_relu_2 = self.dp(self.linear1_2(h_relu_1))
        h_relu_3 = self.dp(self.linear1_3(h_relu_2))
        y_pred = self.linear2(h_relu_3)
        return y_pred

In [16]:
from transformers import AdamW
loss = nn.CrossEntropyLoss(reduction='mean').float()

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
fin = open('bad-words.txt')
hate_speech_lexicon = []
for line in fin:
  hate_speech_lexicon.append(line.replace("\n",""))
print(hate_speech_lexicon)

profanity_vector = CountVectorizer(vocabulary=set(hate_speech_lexicon))
print(profanity_vector.get_feature_names)
train_profanity_vector = csr_matrix.toarray(profanity_vector.transform(train_sentences))
dev_profanity_vector = csr_matrix.toarray(profanity_vector.transform(dev_sentences))
test_profanity_vector = csr_matrix.toarray(profanity_vector.transform(test_sentences))

['', 'abbo', 'abo', 'abortion', 'abuse', 'addict', 'addicts', 'adult', 'africa', 'african', 'alla', 'allah', 'alligatorbait', 'amateur', 'american', 'anal', 'analannie', 'analsex', 'angie', 'angry', 'anus', 'arab', 'arabs', 'areola', 'argie', 'aroused', 'arse', 'arsehole', 'asian', 'ass', 'assassin', 'assassinate', 'assassination', 'assault', 'assbagger', 'assblaster', 'assclown', 'asscowboy', 'asses', 'assfuck', 'assfucker', 'asshat', 'asshole', 'assholes', 'asshore', 'assjockey', 'asskiss', 'asskisser', 'assklown', 'asslick', 'asslicker', 'asslover', 'assman', 'assmonkey', 'assmunch', 'assmuncher', 'asspacker', 'asspirate', 'asspuppies', 'assranger', 'asswhore', 'asswipe', 'athletesfoot', 'attack', 'australian', 'babe', 'babies', 'backdoor', 'backdoorman', 'backseat', 'badfuck', 'balllicker', 'balls', 'ballsack', 'banging', 'baptist', 'barelylegal', 'barf', 'barface', 'barfface', 'bast', 'bastard ', 'bazongas', 'bazooms', 'beaner', 'beast', 'beastality', 'beastial', 'beastiality', 'b

In [81]:
import torch.nn.functional as F
import torch.nn as nn

class BasicFC(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicFC, self).__init__()
        self.fc = nn.Linear(in_channels, out_channels, **kwargs)
        self.bn = nn.BatchNorm1d(out_channels, eps=0.001)

    def forward(self, x):
        x = self.fc(x)
        x = self.bn(x)
        return F.relu(x, inplace=True)

class fusionNN(torch.nn.Module):
    def __init__(self, Embed_dim, hidden_dim_1, hidden_dim_2, hidden_dim_3, out_dim):
        super(fusionNN, self).__init__()
        self.linear1 = BasicFC(Embed_dim, hidden_dim_1)
        self.linear2 = BasicFC(hidden_dim_1, hidden_dim_2)
        self.linear3 = BasicFC(hidden_dim_2, hidden_dim_3)
        self.dropout = nn.Dropout(0.1)
        self.linear_out = torch.nn.Linear(hidden_dim_3, out_dim, bias = False)

    def forward(self, x):
        h1 = self.linear1(x)
        h2 = self.dropout(self.linear2(h1))
        h3 = self.dropout(self.linear3(h2))
        y_pred = self.linear_out(h3)
        #print(y_pred.shape)
        return y_pred

In [82]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, f1_score

emb_dim = 768*2+train_profanity_vector.shape[1]
print(emb_dim)
fusion_model = fusionNN(emb_dim, 1024, 256, 64, len(le.classes_))

optimizer = AdamW(fusion_model.parameters(), lr=1e-5)
fusion_model.to(device)

for model in models:
    model.to(device)

best_val_f1 = 0
count = 0

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

2920


In [83]:
print(fusion_model)

fusionNN(
  (linear1): BasicFC(
    (fc): Linear(in_features=2920, out_features=1024, bias=True)
    (bn): BatchNorm1d(1024, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (linear2): BasicFC(
    (fc): Linear(in_features=1024, out_features=256, bias=True)
    (bn): BatchNorm1d(256, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (linear3): BasicFC(
    (fc): Linear(in_features=256, out_features=64, bias=True)
    (bn): BatchNorm1d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (linear_out): Linear(in_features=64, out_features=6, bias=False)
)


In [84]:
from tqdm import tqdm
def train():
  total_train_loss = 0
  fusion_model.train()
  for batch in tqdm(train_loader):
      optimizer.zero_grad()
      outputs_all = []
      for i in range(n_models):
          model = models[i]
          input_ids = batch['input_ids'+'_'+str(i)].to(device)
          attention_mask = batch['attention_mask'+'_'+str(i)].to(device)
          labels = batch['labels'].to(device)
          outputs = model(input_ids, attention_mask=attention_mask)
          outputs_all.append(outputs[1])
      outputs_all.append(torch.Tensor(train_profanity_vector[batch['index'], :]).to(device))
      bert_models_output = torch.cat(outputs_all, dim = -1)
      #print(bert_models_output.shape)
      out = fusion_model(bert_models_output)
      loss_value = loss(out, labels)
      #print(loss_value)
      loss_value.backward()
      optimizer.step()

In [85]:
def evaluate():
  preds = []
  fusion_model.eval()
  total_val_loss = 0
  with torch.set_grad_enabled(False):
      for batch in tqdm(dev_loader):
          outputs_all = []
          for i in range(n_models):
              model = models[i]
              input_ids = batch['input_ids'+'_'+str(i)].to(device)
              attention_mask = batch['attention_mask'+'_'+str(i)].to(device)
              labels = batch['labels'].to(device)
              outputs = model(input_ids, attention_mask=attention_mask)
              outputs_all.append(outputs[1])
          outputs_all.append(torch.Tensor(dev_profanity_vector[batch['index'], :]).to(device))

          bert_models_output = torch.cat(outputs_all, dim = -1) 
          out = fusion_model(bert_models_output)
          loss_value = loss(out, labels)
          total_val_loss += loss_value.item()/len(dev_loader)
          
          for logits in out.cpu().numpy():
              preds.append(np.argmax(logits))
  
  y_true = encoded_dev_labels
  y_pred = preds
  target_names = le.classes_
  print(classification_report(y_true, y_pred, target_names=target_names))
  macro_f1 = f1_score(y_true, y_pred, average='macro')
  return total_val_loss,macro_f1

In [None]:
import copy
epochs =100
count = 0
best_macro_f1 = float('0')

model_save_name1 = 'fusion_net_tamil.pt'
path = F"/content/gdrive/MyDrive/{model_save_name1}"

for epoch in range(epochs):
  print(f'Epoch: {epoch+1:02}')
  train()
  valid_loss,macro_f1= evaluate()
      
  if macro_f1 > best_macro_f1:
      best_macro_f1 = macro_f1
      torch.save(fusion_model.state_dict(), path)
      best_model = copy.deepcopy(fusion_model)
      count = 0
  else:
      count += 1
    
  if count == 10:
    print("Early Stopping, as there is no increase in value")
    break

Epoch: 01


  if sys.path[0] == '':
  
 89%|████████▉ | 491/550 [09:16<01:06,  1.13s/it]

In [None]:
def test():
  preds = []
  fusion_model.eval()
  total_test_loss = 0
  with torch.set_grad_enabled(False):
      for batch in tqdm(test_loader):
          outputs_all = []
          for i in range(n_models):
              model = models[i]
              input_ids = batch['input_ids'+'_'+str(i)].to(device)
              attention_mask = batch['attention_mask'+'_'+str(i)].to(device)
              labels = batch['labels'].to(device)
              outputs = model(input_ids, attention_mask=attention_mask)
              outputs_all.append(outputs[1])
          outputs_all.append(torch.Tensor(test_profanity_vector[batch['index'], :]).to(device))

          bert_models_output = torch.cat(outputs_all, dim = -1) 
          out = fusion_model(bert_models_output)
          loss = loss(out, labels)
          total_test_loss += loss.item()/len(test_loader)
          
          for logits in out.cpu().numpy():
              preds.append(np.argmax(logits))
  
  y_true = encoded_test_labels
  y_pred = preds
  target_names = le.classes_
  print(classification_report(y_true, y_pred, target_names=target_names))