In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [2]:
from tqdm.auto import tqdm
import os

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split , StratifiedKFold


import tensorflow as tf 
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, load_model, save_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input,Dense, LSTM, RNN, Bidirectional, GlobalAveragePooling2D , Dropout

from transformers import TFAutoModel , AutoTokenizer

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
class config:
  seed = 43
  train_path = "/content/drive/MyDrive/Social Media Mining/SMM.csv"
  test_path = "/content/drive/MyDrive/Social Media Mining/SMM_test.csv"
  langs = {'Hindi':'hi','Telugu':'te','Marathi':'mr','Tamil':'ta','Malayalam':'ml','Bengali':'bn','Kannada':'kn','Odia':'or','Gujarati':'gu',}
  save_dir = "./result"
  AUTOTUNE = tf.data.AUTOTUNE
  
  #model params
  epochs = 12
  max_len = 64
  batch_size = 128
  hf_path = "google/muril-base-cased" 

def seed_everything(seed = config.seed):
  print(f"seeded everything to seed {seed}")
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  tf.random.set_seed(seed)

if not os.path.exists(config.save_dir):
  os.makedirs(config.save_dir)
  
seed_everything()

seeded everything to seed 43


In [5]:
df_train = pd.read_csv(config.train_path)
df_test = pd.read_csv(config.test_path)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(config.hf_path)
tokenizer.save_pretrained(os.path.join(config.save_dir , "muril_tokenizer"))

Downloading (…)okenizer_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

('./result/muril_tokenizer/tokenizer_config.json',
 './result/muril_tokenizer/special_tokens_map.json',
 './result/muril_tokenizer/vocab.txt',
 './result/muril_tokenizer/added_tokens.json',
 './result/muril_tokenizer/tokenizer.json')

In [7]:
def fast_encode(texts, tokenizer, chunk_size=512, maxlen=config.max_len):
    
    input_ids = []
    tt_ids = []
    at_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size]
        encs = tokenizer(
                    text_chunk,
                    max_length = config.max_len,
                    padding='max_length',
                    truncation=True
                    )
        
        input_ids.extend(encs['input_ids'])
        tt_ids.extend(encs['token_type_ids'])
        at_ids.extend(encs['attention_mask'])
    
    return {'input_ids': input_ids, 'token_type_ids': tt_ids, 'attention_mask':at_ids}

In [8]:
token_data = fast_encode(list(df_train['commentText'].values), tokenizer)
token_data['index'] = list(df_train['post_index'].values)
token_data['label'] = list(df_train['label'].values)
token_data['language'] = list(df_train['language'].values)

  0%|          | 0/1299 [00:00<?, ?it/s]

In [9]:
df_tokenized = pd.DataFrame(token_data)

In [10]:
@tf.function
def train_prep_function(embeddings , target):
  input_ids = embeddings['input_ids']
  attention_mask = embeddings['attention_mask']

  target = tf.cast(target, tf.int32)
  
  return {'input_ids': input_ids , 'attention_mask': attention_mask}, target

In [11]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.66.153.250:8470




REPLICAS:  8


In [12]:
from keras.layers import Bidirectional, BatchNormalization
from keras.layers import Flatten, LSTM

In [13]:
def create_model(transformer_model):
  input_id_layer = Input(shape=(config.max_len,) ,dtype = tf.int32 , name = 'input_ids')
  attention_mask_layer = Input(shape=(config.max_len,) , dtype = tf.int32 , name = 'attention_mask')

  transformer = transformer_model(input_ids = input_id_layer , attention_mask = attention_mask_layer)[0]
  transformer_output = transformer[:,0,:]

  bd_layer = BatchNormalization()(transformer)
  x = Dropout(0.1)(bd_layer)
  x1 = tf.keras.layers.Conv1D(1,1)(x)
  x1 = tf.keras.layers.Flatten()(x1)
  bd_layer_1 = BatchNormalization()(x1)
  predictions = Dense(1, activation = "sigmoid")(bd_layer_1)

  model = Model(inputs=[input_id_layer , attention_mask_layer], outputs = predictions)
  model.compile(
      optimizer = Adam(learning_rate= 1e-5),
      metrics = 'accuracy',
      loss = 'binary_crossentropy'
  )

  return model

In [14]:
with strategy.scope():
  transformer_model = TFAutoModel.from_pretrained(config.hf_path)
  model = create_model(transformer_model)
model.summary()

Downloading tf_model.h5:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Some layers from the model checkpoint at google/muril-base-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 64)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  237556224   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 64,                                            

In [15]:
df_tokenized.head()

Unnamed: 0,input_ids,token_type_ids,attention_mask,index,label,language
0,"[104, 10478, 14318, 2254, 1115, 105, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",238566,0,Hindi
1,"[104, 116969, 101565, 179, 1113, 48907, 1206, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...",7009,0,Hindi
2,"[104, 87541, 9535, 94108, 1278, 1274, 14604, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",404648,1,Hindi
3,"[104, 7313, 100, 4430, 1159, 3032, 95948, 2003...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5057,0,Hindi
4,"[104, 9480, 6985, 81441, 2003, 25124, 60885, 9...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",107146,1,Hindi


In [16]:
df_twolang = df_tokenized[(df_tokenized['language'] == 'Hindi') | (df_tokenized['language'] == 'Tamil')]
df_twolang_label = df_twolang['label']
df_twolang = df_twolang.drop(columns=['label'])
df_lang_test = df_tokenized[df_tokenized['language'] == 'Telugu']
df_lang_test_label = df_lang_test['label']
df_lang_test = df_lang_test.drop(columns=['label'])
df_tokenized_label = df_tokenized['label']
df_tokenized = df_tokenized.drop(columns=['label'])

In [17]:
y_test_lang = df_lang_test_label.tolist()
print(len(y_test_lang))

97012


In [18]:
test_lang_embeddings = {'input_ids': df_lang_test['input_ids'].tolist() ,"attention_mask":df_lang_test['attention_mask'].tolist()}
test_lang_data = tf.data.Dataset.from_tensor_slices((test_lang_embeddings, y_test_lang))
test_lang_data = (
    test_lang_data
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)

In [19]:
scores_lang = []
hists_lang = []
from sklearn.model_selection import train_test_split
x_train_lang,x_val_lang,y_train_lang,y_val_lang=train_test_split(df_twolang, df_twolang_label, test_size=0.1,random_state=123)
train_lang_embeddings = {'input_ids': x_train_lang['input_ids'].tolist() ,"attention_mask":x_train_lang['attention_mask'].tolist()}
val_lang_embeddings = {'input_ids': x_val_lang['input_ids'].tolist() ,"attention_mask":x_val_lang['attention_mask'].tolist()}
y_train_lang = y_train_lang.tolist()
y_val_lang = y_val_lang.tolist()
train_lang_steps = len(train_lang_embeddings['input_ids'])//config.batch_size//4
validation_lang_steps = len(test_lang_embeddings['input_ids'])//config.batch_size
print(f"training steps {train_lang_steps} , validation steps {validation_lang_steps}")
train_lang_dataset = tf.data.Dataset.from_tensor_slices((train_lang_embeddings , y_train_lang))
train_lang_dataset = (
    train_lang_dataset
    .shuffle(1024*2)
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .repeat()
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
val_lang_dataset = tf.data.Dataset.from_tensor_slices((val_lang_embeddings , y_val_lang ))
val_lang_dataset = (
    val_lang_dataset
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)

hist = model.fit(train_lang_dataset,steps_per_epoch= train_lang_steps,validation_data= val_lang_dataset, epochs = config.epochs)

training steps 662 , validation steps 757
Epoch 1/12




Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [20]:
y_orig_lang = model.predict(test_lang_data , verbose = 1).round()
score = roc_auc_score(y_test_lang ,y_orig_lang)
print(score)

0.5654949751063847


In [21]:
from sklearn.metrics import classification_report
print('Classification report for Telugu language from the model trained using Hindi and Tamil')
print(classification_report(y_orig_lang, y_test_lang, target_names=['Non-Abusive','Abusive']))

Classification report for Telugu language from the model trained using Hindi and Tamil
              precision    recall  f1-score   support

 Non-Abusive       0.93      0.54      0.68     83854
     Abusive       0.20      0.74      0.32     13158

    accuracy                           0.57     97012
   macro avg       0.57      0.64      0.50     97012
weighted avg       0.83      0.57      0.63     97012

