In [None]:
!pip install transformers



In [None]:
from tqdm.auto import tqdm
import os

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split , StratifiedKFold


import tensorflow as tf 
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, load_model, save_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input,Dense, LSTM, RNN, Bidirectional, GlobalAveragePooling2D , Dropout

from transformers import TFAutoModel , AutoTokenizer

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
class config:
  seed = 43
  train_path = "/content/drive/MyDrive/Social Media Mining/SMM.csv"
  test_path = "/content/drive/MyDrive/Social Media Mining/SMM_test.csv"
  langs = {'Hindi':'hi','Telugu':'te','Marathi':'mr','Tamil':'ta','Malayalam':'ml','Bengali':'bn','Kannada':'kn','Odia':'or','Gujarati':'gu',}
  save_dir = "./result"
  AUTOTUNE = tf.data.AUTOTUNE
  
  #model params
  epochs = 12
  max_len = 64
  batch_size = 128
  hf_path = "google/muril-base-cased" 

def seed_everything(seed = config.seed):
  print(f"seeded everything to seed {seed}")
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  tf.random.set_seed(seed)

if not os.path.exists(config.save_dir):
  os.makedirs(config.save_dir)
  
seed_everything()

seeded everything to seed 43


In [None]:
df_train = pd.read_csv(config.train_path)
df_test = pd.read_csv(config.test_path)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(config.hf_path)
tokenizer.save_pretrained(os.path.join(config.save_dir , "muril_tokenizer"))

('./result/muril_tokenizer/tokenizer_config.json',
 './result/muril_tokenizer/special_tokens_map.json',
 './result/muril_tokenizer/vocab.txt',
 './result/muril_tokenizer/added_tokens.json',
 './result/muril_tokenizer/tokenizer.json')

In [None]:
def fast_encode(texts, tokenizer, chunk_size=512, maxlen=config.max_len):
    
    input_ids = []
    tt_ids = []
    at_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size]
        encs = tokenizer(
                    text_chunk,
                    max_length = config.max_len,
                    padding='max_length',
                    truncation=True
                    )
        
        input_ids.extend(encs['input_ids'])
        tt_ids.extend(encs['token_type_ids'])
        at_ids.extend(encs['attention_mask'])
    
    return {'input_ids': input_ids, 'token_type_ids': tt_ids, 'attention_mask':at_ids}

In [None]:
token_data = fast_encode(list(df_train['commentText'].values), tokenizer)
token_data['index'] = list(df_train['post_index'].values)
token_data['label'] = list(df_train['label'].values)
token_data['language'] = list(df_train['language'].values)

  0%|          | 0/1299 [00:00<?, ?it/s]

In [None]:
df_tokenized = pd.DataFrame(token_data)

In [None]:
@tf.function
def train_prep_function(embeddings , target):
  input_ids = embeddings['input_ids']
  attention_mask = embeddings['attention_mask']

  target = tf.cast(target, tf.int32)
  
  return {'input_ids': input_ids , 'attention_mask': attention_mask}, target

In [None]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Running on TPU  grpc://10.97.213.2:8470
INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Initializing the TPU system: grpc://10.97.213.2:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.97.213.2:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


REPLICAS:  8


In [None]:
from keras.layers import Bidirectional, BatchNormalization
from keras.layers import Flatten, LSTM

In [None]:
def create_model(transformer_model):
  input_id_layer = Input(shape=(config.max_len,) ,dtype = tf.int32 , name = 'input_ids')
  attention_mask_layer = Input(shape=(config.max_len,) , dtype = tf.int32 , name = 'attention_mask')

  transformer = transformer_model(input_ids = input_id_layer , attention_mask = attention_mask_layer)[0]
  transformer_output = transformer[:,0,:]

  bd_layer = BatchNormalization()(transformer)
  x = Dropout(0.1)(bd_layer)
  x1 = tf.keras.layers.Conv1D(1,1)(x)
  x1 = tf.keras.layers.Flatten()(x1)
  bd_layer_1 = BatchNormalization()(x1)
  predictions = Dense(1, activation = "sigmoid")(bd_layer_1)

  model = Model(inputs=[input_id_layer , attention_mask_layer], outputs = predictions)
  model.compile(
      optimizer = Adam(learning_rate= 1e-5),
      metrics = 'accuracy',
      loss = 'binary_crossentropy'
  )

  return model

In [None]:
with strategy.scope():
  transformer_model = TFAutoModel.from_pretrained(config.hf_path)
  model = create_model(transformer_model)
model.summary()

Downloading:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

Some layers from the model checkpoint at google/muril-base-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['bert/pooler/dense/kernel:0', 'bert/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 64)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  237556224   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 64,                                            

In [None]:
df_tokenized.head()

Unnamed: 0,input_ids,token_type_ids,attention_mask,index,label,language
0,"[104, 10478, 14318, 2254, 1115, 105, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",238566,0,Hindi
1,"[104, 116969, 101565, 179, 1113, 48907, 1206, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, ...",7009,0,Hindi
2,"[104, 87541, 9535, 94108, 1278, 1274, 14604, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",404648,1,Hindi
3,"[104, 7313, 100, 4430, 1159, 3032, 95948, 2003...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5057,0,Hindi
4,"[104, 9480, 6985, 81441, 2003, 25124, 60885, 9...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",107146,1,Hindi


In [None]:
df_tokenized_onelang = df_tokenized[df_tokenized['language'] == 'Hindi']
df_tokenized_onelang_label = df_tokenized_onelang['label']
df_tokenized_onelang = df_tokenized_onelang.drop(columns=['label'])
df_tokenized_alllang = df_tokenized[df_tokenized['language'] != 'Hindi']
df_tokenized_alllang_label = df_tokenized_alllang['label']
df_tokenized_alllang = df_tokenized_alllang.drop(columns=['label'])

In [None]:
df_tokenized_onelang_t = df_tokenized[df_tokenized['language'] == 'Telugu']
df_tokenized_onelang_t_label = df_tokenized_onelang_t['label']
df_tokenized_onelang_t = df_tokenized_onelang_t.drop(columns=['label'])
df_tokenized_alllang_t = df_tokenized[df_tokenized['language'] != 'Telugu']
df_tokenized_alllang_t_label = df_tokenized_alllang_t['label']
df_tokenized_alllang_t = df_tokenized_alllang_t.drop(columns=['label'])

In [None]:
df_tokenized_onelang_a = df_tokenized[df_tokenized['language'] == 'Malayalam']
df_tokenized_onelang_a_label = df_tokenized_onelang_a['label']
df_tokenized_onelang_a = df_tokenized_onelang_a.drop(columns=['label'])
df_tokenized_alllang_a = df_tokenized[df_tokenized['language'] != 'Malayalam']
df_tokenized_alllang_a_label = df_tokenized_alllang_a['label']
df_tokenized_alllang_a = df_tokenized_alllang_a.drop(columns=['label'])

In [None]:
from sklearn.model_selection import train_test_split
x_train_onelang,x_val_onelang,y_train_onelang,y_val_onelang=train_test_split(df_tokenized_onelang, df_tokenized_onelang_label, test_size=0.1,random_state=123)

In [None]:
from sklearn.model_selection import train_test_split
x_train_onelang_t,x_val_onelang_t,y_train_onelang_t,y_val_onelang_t=train_test_split(df_tokenized_onelang_t, df_tokenized_onelang_t_label, test_size=0.1,random_state=123)

In [None]:
from sklearn.model_selection import train_test_split
x_train_onelang_a,x_val_onelang_a,y_train_onelang_a,y_val_onelang_a=train_test_split(df_tokenized_onelang_a, df_tokenized_onelang_a_label, test_size=0.1,random_state=123)

In [None]:
x_train_alllang = df_tokenized_alllang.append(x_train_onelang)
y_train_alllang = df_tokenized_alllang_label.append(y_train_onelang)

In [None]:
x_train_alllang_t = df_tokenized_alllang_t.append(x_train_onelang_t)
y_train_alllang_t = df_tokenized_alllang_t_label.append(y_train_onelang_t)

In [None]:
x_train_alllang_a = df_tokenized_alllang_a.append(x_train_onelang_a)
y_train_alllang_a = df_tokenized_alllang_a_label.append(y_train_onelang_a)

In [None]:
test_embeddings = {'input_ids': x_val_onelang['input_ids'].tolist() ,"attention_mask":x_val_onelang['attention_mask'].tolist()}
test_data = tf.data.Dataset.from_tensor_slices((test_embeddings, y_val_onelang))
test_data = (
    test_data
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)

In [None]:
test_embeddings_t = {'input_ids': x_val_onelang_t['input_ids'].tolist() ,"attention_mask":x_val_onelang_t['attention_mask'].tolist()}
test_data_t = tf.data.Dataset.from_tensor_slices((test_embeddings_t, y_val_onelang_t))
test_data_t = (
    test_data_t
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)

In [None]:
test_embeddings_a = {'input_ids': x_val_onelang_a['input_ids'].tolist() ,"attention_mask":x_val_onelang_a['attention_mask'].tolist()}
test_data_a = tf.data.Dataset.from_tensor_slices((test_embeddings_a, y_val_onelang_a))
test_data_a = (
    test_data_a
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)

In [None]:
train_embeddings = {'input_ids': x_train_alllang['input_ids'].tolist() ,"attention_mask":x_train_alllang['attention_mask'].tolist()}
train_steps = len(train_embeddings['input_ids'])//config.batch_size//4
validation_steps = len(test_embeddings['input_ids'])//config.batch_size
print(f"training steps {train_steps} , validation steps {validation_steps}")
train_dataset = tf.data.Dataset.from_tensor_slices((train_embeddings , y_train_alllang))
train_dataset = (
    train_dataset
    .shuffle(1024*2)
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .repeat()
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
test_dataset = tf.data.Dataset.from_tensor_slices((test_embeddings , y_val_onelang))
test_dataset = (
    test_dataset
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
with strategy.scope():
  transformer_model = TFAutoModel.from_pretrained(config.hf_path)
  model = create_model(transformer_model)
hist = model.fit(train_dataset,steps_per_epoch= train_steps,validation_data= test_dataset, epochs = config.epochs)

training steps 1238 , validation steps 239


Some layers from the model checkpoint at google/muril-base-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['bert/pooler/dense/kernel:0', 'bert/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/12


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'IteratorGetNext:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int32>]




  num_elements)


Instructions for updating:
use `experimental_local_results` instead.


Instructions for updating:
use `experimental_local_results` instead.
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'IteratorGetNext:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int32>]








INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'cond/Identity_16:0' shape=(None,) dtype=int32>]


Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [None]:
y_orig_alllang = model.predict(test_data , verbose = 1).round()

In [None]:
from sklearn.metrics import classification_report
print('Classification report for Hindi after training on all languages')
print(classification_report(y_orig_alllang, y_val_onelang, target_names=['Non-Abusive','Abusive']))

In [None]:
train_embeddings_onelang = {'input_ids': x_train_onelang['input_ids'].tolist() ,"attention_mask":x_train_onelang['attention_mask'].tolist()}
train_steps_onelang = len(train_embeddings_onelang['input_ids'])//config.batch_size//4
validation_steps= len(test_embeddings['input_ids'])//config.batch_size
print(f"training steps {train_steps_onelang} , validation steps {validation_steps}")
train_dataset_onelang = tf.data.Dataset.from_tensor_slices((train_embeddings_onelang , y_train_onelang))
train_dataset_onelang = (
    train_dataset_onelang
    .shuffle(1024*2)
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .repeat()
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
test_dataset = tf.data.Dataset.from_tensor_slices((test_embeddings , y_val_onelang))
test_dataset = (
    test_dataset
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
with strategy.scope():
  transformer_model = TFAutoModel.from_pretrained(config.hf_path)
  model1 = create_model(transformer_model)
hist = model1.fit(train_dataset_onelang,steps_per_epoch= train_steps_onelang,validation_data= test_dataset, epochs = config.epochs)

training steps 539 , validation steps 239


Downloading:   0%|          | 0.00/1.45G [00:00<?, ?B/s]

Some layers from the model checkpoint at google/muril-base-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/12


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'IteratorGetNext:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int32>]




  num_elements)


Instructions for updating:
use `experimental_local_results` instead.


Instructions for updating:
use `experimental_local_results` instead.
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'IteratorGetNext:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int32>]








INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'cond/Identity_16:0' shape=(None,) dtype=int32>]


Epoch 2/12
Epoch 3/12
Epoch 4/12

KeyboardInterrupt: ignored

In [None]:
y_orig_onelang = model.predict(test_data , verbose = 1).round()

In [None]:
from sklearn.metrics import classification_report
print('Classification report for Hindi after training on Hindi language')
print(classification_report(y_orig_onelang, y_val_onelang, target_names=['Non-Abusive','Abusive']))

In [None]:
train_embeddings_t = {'input_ids': x_train_alllang_t['input_ids'].tolist() ,"attention_mask":x_train_alllang_t['attention_mask'].tolist()}
train_steps_t = len(train_embeddings_t['input_ids'])//config.batch_size//4
validation_steps_t = len(test_embeddings_t['input_ids'])//config.batch_size
print(f"training steps {train_steps_t} , validation steps {validation_steps_t}")
train_dataset_t = tf.data.Dataset.from_tensor_slices((train_embeddings_t , y_train_alllang_t))
train_dataset_t = (
    train_dataset_t
    .shuffle(1024*2)
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .repeat()
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
test_dataset_t = tf.data.Dataset.from_tensor_slices((test_embeddings_t , y_val_onelang_t))
test_dataset_t = (
    test_dataset_t
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
with strategy.scope():
  transformer_model = TFAutoModel.from_pretrained(config.hf_path)
  model = create_model(transformer_model)
hist = model.fit(train_dataset_t,steps_per_epoch= train_steps_t,validation_data= test_dataset_t, epochs = config.epochs)

In [None]:
y_orig_alllang_t = model.predict(test_data_t , verbose = 1).round()

In [None]:
from sklearn.metrics import classification_report
print('Classification report for Telugu after training on all languages')
print(classification_report(y_orig_alllang_t, y_val_onelang_t, target_names=['Non-Abusive','Abusive']))

In [None]:
train_embeddings_onelang_t = {'input_ids': x_train_onelang_t['input_ids'].tolist() ,"attention_mask":x_train_onelang_t['attention_mask'].tolist()}
train_steps_onelang_t = len(train_embeddings_onelang_t['input_ids'])//config.batch_size//4
validation_steps_t= len(test_embeddings_t['input_ids'])//config.batch_size
print(f"training steps {train_steps_onelang_t} , validation steps {validation_steps_t}")
train_dataset_onelang_t = tf.data.Dataset.from_tensor_slices((train_embeddings_onelang_t , y_train_onelang_t))
train_dataset_onelang_t = (
    train_dataset_onelang_t
    .shuffle(1024*2)
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .repeat()
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
test_dataset_t = tf.data.Dataset.from_tensor_slices((test_embeddings_t , y_val_onelang_t))
test_dataset_t = (
    test_dataset_t
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
with strategy.scope():
  transformer_model = TFAutoModel.from_pretrained(config.hf_path)
  model1 = create_model(transformer_model)
hist = model.fit(train_dataset_onelang_t,steps_per_epoch= train_steps_onelang_t,validation_data= test_dataset_t, epochs = config.epochs)

In [None]:
y_orig_onelang_t = model.predict(test_data_t , verbose = 1).round()

In [None]:
from sklearn.metrics import classification_report
print('Classification report for Telugu after training on Telugu language')
print(classification_report(y_orig_onelang_t, y_val_onelang_t, target_names=['Non-Abusive','Abusive']))

In [None]:
train_embeddings_a = {'input_ids': x_train_alllang_a['input_ids'].tolist() ,"attention_mask":x_train_alllang_a['attention_mask'].tolist()}
train_steps_a = len(train_embeddings_a['input_ids'])//config.batch_size//4
validation_steps_a = len(test_embeddings_a['input_ids'])//config.batch_size
print(f"training steps {train_steps_a} , validation steps {validation_steps_a}")
train_dataset_a = tf.data.Dataset.from_tensor_slices((train_embeddings_a , y_train_alllang_a))
train_dataset_a = (
    train_dataset_a
    .shuffle(1024*2)
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .repeat()
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
test_dataset_a = tf.data.Dataset.from_tensor_slices((test_embeddings_a , y_val_onelang_a))
test_dataset_a = (
    test_dataset_a
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
with strategy.scope():
  transformer_model = TFAutoModel.from_pretrained(config.hf_path)
  model_a_all = create_model(transformer_model)
hist = model_a_all.fit(train_dataset_a,steps_per_epoch= train_steps_a,validation_data= test_dataset_a, epochs = config.epochs)

training steps 1290 , validation steps 32


Some layers from the model checkpoint at google/muril-base-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/12


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'IteratorGetNext:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int32>]




  num_elements)
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'IteratorGetNext:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 64) dtype=int32>, <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int32>]








INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'cond/Identity_16:0' shape=(None,) dtype=int32>]


Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [None]:
y_orig_alllang_a = model_a_all.predict(test_data_a , verbose = 1).round()

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond/Identity:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'cond/Identity_8:0' shape=(None, 64) dtype=int32>, <tf.Tensor 'cond/Identity_16:0' shape=(None,) dtype=int32>]




In [None]:
from sklearn.metrics import classification_report
print('Classification report for Malayalam after training on all languages')
print(classification_report(y_orig_alllang_a, y_val_onelang_a, target_names=['Non-Abusive','Abusive']))

Classification report for Malayalam after training on all languages
              precision    recall  f1-score   support

 Non-Abusive       0.93      0.94      0.93      3121
     Abusive       0.79      0.78      0.78       976

    accuracy                           0.90      4097
   macro avg       0.86      0.86      0.86      4097
weighted avg       0.90      0.90      0.90      4097



In [25]:
train_embeddings_onelang_a = {'input_ids': x_train_onelang_a['input_ids'].tolist() ,"attention_mask":x_train_onelang_a['attention_mask'].tolist()}
train_steps_onelang_a = len(train_embeddings_onelang_a['input_ids'])//config.batch_size//4
validation_steps_a= len(test_embeddings_a['input_ids'])//config.batch_size
print(f"training steps {train_steps_onelang_a} , validation steps {validation_steps_a}")
train_dataset_onelang_a = tf.data.Dataset.from_tensor_slices((train_embeddings_onelang_a , y_train_onelang_a))
train_dataset_onelang_a = (
    train_dataset_onelang_a
    .shuffle(1024*2)
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .repeat()
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
test_dataset_a = tf.data.Dataset.from_tensor_slices((test_embeddings_a , y_val_onelang_a))
test_dataset_a = (
    test_dataset_a
    .map(train_prep_function , num_parallel_calls = config.AUTOTUNE)
    .batch(config.batch_size)
    .prefetch(config.AUTOTUNE)
)
with strategy.scope():
  transformer_model = TFAutoModel.from_pretrained(config.hf_path)
  model_a_one = create_model(transformer_model)
model_a_one = create_model(transformer_model)
hist = model_a_one.fit(train_dataset_onelang_a,steps_per_epoch= train_steps_onelang_a,validation_data= test_dataset_a, epochs = config.epochs)

training steps 72 , validation steps 32


Some layers from the model checkpoint at google/muril-base-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/12








Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [26]:
y_orig_onelang_a = model_a_one.predict(test_data_a , verbose = 1).round()



In [27]:
from sklearn.metrics import classification_report
print('Classification report for Malayalam after training on Malayalam language')
print(classification_report(y_orig_onelang_a, y_val_onelang_a, target_names=['Non-Abusive','Abusive']))

Classification report for Malayalam after training on Malayalam language
              precision    recall  f1-score   support

 Non-Abusive       0.63      0.79      0.70      2490
     Abusive       0.45      0.27      0.34      1607

    accuracy                           0.58      4097
   macro avg       0.54      0.53      0.52      4097
weighted avg       0.56      0.58      0.56      4097

