In [73]:
!pip install tensorflow-text



In [74]:
!pip install bert-for-tf2



In [75]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tensorflow_text as text

import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import numpy as np
from sklearn.metrics import accuracy_score, log_loss
from sklearn.utils import all_estimators

from google.colab import files
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

def transform_label(raw_label):
    """Return whether `raw_label` equals the string 'HOF'.

    Any other value (including differently-cased spellings) compares
    unequal and therefore maps to False.
    """
    is_hof = (raw_label == 'HOF')
    return is_hof


def transform_Yset(Y_raw):
    """Convert an iterable of raw labels into a boolean NumPy array.

    Args:
        Y_raw: Iterable of raw labels; each element is passed to
            `transform_label`.

    Returns:
        A 1-D `numpy.ndarray` of dtype bool with one entry per input label.
    """
    labels = [transform_label(label) for label in Y_raw]
    # Pin the dtype: without it an empty input produces a float64 array,
    # which no longer looks like a label vector to downstream code.
    return np.asarray(labels, dtype=bool)


def get_train_test_from_tsv(tsv_name, train_data=True,
                            data_dir="/content/drive/My Drive/mural_model/hindi_datasets/"):
    """Load the `text` column (and optionally the labels) from a TSV file.

    Args:
        tsv_name: File name of the tab-separated file, resolved relative to
            `data_dir`.
        train_data: When True, the `task_1` column is also read and returned
            after conversion by `transform_Yset`. When False, only the texts
            are returned and the file does not need a `task_1` column.
        data_dir: Directory that holds the TSV files. Defaults to the Drive
            folder this notebook originally hard-coded.

    Returns:
        `(X_raw, Y_)` when `train_data` is True, otherwise `X_raw` alone.
        `X_raw` is the `text` column as a pandas Series; `Y_` is the value
        returned by `transform_Yset`.

    Raises:
        KeyError: If the `text` column is missing, or if `train_data` is True
            and the `task_1` column is missing.
    """
    tsv_path = os.path.join(data_dir, tsv_name)
    df = pd.read_csv(tsv_path, delimiter="\t")
    print("Dataset Shape: {}".format(df.shape))
    X_raw = df['text']
    if not train_data:
        # Labels are only read on the labelled path, so unlabelled files load.
        return X_raw
    return X_raw, transform_Yset(df['task_1'])



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset Shape: (4665, 5)


In [102]:
#Reference # https://tfhub.dev/google/MuRIL/1 
class OffenseMural():
  """Frozen MuRIL encoder from TF Hub plus a small MLP binary classifier.

  `encode` turns raw strings into the encoder's `pooled_output`; `self.mlp`
  is a separate Keras model that is trained on those outputs.
  """

  def __init__(self):
    """Load the MuRIL layer, build its tokenizer and compile the MLP head."""
    self.max_seq_length = 128
    self.model_url = "https://tfhub.dev/google/MuRIL/1"
    self.bert_dim = 768
    self.mlp_dim = 200
    self.muril_model, self.muril_layer = self.get_model()
    self.vocab_file = self.muril_layer.resolved_object.vocab_file.asset_path.numpy()
    self.do_lower_case = self.muril_layer.resolved_object.do_lower_case.numpy()
    # NOTE(review): `bert_tokenization` is not imported in the visible cells;
    # presumably `from bert import bert_tokenization` (bert-for-tf2) - confirm
    # and add it to the imports cell.
    self.tokenizer = bert_tokenization.FullTokenizer(self.vocab_file, self.do_lower_case)
    # Reach Keras through the already-imported `tf` so the class does not rely
    # on bare `Sequential` / `Dense` names leaking in from another cell.
    dense = tf.keras.layers.Dense
    self.mlp = tf.keras.Sequential([
        dense(units=self.mlp_dim, activation='relu', input_shape=(self.bert_dim,)),
        # Integer division keeps the unit counts ints (200 -> 100 -> 50).
        dense(units=self.mlp_dim // 2, activation='relu'),
        dense(units=self.mlp_dim // 4, activation='relu'),
        dense(1, activation='sigmoid'),
    ])
    self.mlp.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'], run_eagerly=True)

  def get_model(self):
    """Build the Keras model wrapping the MuRIL hub layer.

    Returns:
        A tuple `(model, muril_layer)`: `model` maps the three int32 inputs
        (`input_word_ids`, `input_mask`, `input_type_ids`, each of length
        `max_seq_length`) to the layer's `pooled_output`; `muril_layer` is
        the non-trainable `hub.KerasLayer` itself.
    """
    # Define input.
    inputs = dict(
        input_word_ids=tf.keras.layers.Input(shape=(self.max_seq_length,), dtype=tf.int32),
        input_mask=tf.keras.layers.Input(shape=(self.max_seq_length,), dtype=tf.int32),
        input_type_ids=tf.keras.layers.Input(shape=(self.max_seq_length,), dtype=tf.int32),
    )
    # Define muril layer.
    muril_layer = hub.KerasLayer(self.model_url, trainable=False)
    outputs = muril_layer(inputs)
    assert 'sequence_output' in outputs and 'pooled_output' in outputs and 'encoder_outputs' in outputs and 'default' in outputs
    return tf.keras.Model(inputs=inputs, outputs=outputs["pooled_output"]), muril_layer

### ref : tensorflow hub
  def create_input(self, input_strings):
    """Tokenize strings into fixed-length id / mask / segment arrays.

    Args:
        input_strings: Iterable of strings to tokenize.

    Returns:
        Three int32 arrays of shape `(len(input_strings), max_seq_length)`:
        token ids (padded with 0), attention mask (1 for real tokens, 0 for
        padding) and segment ids (all 0). An empty input yields arrays of
        shape `(0, max_seq_length)`.
    """
    max_len = self.max_seq_length
    # Two positions are reserved for [CLS] and [SEP]; truncating the word
    # pieces (not the finished sequence) keeps the closing [SEP] in place.
    max_pieces = max(max_len - 2, 0)
    input_ids_all, input_mask_all, input_type_ids_all = [], [], []
    for input_string in input_strings:
      pieces = self.tokenizer.tokenize(input_string)[:max_pieces]
      input_tokens = ["[CLS]"] + pieces + ["[SEP]"]
      input_ids = self.tokenizer.convert_tokens_to_ids(input_tokens)
      sequence_length = len(input_ids)
      padding = [0] * (max_len - sequence_length)
      input_ids_all.append(input_ids + padding)
      input_mask_all.append([1] * sequence_length + padding)
      input_type_ids_all.append([0] * max_len)

    def as_batch(rows):
      # int32 matches the dtype declared on the Keras inputs; the reshape
      # gives an empty batch the right second dimension.
      return np.array(rows, dtype=np.int32).reshape(-1, max_len)

    return as_batch(input_ids_all), as_batch(input_mask_all), as_batch(input_type_ids_all)

  def encode(self, input_text):
    """Run the MuRIL model on an iterable of strings.

    Args:
        input_text: Iterable of strings, tokenized via `create_input`.

    Returns:
        The output of `self.muril_model` for the batch, i.e. the hub layer's
        `pooled_output` (presumably shape `(batch, bert_dim)` - confirm).
    """
    input_ids, input_mask, input_type_ids = self.create_input(input_text)
    inputs = dict(
        input_word_ids=input_ids,
        input_mask=input_mask,
        input_type_ids=input_type_ids,
    )
    return self.muril_model(inputs)

In [103]:
# Instantiate the encoder + classifier; OffenseMural.__init__ loads the
# TF Hub MuRIL layer, builds the tokenizer and compiles the MLP head.
off_mural = OffenseMural()

In [106]:
# Show the layer-by-layer summary of the MLP classification head.
off_mural.mlp.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_25 (Dense)             (None, 200)               153800    
_________________________________________________________________
dense_26 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_27 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 51        
Total params: 179,001
Trainable params: 179,001
Non-trainable params: 0
_________________________________________________________________


In [128]:
# Load the training texts and labels, then work out how many mini-batches
# are needed to cover them.
X_train, Y_train = get_train_test_from_tsv("hasoc_19_train.tsv")
batch_size = 100
# Ceiling division: `int(n / batch_size) + 1` counted one extra, empty batch
# whenever n was an exact multiple of batch_size.
num_batches_train = (len(X_train) + batch_size - 1) // batch_size
print(num_batches_train)

In [130]:
# Train the MLP head on MuRIL embeddings, one chunk of `batch_size` texts at a
# time (the embeddings are computed per chunk rather than for the whole set).
epochs = 1
for epoch in range(epochs):
  # Step over start offsets derived from len(X_train): every chunk is then
  # non-empty, and the loop no longer depends on a batch count that another
  # cell may have overwritten.
  for batch, start in enumerate(range(0, len(X_train), batch_size)):
    end = start + batch_size
    print(epoch, batch, start, end)
    X_batch_train = X_train[start:end]
    Y_batch_train = Y_train[start:end]
    embeddings_train = off_mural.encode(X_batch_train)
    off_mural.mlp.fit(embeddings_train, Y_batch_train, shuffle=True)

0 0 0 100
0 1 100 200
0 2 200 300
0 3 300 400
0 4 400 500
0 5 500 600
0 6 600 700
0 7 700 800
0 8 800 900
0 9 900 1000
0 10 1000 1100
0 11 1100 1200
0 12 1200 1300
0 13 1300 1400
0 14 1400 1500
0 15 1500 1600
0 16 1600 1700
0 17 1700 1800
0 18 1800 1900
0 19 1900 2000
0 20 2000 2100
0 21 2100 2200
0 22 2200 2300
0 23 2300 2400
0 24 2400 2500
0 25 2500 2600
0 26 2600 2700
0 27 2700 2800
0 28 2800 2900
0 29 2900 3000
0 30 3000 3100
0 31 3100 3200
0 32 3200 3300
0 33 3300 3400
0 34 3400 3500
0 35 3500 3600
0 36 3600 3700
0 37 3700 3800
0 38 3800 3900
0 39 3900 4000
0 40 4000 4100
0 41 4100 4200
0 42 4200 4300
0 43 4300 4400
0 44 4400 4500
0 45 4500 4600
0 46 4600 4700


In [142]:
# Evaluate the trained MLP head on the gold test file.
X_test, Y_test = get_train_test_from_tsv("hasoc_19_gold.tsv")

# Dedicated names: reusing `batch_size` / `num_batches_train` here would make
# a later re-run of the training cells pick up the evaluation values.
test_batch_size = 500
num_test_examples = len(X_test)
num_batches_test = (num_test_examples + test_batch_size - 1) // test_batch_size
print(num_batches_test)

total_loss = 0.0
total_accuracy = 0.0
for batch, start in enumerate(range(0, num_test_examples, test_batch_size)):
  end = start + test_batch_size
  print(batch, start, end)
  X_batch_test = X_test[start:end]
  Y_batch_test = Y_test[start:end]
  embeddings_test = off_mural.encode(X_batch_test)
  # evaluate() returns [loss, accuracy] here because the MLP is compiled
  # with metrics=['accuracy'].
  batch_loss, batch_accuracy = off_mural.mlp.evaluate(embeddings_test, Y_batch_test)
  # Weight by chunk size: the last chunk is usually shorter than the rest.
  total_loss += batch_loss * len(Y_batch_test)
  total_accuracy += batch_accuracy * len(Y_batch_test)

if num_test_examples:
  print("Test loss: {:.4f}  accuracy: {:.4f}".format(
      total_loss / num_test_examples, total_accuracy / num_test_examples))

Dataset Shape: (1318, 5)
3
0 0 0 500
0 1 500 1000
0 2 1000 1500
