In [None]:
!pip install tensorflow-gpu
!pip install --upgrade grpcio
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
import tensorflow as tf
# Report which TensorFlow build the kernel imported.
print(tf.__version__)

In [None]:
import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from matplotlib import rc

from sklearn.metrics import confusion_matrix, classification_report


RANDOM_SEED = 42

# Seed both random number generators with the same value.
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [None]:
# Labelled training sentences, read from the mounted input dataset.
train_csv_path = '../input/training2110/Dummy_public_train_73.csv'
data_all2 = pd.read_csv(train_csv_path)
data_all2.head()

In [None]:
# Later cells refer to the training frame as `data_all`.
data_all = data_all2
print(len(data_all))

In [None]:
# Distinct values of the label column in the training frame.
print(data_all['class'].unique())

In [None]:
# Labelled test sentences, read from the same mounted input dataset.
test_csv_path = '../input/training2110/Dummy_public_test_73.csv'
data_all3 = pd.read_csv(test_csv_path)
data_all3.head()

In [None]:
# Keep only rows whose label is not 0 in both splits.
# NOTE(review): presumably class 0 means "no topic" — confirm against the dataset description.
train_has_topic = data_all['class'] != 0
test_has_topic = data_all3['class'] != 0
data_all_topic_only = data_all[train_has_topic]
data_all3_topic_only = data_all3[test_has_topic]
print(data_all_topic_only['class'].unique())
print(data_all3_topic_only['class'].unique())

In [None]:
# The filtered frames are the train / test splits used by every later cell.
train, test = data_all_topic_only, data_all3_topic_only

In [None]:
#from sklearn.model_selection import train_test_split
#train, test = train_test_split(data_all, test_size=0.2, random_state=42)
#print (len(train))
#print (len(test))

In [None]:
# A bare expression is only displayed when it is the last line of a cell,
# so the shape has to be printed explicitly to be visible here.
print(train.shape)
print(train['class'])

In [None]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

In [None]:
import zipfile
# Directory the pretrained BERT archive is unpacked into.
folder = 'model_folder'
with zipfile.ZipFile("uncased_L-12_H-768_A-12.zip", mode="r") as bert_archive:
    bert_archive.extractall(path=folder)

In [None]:
# Locations of the checkpoint and config inside the unpacked archive.
bert_model_name = "uncased_L-12_H-768_A-12"
BERT_PRETRAINED_DIR = f'{folder}/'
bert_ckpt_dir = os.path.join(BERT_PRETRAINED_DIR, bert_model_name)
bert_ckpt_file, bert_config_file = (
    os.path.join(bert_ckpt_dir, file_name)
    for file_name in ("bert_model.ckpt", "bert_config.json")
)

In [None]:
class PrepareData:
  """Tokenise, index and pad a train/test pair of labelled sentence frames.

  After construction the instance exposes:
    train_x / test_x: 2-D integer arrays of token ids, one row per input row,
      every row exactly ``max_seq_len`` long (right-padded with 0).
    train_y / test_y: 1-D arrays holding each row's position in ``classes``.
    max_seq_len: the row length that was actually used for padding.
  """

  DATA_COLUMN = "sentence"
  LABEL_COLUMN = "class"
  # Starting value for the measured sequence length. Because the measured
  # length can only grow from here, the final length is
  # min(max(MIN_SEQ_LEN, longest sequence), max_seq_len).
  # NOTE(review): presumably chosen to force a fixed input width — confirm.
  MIN_SEQ_LEN = 250

  def __init__(self, train, test, tokenizer: "FullTokenizer", classes, max_seq_len=192):
    """Tokenise both frames, settle the padded length and pad both splits.

    Args:
      train: frame with ``DATA_COLUMN`` and ``LABEL_COLUMN`` columns.
      test: frame with the same two columns.
      tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
      classes: list of label values; a label's index in it becomes its target.
      max_seq_len: upper bound on the padded row length.

    Raises:
      ValueError: if a row's label is not contained in ``classes``.
    """
    self.tokenizer = tokenizer
    self.max_seq_len = PrepareData.MIN_SEQ_LEN
    self.classes = classes

    (self.train_x, self.train_y), (self.test_x, self.test_y) = map(self._prepare, [train, test])

    # First print: length measured over the data; second: length after the cap.
    print("max seq_len", self.max_seq_len)
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    print("max seq_len", self.max_seq_len)
    self.train_x, self.test_x = map(self._pad, [self.train_x, self.test_x])

  def _prepare(self, df):
    """Convert every row of ``df`` into token ids and a class index.

    Also raises ``self.max_seq_len`` to the longest id sequence seen.

    Returns:
      (x, y): ``x`` is a list of variable-length id lists, ``y`` a 1-D array
      of class indices.
    """
    x, y = [], []

    for _, row in tqdm(df.iterrows(), total=len(df)):
      text, label = row[PrepareData.DATA_COLUMN], row[PrepareData.LABEL_COLUMN]
      tokens = self.tokenizer.tokenize(str(text))
      tokens = ["[CLS]"] + tokens + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      x.append(token_ids)
      y.append(self.classes.index(label))

    # x stays a plain list: the rows have different lengths, and building a
    # ragged ndarray raises on recent NumPy releases. _pad only iterates it.
    return x, np.array(y)

  def _pad(self, ids):
    """Return ``ids`` as a 2-D array whose rows are ``self.max_seq_len`` long.

    Rows that are too long are cut, keeping their final id (the "[SEP]"
    appended by ``_prepare``); shorter rows are right-padded with 0.
    """
    x = []
    for input_ids in ids:
      input_ids = list(input_ids)
      if len(input_ids) > self.max_seq_len:
        # Keep the closing token instead of slicing it away with the tail.
        input_ids = input_ids[:self.max_seq_len - 1] + input_ids[-1:]
      input_ids = input_ids + [0] * (self.max_seq_len - len(input_ids))
      x.append(np.array(input_ids))
    return np.array(x)

In [None]:
# Build the WordPiece tokenizer from the vocabulary shipped with the checkpoint.
vocab_path = os.path.join(bert_ckpt_dir, "vocab.txt")
tokenizer = FullTokenizer(vocab_file=vocab_path)

# Quick check on a sample sentence: the cell displays the resulting ids.
tokens = tokenizer.tokenize("I can't wait to visit Bulgaria again!")
tokenizer.convert_tokens_to_ids(tokens)

In [None]:
def create_model(max_seq_len, bert_ckpt_file):
  """Build the BERT sentence classifier and load the pretrained BERT weights.

  Reads the notebook globals ``bert_config_file`` (BERT config JSON) and
  ``classes`` (its length sets the width of the output layer).

  Args:
    max_seq_len: length of every token-id row fed to the model.
    bert_ckpt_file: checkpoint path passed to ``load_stock_weights``.

  Returns:
    An uncompiled ``keras.Model`` taking int32 ids of shape
    (batch, max_seq_len) and producing a softmax over ``len(classes)`` units.
  """
  with tf.io.gfile.GFile(bert_config_file, "r") as config_reader:
    stock_config = StockBertConfig.from_json_string(config_reader.read())
  bert_params = map_stock_config_to_params(stock_config)
  bert_params.adapter_size = None
  bert_layer = BertModelLayer.from_params(bert_params, name="bert")

  input_ids = keras.layers.Input(shape=(max_seq_len, ), dtype='int32', name="input_ids")
  sequence_output = bert_layer(input_ids)
  print("bert shape", sequence_output.shape)

  # Classification head: take the vector at sequence position 0, then
  # dropout -> 128-unit ReLU -> dropout -> softmax over the classes.
  first_token = keras.layers.Lambda(lambda seq: seq[:, 0, :])(sequence_output)
  first_token = keras.layers.Dropout(0.2)(first_token)
  hidden = keras.layers.Dense(units=128, activation="relu")(first_token)
  hidden = keras.layers.Dropout(0.2)(hidden)
  class_probs = keras.layers.Dense(units=len(classes), activation="softmax")(hidden)

  model = keras.Model(inputs=input_ids, outputs=class_probs)
  model.build(input_shape=(None, max_seq_len))
  # The pretrained weights are loaded only after the model has been built.
  load_stock_weights(bert_layer, bert_ckpt_file)

  return model

In [None]:
# Label values in ascending order; a label's position here is its class index.
classes = train['class'].unique().tolist()
classes.sort()

data = PrepareData(train, test, tokenizer, classes, max_seq_len=250)

In [None]:
# One-hot encoded training targets, one column per label value.
y = pd.get_dummies(train[PrepareData.LABEL_COLUMN])

In [None]:
# Raw (not one-hot) training labels, used for the class-weight computation.
y1 = train.loc[:, 'class']
y1

In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
y_integers = y1
# Pass the arguments by keyword: recent scikit-learn releases no longer accept
# them positionally, and the keyword form also works on older releases.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_integers),
    y=y_integers,
)
# np.unique returns the labels sorted, so weight i belongs to the i-th smallest
# label; Keras expects a {class_index: weight} mapping.
d_class_weights = dict(enumerate(class_weights))
d_class_weights

In [None]:
# Input width follows the padded length chosen by PrepareData.
model = create_model(
    max_seq_len=data.max_seq_len,
    bert_ckpt_file=bert_ckpt_file,
)

In [None]:
# Print Keras' per-layer summary (output shapes and parameter counts) for `model`.
model.summary()

In [None]:
model.compile(
    optimizer=keras.optimizers.Adam(1e-5),
    # The model's last Dense layer already applies softmax, so the loss is fed
    # probabilities; from_logits=True would apply softmax a second time.
    loss=keras.losses.CategoricalCrossentropy(from_logits=False),
    metrics=[keras.metrics.CategoricalAccuracy(name="acc")],
)

In [None]:
# %S is seconds; lowercase %s is not a documented strftime directive and its
# result depends on the platform's C library.
log_dir = "log/intent_detection/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

epochs = 50
# NOTE: Keras takes the validation split from the END of x/y before shuffling,
# so the held-out 20% reflects whatever order the training frame is in.
history = model.fit(x=data.train_x,
                    y=y,
                    validation_split=0.2,
                    batch_size=16,
                    epochs=epochs,
                    # Without this the TensorBoard callback above is never used.
                    callbacks=[tensorboard_callback],
                    class_weight=d_class_weights,
                    verbose=1,
                    shuffle=True)

In [None]:
# One-hot test targets with exactly one column per entry of `classes`, in that
# order. get_dummies alone only creates columns for labels that occur in the
# test frame, which would shift or drop columns relative to the model outputs.
# The final cast gives every column (including filled-in ones) the same dtype.
y_test = (
    pd.get_dummies(test['class'])
    .reindex(columns=classes, fill_value=0)
    .astype("float32")
)

In [None]:
# evaluate() returns [loss, acc] for the single metric configured at compile time.
train_loss, train_acc = model.evaluate(data.train_x, y)
test_loss, test_acc = model.evaluate(data.test_x, y_test)

print("train acc", train_acc)
print("test acc", test_acc)

In [None]:
# Predicted class index per test row: position of the largest output.
test_probabilities = model.predict(data.test_x)
y_pred = np.argmax(test_probabilities, axis=-1)

In [None]:
# Display names for the report, one per class index.
# NOTE(review): the order is assumed to match the sorted label ids — confirm.
target_class_name = [
    "Net Sales",
    "Operating Profit Margin",
    "Risk",
    "Non Current Assets",
    "Total Operating Expenses",
    "Net Worth",
    "Total Debt",
    "NPAT Margin",
    "Cashflow",
    "Current Asset",
    "Operating Profit",
    "Contingent Liabilities/Guarantee",
    "Current Liabilities",
    "Cash and Equivalents",
    "EBITDA Margin",
    "GP margin",
    "Non Current Liabilities",
]
target_class_name

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# data.test_y holds each test row's index into `classes`, so it does not
# depend on which label columns a one-hot frame happens to contain.
# `labels` pins the report rows to indices 0..len(target_class_name)-1 so the
# names stay aligned even when a class has no test rows or predictions.
print(classification_report(
    data.test_y,
    y_pred,
    labels=np.arange(len(target_class_name)),
    target_names=target_class_name,
))

In [None]:
# Persist only the weights (not the architecture) to an HDF5 file.
model.save_weights(filepath="DNN_model_2010.h5")