# Confidence score of Facebook comments
* __Objective__: Train a BERT (TensorFlow) model on the annotated Change My View dataset, and use it to compute a confidence score for each comment in the Facebook corpus.
* __File Management__: Using Google Drive
* __Runtime Type__: GPU

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset and output paths used
# later in this notebook live under /content/gdrive/MyDrive.
# force_remount=True requests a fresh mount when the cell is re-run.
drive.mount('/content/gdrive', force_remount=True)

## Training BERT on the Change My View dataset

In [None]:
# Install the packages imported in the next cell: tensorflow-text
# (`tensorflow_text`) and tf-models-official (`official.nlp`).
# -q keeps pip's output quiet.
!pip install -q tensorflow-text
!pip install -q tf-models-official

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# NOTE(review): `text` is not referenced directly in the visible code;
# presumably the import is needed for its side effect of registering the
# custom ops used by the TF Hub preprocessing model -- confirm before removing.
import tensorflow_text as text
from official.nlp import optimization  # to create the AdamW optimizer

# Only emit TensorFlow log messages at ERROR level or above.
tf.get_logger().setLevel('ERROR')

In [None]:
# Turn on memory growth so TensorFlow allocates GPU memory on demand.
# Every visible GPU is configured (not just index 0): TensorFlow requires the
# setting to be identical across GPUs, and iterating also avoids an IndexError
# when the runtime has no GPU attached (the GPU check in a later cell then
# reports that case with a clear message).
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# Print the NVIDIA driver's view of the GPU(s) attached to this runtime.
!nvidia-smi

In [None]:
import os  # used by a later cell to set TFHUB_MODEL_LOAD_FORMAT

# tf.test.is_gpu_available() is deprecated; TensorFlow's documented replacement
# is to check whether tf.config.list_physical_devices('GPU') is non-empty.
if tf.config.list_physical_devices('GPU'):
  # The model is later built, compiled and fitted inside strategy.scope().
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  # Abort early rather than attempt BERT fine-tuning without a GPU.
  raise ValueError('Running on CPU is not recomended.')

In [None]:
# Ask tensorflow_hub to load models in "UNCOMPRESSED" format.
# NOTE(review): presumably this makes TF Hub read the model directly from
# remote storage instead of downloading a compressed archive -- confirm
# against the TF Hub caching documentation.
os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"
# TF Hub handles for the uncased English BERT encoder and the uncased English
# preprocessing model that feeds it (see build_classifier_model).
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
# Directory on the mounted Google Drive that the training dataset is read from.
train_dir = '/content/gdrive/MyDrive/DL/dataset/bert/train'

In [None]:
# Let tf.data pick the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# Build a batched text dataset from the files under train_dir.
# NOTE(review): presumably train_dir contains one subdirectory per class and
# the labels are inferred from those directory names -- confirm against the
# layout on Drive.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    seed=seed)

# Class names as reported by the dataset object.
# NOTE(review): the label index of each class presumably follows this list's
# order; the inference loop's `score < 0.5` rule depends on it -- confirm.
class_names = raw_train_ds.class_names
# Cache the dataset and prefetch batches for the training loop.
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
def build_classifier_model():
    """Assemble the BERT-based binary text classifier as a Keras model.

    Raw strings pass through the TF Hub preprocessing layer and the trainable
    BERT encoder; the encoder's ``pooled_output`` then goes through dropout
    (rate 0.1) into a single-unit dense layer with no activation.

    Returns:
        A ``tf.keras.Model`` whose input is the string tensor named ``text``
        and whose output is the un-activated (logit) output of the
        ``classifier`` layer.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Both hub layers are loaded from the module-level TF Hub handles.
    tokenize = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    pooled = bert(tokenize(raw_text))['pooled_output']

    regularized = tf.keras.layers.Dropout(0.1)(pooled)
    logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(regularized)
    return tf.keras.Model(raw_text, logits)

In [None]:
# Build, compile and fit the classifier under the distribution strategy.
with strategy.scope():
    # The classifier head has no activation, so the model emits logits and the
    # loss is told to interpret its inputs as such.
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    # BinaryAccuracy thresholds the raw model output. For logits the decision
    # boundary is 0.0 (sigmoid(0) == 0.5); the default threshold of 0.5 would
    # report accuracy against a different cut-off than the sigmoid/0.5 rule
    # used at inference time.
    metrics = tf.metrics.BinaryAccuracy(threshold=0.0)
    epochs = 3
    # Number of batches in one pass over the training set.
    steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
    num_train_steps = steps_per_epoch * epochs
    # Warm up the learning rate over the first 10% of the training steps.
    num_warmup_steps = int(0.1*num_train_steps)

    init_lr = 3e-5
    optimizer = optimization.create_optimizer(
        init_lr=init_lr,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        optimizer_type='adamw')
    classifier_model = build_classifier_model()
    classifier_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
    print(f'Training model with {tfhub_handle_encoder}')
    history = classifier_model.fit(x=train_ds, epochs=epochs)

## Classifying Facebook comments

In [None]:
import pickle
from tqdm import tqdm

In [None]:
# Load the Facebook comments from Drive. The context manager closes the file
# handle, which the previous bare open() call left open.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file if its origin is trusted.
with open('/content/gdrive/MyDrive/DL/Facebook/dataset/comments.pkl', 'rb') as f:
    comments = pickle.load(f)

In [None]:
# Accumulators filled by the scoring loop in the next cell:
# the comments with their scores attached, and how many scored below 0.5.
classified_comments = []
ad_hominem_cnt = 0

In [None]:
# Start from clean accumulators so that re-running this cell does not append
# duplicate entries or double count.
classified_comments = []
ad_hominem_cnt = 0

# Score every comment with the trained classifier.
for x in tqdm(comments):
    # The model outputs a logit for the single-element batch; sigmoid maps it
    # to a score in (0, 1). training=False makes inference mode explicit
    # (dropout disabled).
    result = tf.sigmoid(classifier_model(tf.constant([x['text']]), training=False))
    score = float(result[0][0])
    # The score is stored on the comment dict itself, so entries of `comments`
    # are modified in place and shared with `classified_comments`.
    x['score'] = score
    classified_comments.append(x)
    # NOTE(review): scores below 0.5 are counted as ad hominem; this assumes
    # label 0 of the training data is the ad hominem class -- confirm against
    # `class_names`.
    if score < 0.5:
        ad_hominem_cnt += 1

In [None]:
# Number of comments whose score fell below 0.5.
print(ad_hominem_cnt)

In [None]:
# Fraction of scored comments that fell below 0.5.
# Raises ZeroDivisionError if no comments were scored.
ad_hominem_cnt / len(classified_comments)

In [None]:
# Persist the scored comments (each dict now carries a 'score' key) to Drive.
with open('/content/gdrive/MyDrive/DL/Facebook/dataset/classified_comments.pkl', 'wb') as f:
    pickle.dump(classified_comments, f)

In [None]:
# Display the first scored comment as a quick sanity check.
classified_comments[0]