# Classifying Create Debate comments
* __Objective__: Train a BERT (TensorFlow) model on the annotated Change My View dataset, use it to compute a confidence score for each comment in the Create Debate corpus, and use the Perspective API to get scores for various attributes such as toxicity, insult, etc.
* __File Management__: Using Google Drive
* __Runtime Type__: GPU
* __Note__: Run this notebook only for a new version of the Create Debate corpus; the data for previous versions is already saved in Google Drive.

## Mounting Google Drive and cloning CreateDebate-Scraper API

In [None]:
from google.colab import drive
# Attach Google Drive at /content/gdrive. force_remount=True asks Colab to
# mount again even if a mount is already present.
drive.mount(
    '/content/gdrive',
    force_remount=True,
)

In [None]:
# Fetch the CreateDebateScraper sources into the current working directory.
!git clone https://github.com/utkarsh512/CreateDebateScraper.git

In [None]:
# Switch into the cloned repository's src/nested directory.
# NOTE(review): presumably this is where the `thread` module imported in the
# next cell lives — confirm against the repository layout.
%cd CreateDebateScraper/src/nested/

In [None]:
from thread import Thread, Comment
import pickle
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import re

## Training BERT TF model on Change-My-View dataset

In [None]:
# Quietly (-q) install the extra packages needed by the imports in the next cell.
# NOTE(review): tensorflow-text presumably provides `tensorflow_text` and
# tf-models-official the `official.nlp` package — confirm against the package docs.
!pip install -q tensorflow-text
!pip install -q tf-models-official

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# NOTE(review): the module bound to `text` is never read by name in the visible
# cells; presumably it is imported only for its side effects — confirm.
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

# Restrict TensorFlow's Python logger to messages at ERROR level or above.
tf.get_logger().setLevel('ERROR')

In [None]:
# Allow memory growth on every visible GPU.
# Looping (instead of indexing element 0) keeps the setting uniform across all
# GPUs that MirroredStrategy will later use, and does not raise IndexError when
# the list is empty; the GPU requirement itself is enforced by a later cell.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [None]:
# Print the GPU status reported by the nvidia-smi command-line tool.
!nvidia-smi

In [None]:
import os

# Require a GPU and create the distribution strategy used for training.
# `tf.test.is_gpu_available()` is deprecated; its deprecation notice points to
# `tf.config.list_physical_devices('GPU')`, which returns an empty (falsy) list
# when no GPU is visible.
if tf.config.list_physical_devices('GPU'):
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  raise ValueError('Running on CPU is not recomended.')

In [None]:
# Ask TF Hub to load models in the "UNCOMPRESSED" format.
os.environ['TFHUB_MODEL_LOAD_FORMAT'] = 'UNCOMPRESSED'

# TF Hub handles for the BERT encoder and the BERT preprocessing model.
tfhub_handle_encoder = (
    'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
)
tfhub_handle_preprocess = (
    'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
)

# Directory (on the mounted Drive) holding the training text files.
train_dir = '/content/gdrive/MyDrive/DL/dataset/bert/train'

In [None]:
# Input-pipeline settings.
seed = 42
batch_size = 32
AUTOTUNE = tf.data.AUTOTUNE

# Build the batched text dataset from the files under train_dir.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory=train_dir,
    batch_size=batch_size,
    seed=seed,
)

# Class names as reported by the dataset object.
class_names = raw_train_ds.class_names

# Cache the dataset and prefetch batches with an auto-tuned buffer size.
train_ds = (
    raw_train_ds
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

In [None]:
def build_classifier_model():
  """Assemble the text classifier used in this notebook.

  The graph is: string input ('text') -> TF Hub preprocessing layer
  ('preprocessing') -> trainable TF Hub BERT encoder ('BERT_encoder') ->
  dropout (rate 0.1) on the encoder's 'pooled_output' -> dense layer with one
  unit and no activation ('classifier').

  Returns:
    A `tf.keras.Model` mapping the string input to the un-activated output of
    the 'classifier' layer.
  """
  raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  # Raw strings -> encoder inputs.
  tokens = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)

  # The encoder is fine-tuned together with the classification head.
  bert = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  pooled = bert(tokens)['pooled_output']

  # Classification head: dropout followed by a single linear unit.
  regularized = tf.keras.layers.Dropout(0.1)(pooled)
  logits = tf.keras.layers.Dense(1, activation=None, name='classifier')(regularized)

  return tf.keras.Model(raw_text, logits)

In [None]:
# Training schedule. These are plain scalar values, so they are computed
# before entering the strategy scope.
epochs = 3
init_lr = 3e-5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)  # first 10% of steps are warm-up

with strategy.scope():
    # The model output has no activation, so the loss is told to expect logits.
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    metrics = tf.metrics.BinaryAccuracy()

    optimizer = optimization.create_optimizer(
        init_lr=init_lr,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        optimizer_type='adamw',
    )

    classifier_model = build_classifier_model()
    classifier_model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=metrics,
    )

    print(f'Training model with {tfhub_handle_encoder}')
    history = classifier_model.fit(x=train_ds, epochs=epochs)

## Classifying CreateDebate Comments

In [None]:
TOPIC = 'world' # change this to access different datasets

# Extracting thread objects
#
# threads.log is read as a sequence of back-to-back pickled objects, one
# pickle.load() call per object, until the file is exhausted.
# NOTE: unpickling can execute arbitrary code; only load logs you produced yourself.

reader_addr = f'/content/gdrive/MyDrive/DL/CreateDebate/{TOPIC}/threads.log'
threads = []
with open(reader_addr, 'rb') as reader:
    try:
        while True:
            threads.append(pickle.load(reader))
    except EOFError:
        # Normal termination: pickle.load raises EOFError at end of file.
        # Any other error (e.g. a corrupt record) is deliberately not swallowed.
        pass

# Group comment bodies by author and count every comment seen.
# `authors` maps author -> list of comment bodies, in iteration order.
# A plain dict with setdefault is used (not defaultdict) so that the object
# pickled to authors.log at the end of the cell keeps the same type.
authors = dict()

tot_comment_cnt = 0

for thread in threads:
    for comment in thread.comments.values():
        tot_comment_cnt += 1
        authors.setdefault(comment.author, []).append(comment.body)

# Per-author accumulators, seeded for every author found above:
#   ah_comment_cnt / none_comment_cnt : author -> running count (starts at 0)
#   comment_score                     : author -> list of scores (starts empty)
ah_comment_cnt = dict.fromkeys(authors, 0)
none_comment_cnt = dict.fromkeys(authors, 0)
comment_score = {author: list() for author in authors}

# Flat list of scored comments across all authors.
comments_with_score = list()

# Classifying comments
#
# Each comment is passed through classifier_model on its own; the sigmoid of
# the model's single output is the comment's score. Scores below 0.5 are
# tallied in ah_comment_cnt, all others in none_comment_cnt.

cur_author_cnt = 0
cur_comment_cnt = 0
tot_author_cnt = len(authors)

with tqdm(total=tot_comment_cnt) as pbar:
    for author, author_comments in authors.items():
        cur_author_cnt += 1
        for comment_body in author_comments:
            cur_comment_cnt += 1
            # One-element batch. Deliberately not named `text`, so the
            # `tensorflow_text as text` module alias is not rebound.
            comment_batch = [comment_body]
            result = tf.sigmoid(classifier_model(tf.constant(comment_batch)))
            score = float(result[0][0])
            # The one-element list is stored as-is, keeping the saved
            # (score, [body]) record format.
            comments_with_score.append((score, comment_batch))
            comment_score[author].append(score)
            if score < 0.5:
                ah_comment_cnt[author] += 1
            else:
                none_comment_cnt[author] += 1
            pbar.update(1)

# Saving the results in the drive
#
# Every result object is pickled to its own .log file inside the topic folder.
# NOTE: `dir` shadows the builtin of the same name; the name is kept as-is.

dir = f'/content/gdrive/MyDrive/DL/CreateDebate/{TOPIC}/'

# (file name, object) pairs, written in this order.
for log_name, payload in (
    ('ah_comment_cnt.log', ah_comment_cnt),
    ('none_comment_cnt.log', none_comment_cnt),
    ('comment_score.log', comment_score),
    ('comments_with_score.log', comments_with_score),
    ('authors.log', authors),
):
    with open(dir + log_name, 'wb') as f:
        pickle.dump(payload, f)