This will serve as the notebook for testing my topic modeling portion of this project.

Once everything has been verified to work, this notebook will be converted to a .py script so that it can be run on new incoming data.

In [8]:
import os
from sklearn.model_selection import train_test_split
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [50]:
# Load the raw comments CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer=r"data/2014_funny_comments.csv",
)

Our category labels must be encoded as integers starting from 0 for ktrain to work.

In [51]:
# Keep only the rows that actually have comment text. The explicit .copy()
# makes `df` an independent frame, so adding the 'flag' column below cannot
# trigger pandas' SettingWithCopyWarning.
df = df[df.body.notnull()].copy()

# Bin the raw score into three categories, stored in the new 'flag' column:
#   score < 1         -> 'bad'
#   1 <= score <= 13  -> 'okay'
#   score > 13        -> 'good'
conditions = [
    (df['score'] < 1),
    (df['score'] >= 1) & (df['score'] <= 13),
    (df['score'] > 13),
]
values = ['bad', 'okay', 'good']

# np.select falls back to the integer 0 for rows that match no condition
# (e.g. a missing score). That default cannot be combined cleanly with string
# choices, so give an explicit string default instead.
df['flag'] = np.select(conditions, values, default='unknown')

In [None]:
# Remove the time-feature columns and the raw score in place, then preview
# the remaining columns.
df.drop(
    labels=['HOUR_int', 'DAY_int', 'MONTH_int', 'score'],
    axis='columns',
    inplace=True,
)
df.head()

## Start BERT modeling

In [None]:
# Both TF-Hub handles (tfhub_handle_preprocess and tfhub_handle_encoder) come
# from the "Choose a BERT model to fine-tune" code at
# https://www.tensorflow.org/text/tutorials/classify_text_with_bert
# NOTE(review): `hub` and the two handles are not defined in the cells shown
# in this notebook - confirm they are set up before this cell runs.

# Text preprocessing layer that matches the chosen encoder.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

# The BERT encoder itself.
bert_model = hub.KerasLayer(tfhub_handle_encoder)

def build_classifier_model(num_classes=3):
    """Build the BERT fine-tuning classifier as a Keras model.

    Raw strings are passed through the TF-Hub preprocessing layer and the
    trainable BERT encoder. The encoder's 'pooled_output' then goes through
    dropout and a final dense layer with no activation, so the model emits
    raw logits.

    Relies on the module-level names `tf`, `hub`, `tfhub_handle_preprocess`
    and `tfhub_handle_encoder`.

    Args:
        num_classes: Number of logits produced per input. Defaults to 3, one
            per score category ('bad', 'okay', 'good'). A single logit cannot
            be trained with the multi-class SparseCategoricalCrossentropy
            loss this notebook uses.

    Returns:
        A tf.keras.Model mapping the string input named 'body' to
        `num_classes` logits per example.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='body')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    # trainable=True so the encoder weights are fine-tuned, not frozen.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    # No activation: the loss is configured with from_logits=True.
    net = tf.keras.layers.Dense(num_classes, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

# Instantiate the classifier. The factory is named `build_classifier_model`
# (all lower-case); calling it with a capital "C" raises NameError.
classifier_model = build_classifier_model()
tf.keras.utils.plot_model(classifier_model)  # Check model structure


In [None]:
#### Model Training

In [None]:
# Multi-class classification: labels are integer class ids and the model
# outputs raw logits (hence from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# tf.metrics.Accuracy() tests y_pred == y_true element-wise, which is
# meaningless when y_pred holds logits. SparseCategoricalAccuracy takes the
# argmax of the logits and compares it with the integer label. It is named
# 'accuracy' so the training history keeps the 'accuracy' / 'val_accuracy'
# keys that the plotting cell reads.
metrics = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

#### Optimizer

In [None]:
# Training schedule hyper-parameters.
epochs = 5
init_lr = 3e-5

# TODO: replace train_ds with our actual training data.
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
# Warm the optimizer up over the first 10% of the training steps.
num_warmup_steps = int(0.1 * num_train_steps)

# AdamW optimizer built from the schedule above.
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

#### Load BERT model and train

In [None]:
# Attach the optimizer, loss and metric to the model before training.
classifier_model.compile(loss=loss, metrics=metrics, optimizer=optimizer)
print(f"Training model with {tfhub_handle_encoder}")

# TODO: replace train_ds / val_ds with the actual training and validation data.
history = classifier_model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
)

#### Evaluate Model

In [None]:
# TODO: replace test_ds with the actual test data.
# The results go into test_* names so that `loss` keeps pointing at the loss
# object the compile cell needs; reusing `loss` here would break a re-run of
# that cell.
test_loss, test_accuracy = classifier_model.evaluate(test_ds)
print(f"Loss: {test_loss}")
print(f"Accuracy: {test_accuracy}")

In [None]:
# Per-epoch metrics recorded by classifier_model.fit().
history_dict = history.history
print(history_dict.keys())

# Distinct names are used so this cell does not overwrite `loss` (the loss
# object) or `epochs` (the integer epoch count) that the training cells need.
train_acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']

epoch_range = range(1, len(train_acc) + 1)
fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

# Top panel: training vs. validation loss.
loss_ax.plot(epoch_range, train_loss, 'r', label='Training loss')
loss_ax.plot(epoch_range, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

# Bottom panel: training vs. validation accuracy.
acc_ax.plot(epoch_range, train_acc, 'r', label='Training acc')
acc_ax.plot(epoch_range, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# tight_layout must run after the axes are populated to have any effect.
fig.tight_layout()

#### Export Model

In [None]:
# Build the export directory from the dataset name; any '/' in the name is
# replaced with '_' so it stays a single path component.
dataset_name = "redditPredict"
saved_model_path = './data/models/{}_bert'.format(
    dataset_name.replace('/', '_')
)

# Save the model without the optimizer state.
classifier_model.save(filepath=saved_model_path, include_optimizer=False)

#### Fine Tuning BERT

In [52]:
# One-hot encode the categorical 'flag' column: one indicator column per
# category. dtype='uint8' keeps the indicators as 0/1 integers on every pandas
# version (pandas >= 2.0 would otherwise return booleans).
# NOTE(review): one-hot labels do not match the SparseCategoricalCrossentropy
# loss configured in the model-training cell, which expects integer class
# ids - confirm which label format is intended.
ohe_flags = pd.get_dummies(df.flag, dtype='uint8')

# Attach the indicators and drop the columns that are no longer needed: the
# original 'flag', the raw 'score', and the unused time columns.
# errors='ignore' lets this cell run whether or not the earlier drop cell has
# already removed 'score' and the *_int columns; without it a top-to-bottom
# run fails here with KeyError.
df = pd.concat([df, ohe_flags], axis=1).drop(
    columns=['flag', 'score', 'HOUR_int', 'DAY_int', 'MONTH_int'],
    errors='ignore',
)

In [56]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,body,flag,bad,good,okay
0,Ain't no half steppin,okay,0,0,1
1,Inside the gas tank cover or on top of a tire,okay,0,0,1
2,Brought to you by /r/SummerReddit,okay,0,0,1
3,This makes me want to murder.,okay,0,0,1
4,I know someone who got a DUI sleeping it off i...,okay,0,0,1


In [54]:
# Report the class balance. Each count is the sum of a boolean mask, which
# avoids materialising a filtered copy of the frame twice per label.
total_rows = len(df)
print(f"The full df is: {total_rows}")
for label in ('okay', 'bad', 'good'):
    label_count = int((df[label] == 1).sum())
    # Guard against an empty frame so the report never divides by zero.
    share = label_count / total_rows * 100 if total_rows else 0.0
    print(f"The '{label}' is: {label_count} which is {share:.2f}%")

The full df is: 10633648
The 'okay' is: 8004385 which is 75.27%
The 'bad' is: 1582452 which is 14.88%
The 'good' is: 1046811 which is 9.84%


## Tensorflow full

In [None]:
# Sample text ending in an emoji, used for testing emoji replacement.
# The emoji (U+1F602) is written as a Unicode escape so a wrong file encoding
# cannot corrupt it; the literal had degraded into mojibake (its UTF-8 bytes
# decoded as cp1252), which no longer contained an emoji at all.
teststr = "testThis dog \U0001F602"

In [None]:
def custom_standardization(input_data):
    """Standardize raw text before it is vectorized.

    Lower-cases the input and replaces every '<br />' tag with a single
    space. Relies on the module-level name `tf`.

    Args:
        input_data: The text to clean, in whatever form tf.strings.lower
            accepts.

    Returns:
        The lower-cased text with '<br />' tags replaced by spaces.
    """
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    # Return the cleaned text: without this the function yields None and the
    # TextVectorization layer that uses it as `standardize` gets nothing.
    return stripped_html

In [None]:
# Vocabulary size cap and the fixed length of every output sequence.
max_features = 10_000
sequence_length = 250

# Text -> integer-token-id layer that cleans its input with
# custom_standardization first.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)