In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf # takes 3 min to load!
import transformers
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import re
import emoji
import gc

In [2]:
## Constants
# Configuration shared by the cells below. Several of these names are not
# referenced by any visible cell in this notebook (NOTE(review): presumably
# carried over from the training notebook -- confirm before removing).
target_key = "score"  # name of the target/score column
text_key = "text" #"txt"  # column holding the raw comment text fed to the tokenizer
batch_size = 32  # default batch size of CivilDataGenerator and of the inference run
bert_path =  "bert-base-uncased"  # identifier passed to BertTokenizer.from_pretrained
train_path = "../input/clean-civil-data-jigsaw-downsampled/clean_civil.csv"
lstm_hidden_dim = 64 # from 768 (bert) to 64  # the decrease is steep; you may lose information
dropout_rate = 0.3
output_dim = 1
max_length = 350  # token length every text is padded/truncated to in CivilDataGenerator
n_epochs = 1
checkpoint_filepath = "./bert/ckpt-loss={loss:.5f}-epoch={epoch}-batch={batch}"
save_after_batches = 5000
test_size_percent =  0.05
test_path = "./train/removed_redundant_ruddit_with_text.csv"  # CSV of comments to score
val_path = "./train/validation_data.csv"
load_model_path = "./output/bert-2-bilstm/bert/final_model"  # saved model loaded for inference

## COMMENTS TO SCORE EVAL

In [5]:
# Read the comments to be scored from `test_path`; the text column used later
# is selected with `text_key`.
comments_to_score_df = pd.read_csv(test_path)
# Display (rows, columns) as a quick sanity check of the load.
comments_to_score_df.shape

(5710, 3)

In [4]:
# Maps a canonical replacement string (key) to the list of regular expressions
# (value) whose matches are rewritten to that replacement by
# `translateAbuseWords`. Dict order matters: patterns are applied in insertion
# order, so earlier entries rewrite the text seen by later ones.
#
# Every pattern containing a backslash is a raw string so that regex escapes
# such as `\$` or `\*` are not treated as (invalid) Python string escapes.
# Each pattern is its own list element: two adjacent string literals without a
# separating comma are silently concatenated by Python into a single pattern.
RE_PATTERNS = {
    ' american ':
        [
            'amerikan'
        ],
    ' adolf ':
        [
            'adolf'
        ],
    ' hitler ':
        [
            'hitler'
        ],
    ' fuck':
        [
            '(f)(u|[^a-z0-9 ])(c|[^a-z0-9 ])(k|[^a-z0-9 ])([^ ])*',
            '(f)([^a-z]*)(u)([^a-z]*)(c)([^a-z]*)(k)',
            r' f[!@#\$%\^\&\*]*u[!@#\$%\^&\*]*k', 'f u u c',
            '(f)(c|[^a-z ])(u|[^a-z ])(k)', r'f\*',
            'feck ', ' fux ', r'f\*\*',
            r'f\-ing', r'f\.u\.', 'f###', ' fu ', 'f@ck', 'f u c k', 'f uck', 'f ck'
        ],
    ' ass ':
        [
            '[^a-z]ass ', '[^a-z]azz ', 'arrse', ' arse ', r'@\$\$',
            '[^a-z]anus', r' a\*s\*s', '[^a-z]ass[^a-z ]',
            r'a[@#\$%\^&\*][@#\$%\^&\*]', '[^a-z]anal ', 'a s s'
        ],
    ' ass hole ':
        [
            ' a[s|z]*wipe', 'a[s|z]*[w]*h[o|0]+[l]*e', r'@\$\$hole'
        ],
    ' bitch ':
        [
            'b[w]*i[t]*ch', 'b!tch',
            r'bi\+ch', r'b!\+ch', '(b)([^a-z]*)(i)([^a-z]*)(t)([^a-z]*)(c)([^a-z]*)(h)',
            'biatch', r'bi\*\*h', 'bytch', 'b i t c h'
        ],
    ' bastard ':
        [
            'ba[s|z]+t[e|a]+rd'
        ],
    ' trans gender':
        [
            'transgender'
        ],
    ' gay ':
        [
            'gay'
        ],
    ' cock ':
        [
            '[^a-z]cock', 'c0ck', '[^a-z]cok ', 'c0k', '[^a-z]cok[^aeiou]', ' cawk',
            '(c)([^a-z ])(o)([^a-z ]*)(c)([^a-z ]*)(k)', 'c o c k'
        ],
    ' dick ':
        [
            ' dick[^aeiou]', 'deek', 'd i c k'
        ],
    ' suck ':
        [
            'sucker', '(s)([^a-z ]*)(u)([^a-z ]*)(c)([^a-z ]*)(k)', 'sucks', '5uck', 's u c k'
        ],
    ' cunt ':
        [
            'cunt', 'c u n t'
        ],
    ' bull shit ':
        [
            r'bullsh\*t', r'bull\$hit'
        ],
    ' homo sex ual':
        [
            'homosexual'
        ],
    ' jerk ':
        [
            'jerk'
        ],
    ' idiot ':
        [
            'i[d]+io[t]+', '(i)([^a-z ]*)(d)([^a-z ]*)(i)([^a-z ]*)(o)([^a-z ]*)(t)', 'idiots',
            'i d i o t'
        ],
    ' dumb ':
        [
            '(d)([^a-z ]*)(u)([^a-z ]*)(m)([^a-z ]*)(b)'
        ],
    ' shit ':
        [
            'shitty', '(s)([^a-z ]*)(h)([^a-z ]*)(i)([^a-z ]*)(t)', 'shite', r'\$hit', 's h i t'
        ],
    ' shit hole ':
        [
            'shythole'
        ],
    ' retard ':
        [
            'returd', 'retad', 'retard', 'wiktard', 'wikitud'
        ],
    ' rape ':
        [
            ' raped'
        ],
    ' dumb ass':
        [
            'dumbass', 'dubass'
        ],
    ' ass head':
        [
            'butthead'
        ],
    ' sex ':
        [
            'sexy', 's3x', 'sexuality'
        ],
    ' nigger ':
        [
            'nigger', 'ni[g]+a', ' nigr ', 'negrito', 'niguh', 'n3gr', 'n i g g e r'
        ],
    ' shut the fuck up':
        [
            'stfu'
        ],
    ' pussy ':
        [
            'pussy[^c]', 'pusy', 'pussi[^l]', 'pusses'
        ],
    ' faggot ':
        [
            'faggot', ' fa[g]+[s]*[^a-z ]', 'fagot', 'f a g g o t', 'faggit',
            '(f)([^a-z ]*)(a)([^a-z ]*)([g]+)([^a-z ]*)(o)([^a-z ]*)(t)', 'fau[g]+ot', 'fae[g]+ot',
        ],
    ' mother fucker':
        [
            ' motha ', ' motha f', ' mother f', 'motherucker',
        ],
    ' whore ':
        [
            r'wh\*\*\*', 'w h o r e'
        ],
}

In [5]:
# Lookup table from an English contraction (using a plain ASCII apostrophe) to
# its expanded form. `clean_contractions` looks up whole space-separated
# tokens in this table, so keys are matched exactly and case-sensitively.
CONTRACTION_MAPPING = {
    "ain't": "is not",
    "'cause": "because",
    "could've": "could have",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

def translateAbuseWords(text, patterns=RE_PATTERNS):
    """Rewrite obfuscated spellings in `text` to their canonical replacement.

    Args:
        text: string to normalise.
        patterns: mapping of replacement string -> iterable of regex patterns;
            every match of a pattern is substituted with its replacement.

    Returns:
        The rewritten string.
    """
    # Collapse any character repeated three or more times in a row to a single one.
    text = re.sub(r"(.)\1{2,}", r'\1', text)
    # Apply the substitutions sequentially, in mapping order, so each pattern
    # sees the output of all previous substitutions.
    for replacement, regex_list in patterns.items():
        for regex in regex_list:
            text = re.sub(regex, replacement, text)
    return text


def clean_contractions(text, mapping=CONTRACTION_MAPPING):
    """Expand contractions found in `text`.

    Apostrophe look-alikes are first normalised to a plain "'", then every
    token obtained by splitting on a single space is replaced by its entry in
    `mapping` when one exists (exact match), and left untouched otherwise.

    Args:
        text: string to process.
        mapping: contraction -> expansion lookup table.

    Returns:
        The tokens re-joined with single spaces.
    """
    for quote_variant in ("’", "‘", "´", "`"):
        text = text.replace(quote_variant, "'")

    expanded_tokens = []
    for token in text.split(" "):
        expanded_tokens.append(mapping.get(token, token))
    return ' '.join(expanded_tokens)

def social_media_clean(full_line):
    """Strip social-media artefacts from a line of text.

    Steps, in order: drop the '#' of hashtags, blank out URLs and @mentions,
    convert emoji to their textual form with `emoji.demojize`, pad every
    ':...:' span with spaces, and collapse runs of spaces.

    Args:
        full_line: string to clean.

    Returns:
        The cleaned string.
    """
    # "#BanTrump" -> "BanTrump": keep the tag text, drop the leading '#'.
    cleaned = re.sub(r'#([^ ]*)', r'\1', full_line)
    # URLs (http://, https:// or www.) -> a single space.
    cleaned = re.sub(r'https?://\S+|www\.\S+', ' ', cleaned)
    # "@user" at line start or after a non-identifier character -> a single space.
    cleaned = re.sub(r'(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_]+)', ' ', cleaned)
    # Emoji characters -> their text description.
    cleaned = emoji.demojize(cleaned)
    # Surround each ":...:" span with spaces (the colons themselves are kept).
    cleaned = re.sub(r'(:.*?:)', r' \1 ', cleaned)
    # Collapse consecutive spaces into one.
    return re.sub(' +', ' ', cleaned)


def preprocess(full_line):
    """Run the full text-cleaning pipeline on one comment.

    Lower-cases the text, expands contractions, rewrites obfuscated abuse
    words, applies the social-media clean-up and finally replaces every
    character that is not an ASCII letter or a digit with a space.

    Args:
        full_line: raw comment string.

    Returns:
        The cleaned string.
    """
    lowered = full_line.lower()
    expanded = clean_contractions(lowered)
    normalised = translateAbuseWords(expanded)
    cleaned = social_media_clean(normalised)
    # This also turns the ':' and '_' of demojized emoji names into spaces.
    return re.sub(r"[^a-zA-Z\d]", " ", cleaned)

In [6]:
# comments_to_score_df[text_key] = comments_to_score_df[text_key].apply(lambda x: preprocess(x)) # PREPROCESS

In [6]:
class CivilDataGenerator(tf.keras.utils.Sequence): # could optimize more like BucketIterator for padding
    """Keras `Sequence` that tokenizes batches of texts on the fly.

    Each batch is encoded with the supplied tokenizer, padded/truncated to the
    module-level `max_length`, and returned as int32 `input_ids` and
    `attention_mask` arrays, optionally together with float32 targets.

    Args:
        texts: numpy array of strings (indexed with an integer index array).
        scores: array of targets aligned with `texts`; may be None when
            `include_targets` is False.
        tokenizer: tokenizer exposing `batch_encode_plus`.
        batch_size: number of texts per batch.
        shuffle: whether to shuffle the sample order at construction time and
            at the end of every epoch.
        include_targets: when True batches are `(inputs, scores)`, otherwise
            only `inputs` (inference mode).
    """

    def __init__(self, texts, scores, tokenizer, batch_size=batch_size, shuffle=True, include_targets=True): # texts -> numpy array
        self.texts = texts
        self.scores = scores
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.include_targets = include_targets
        # BERT tokenizer used to encode the text.
        self.tokenizer = tokenizer
        # Sample order; batches are slices of this index array.
        self.indexes = np.arange(len(self.texts))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (the last one may be partial).

        Ceiling division. The previous expression
        `n // b + 1 if n % b != 0 else 0` parsed as
        `(n // b + 1) if n % b != 0 else 0` and therefore reported 0 batches
        whenever the number of texts was an exact multiple of the batch size.
        """
        return (len(self.texts) + self.batch_size - 1) // self.batch_size

    def on_epoch_end(self):
        """Shuffle the sample order in place when `shuffle` is enabled."""
        if self.shuffle:
            # A fresh RandomState with a fixed seed is created on every call,
            # so the same permutation is applied each time.
            np.random.RandomState(42).shuffle(self.indexes)

    def __getitem__(self, idx): # idx -> index batch
        """Build batch number `idx`.

        Returns:
            `[input_ids, attention_masks], scores` when `include_targets` is
            True, otherwise `[input_ids, attention_masks]`.
        """
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        texts = self.texts[indexes]

        # Tokenize the whole batch at once; every text is a single sequence
        # (no sentence pairs), hence token type ids are not requested.
        encoded = self.tokenizer.batch_encode_plus(
            texts.tolist(),
            add_special_tokens=True,
            max_length=max_length, # bert has 512 max length; we provide our own
            return_attention_mask=True, # needed because inputs are padded to max length
            return_token_type_ids=False, # only needed when there are two sentences
            padding='max_length',
            return_tensors="tf",
            truncation=True,
        )

        # Convert the encoded features to numpy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")

        # Targets are only attached for training/validation generators.
        if self.include_targets:
            scores = np.array(self.scores[indexes], dtype="float32")
            return [input_ids, attention_masks], scores
        else:
            return [input_ids, attention_masks]
        

In [8]:

# # Encoded token ids from BERT tokenizer.
# input_ids = tf.keras.layers.Input(
#     shape=(max_length,), dtype=tf.int32, name="input_ids"
# )
# # Attention masks indicates to the model which tokens should be attended to.
# attention_masks = tf.keras.layers.Input(
#     shape=(max_length,), dtype=tf.int32, name="attention_masks"
# )

# # Loading pretrained BERT model.
# bert_model = transformers.TFBertModel.from_pretrained(bert_path)
# # Freeze the BERT model to reuse the pretrained features without modifying them.
# bert_model.trainable = False ## not training bert ##

# bert_output = bert_model.bert(input_ids, attention_mask=attention_masks) # by default hidden_size = 768
# sequence_output = bert_output.last_hidden_state  # (batch_size, sequence_length, hidden_size) # ie each word representation 

# ## for the warning we are good. not using pooled_output for now, https://github.com/huggingface/transformers/issues/5421
# # pooled_output = bert_output.pooler_output # (batch_size, hidden_size) # ie whole text representational (kinda)

# # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
# bi_lstm = tf.keras.layers.Bidirectional(
#     tf.keras.layers.LSTM(lstm_hidden_dim, return_sequences=True) 
# )(sequence_output) # (batch_size,  sequence_length, lstm_hidden_dim*2) # merge_mode="concat"

# bi_lstm = tf.keras.layers.Bidirectional(
#     tf.keras.layers.LSTM(lstm_hidden_dim, return_sequences=True)
# )(bi_lstm) # (batch_size,  sequence_length, lstm_hidden_dim*2) # stacked one more BiLSTM bcos with one stack its converging slowly (kind of plateau)

# # Applying hybrid pooling approach to bi_lstm sequence output.
# avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm) # averages over sequence length # (batch_size, lstm_hidden_dim*2)
# max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm) # (batch_size, lstm_hidden_dim*2)
# concat = tf.keras.layers.concatenate([avg_pool, max_pool]) #(batch_size, lstm_hidden_dim*4) # two (batch_size, lstm_hidden_dim*2) tensors concatenated
# dropout = tf.keras.layers.Dropout(dropout_rate)(concat) #(batch_size, lstm_hidden_dim*4)

# output = tf.keras.layers.Dense(output_dim)(dropout) # 1 since our target is 1 bcos regression 


# model = tf.keras.models.Model(
#     inputs=[input_ids, attention_masks], outputs=output
# )
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(),
#     loss= 'mse',# tf.keras.losses.MeanSquaredError(),
#     metrics=[tf.keras.metrics.MeanSquaredError(), tf.keras.metrics.RootMeanSquaredError()],
# )

# model.summary()

2022-01-27 19:04:15.718113: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-01-27 19:04:15.782073: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-01-27 19:04:15.782479: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-01-27 19:04:15.783464: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow wi

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 350)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 350)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_masks[0][0]            
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 350, 128)     426496      bert[0][0]                   

In [7]:
# Load the complete saved model (architecture + weights) from `load_model_path`.
# Alternative kept for reference: rebuild the architecture in code and restore
# only the weights.
# model.load_weights(load_model_path)
model = tf.keras.models.load_model(load_model_path)

2022-02-03 21:28:57.097865: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-03 21:28:57.157336: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-03 21:28:57.157707: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:923] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-02-03 21:28:57.158919: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow wi

In [8]:
# Tokenizer for the checkpoint named by `bert_path`; `do_lower_case=True`
# lower-cases the input text during tokenization.
tokenizer = transformers.BertTokenizer.from_pretrained(bert_path, do_lower_case=True)

In [9]:
# Inference-mode generator over the comments to score: no targets, and no
# shuffling so that predictions stay aligned with the DataFrame rows.
test_data = CivilDataGenerator(
    comments_to_score_df[text_key].values,
    None, # no target while inferring
    tokenizer,
    batch_size=batch_size,
    shuffle=False,
    include_targets=False # added for inference
)

In [10]:
# preds = np.zeros((comments_to_score_df.shape[0],1))
# Run inference on the first GPU; `test_data` yields batches without targets.
with tf.device('/device:GPU:0'):
    preds = model.predict(
        test_data,
        use_multiprocessing=True, # can only be used when x, y are generators
        # NOTE(review): Keras documents `workers` as a count of worker
        # threads/processes; confirm that -1 enables any parallelism here
        # rather than silently falling back to single-threaded loading.
        workers=-1,
        verbose=1,
    )

# Display the prediction array's shape as a sanity check.
preds.shape

2022-02-03 21:29:19.754690: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-02-03 21:29:22.331327: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2022-02-03 21:29:26.040137: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8201




(5710, 1)

In [12]:
# Attach the predictions as a new "score" column (this mutates the DataFrame
# loaded earlier). Values may be negative as well.
comments_to_score_df["score"] = preds # negatives as well
# comments_to_score_df.head()
# Persist only the id and predicted score.
# NOTE(review): the file name says "epoch-2" -- confirm it matches the loaded model.
comments_to_score_df[["comment_id", "score"]].to_csv("./output/bert-2-bilstm-epoch-2-ruddit-preds.csv", index=False)