<a href="https://colab.research.google.com/github/uwa234/Data-Science-Projects/blob/master/Jigsaw_Toxic_Comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install bert-for-tf2 



In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
!pip install sentencepiece



In [0]:
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert

In [12]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [32]:
#Training data
train = pd.read_csv('/content/gdrive/My Drive/jigsaw-toxic-comment-train.csv').sample(frac = 0.40)
print('Training data shape: ', train.shape)
train.head()

Training data shape:  (89420, 8)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
43982,755f5cd81440fbae,In what sense do you find the article dishones...,0,0,0,0,0,0
49222,8390fb673ddbbe1c,"""\nAgain, you have not provided aqequate proof...",0,0,0,0,0,0
87901,eb1f6356e6a13e47,in my entry on Raymond Arritt's talk page,0,0,0,0,0,0
190160,7a150db044d5d08c,"Fairly ridiculous, go ahead Evb-wiki and merge...",0,0,0,0,0,0
102647,2552f56639fe1cd9,User:Notespace is Getting Blocked \n\nNo objec...,0,0,0,0,0,0


In [34]:
#Validation data
val = pd.read_csv('/content/gdrive/My Drive/validation.csv')
print('Validation data shape: ', val.shape)
val.head()

Validation data shape:  (8000, 4)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
43982,755f5cd81440fbae,In what sense do you find the article dishones...,0,0,0,0,0,0
49222,8390fb673ddbbe1c,"""\nAgain, you have not provided aqequate proof...",0,0,0,0,0,0
87901,eb1f6356e6a13e47,in my entry on Raymond Arritt's talk page,0,0,0,0,0,0
190160,7a150db044d5d08c,"Fairly ridiculous, go ahead Evb-wiki and merge...",0,0,0,0,0,0
102647,2552f56639fe1cd9,User:Notespace is Getting Blocked \n\nNo objec...,0,0,0,0,0,0


In [35]:
#Test data
test = pd.read_csv('/content/gdrive/My Drive/test.csv')
print('Test shape: ', test.shape)
test.head()

Test shape:  (63812, 3)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
43982,755f5cd81440fbae,In what sense do you find the article dishones...,0,0,0,0,0,0
49222,8390fb673ddbbe1c,"""\nAgain, you have not provided aqequate proof...",0,0,0,0,0,0
87901,eb1f6356e6a13e47,in my entry on Raymond Arritt's talk page,0,0,0,0,0,0
190160,7a150db044d5d08c,"Fairly ridiculous, go ahead Evb-wiki and merge...",0,0,0,0,0,0
102647,2552f56639fe1cd9,User:Notespace is Getting Blocked \n\nNo objec...,0,0,0,0,0,0


In [0]:
import re

def clean(text):
    text = text.fillna("fillna").str.lower()
    text = text.map(lambda x: re.sub('\\n',' ',str(x)))
    text = text.map(lambda x: re.sub("\[\[User.*",'',str(x)))
    text = text.map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    text = text.map(lambda x: re.sub("\(http://.*?\s\(http://.*\)",'',str(x)))
    return text

val["comment_text"] = clean(val["comment_text"])
test["content"] = clean(test["content"])
train["comment_text"] = clean(train["comment_text"])

In [0]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [0]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2",
                            trainable=True)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [39]:
tokenizer.tokenize("don't be so judgmental")

['don', "'", 't', 'be', 'so', 'judgment', '##al']

In [0]:
train_input = bert_encode(train.comment_text.values, tokenizer, max_len=160)
test_input = bert_encode(test.content.values, tokenizer, max_len=160)
val_input = bert_encode(val.comment_text.values, tokenizer, max_len=160)
train_labels = train.toxic.values
val_labels = val.toxic.values

In [0]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

In [0]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    hidden1 = Dense(100, activation='relu')(clf_output)
    hidden2 = Dense(50, activation='relu')(hidden1)
    out = Dense(1, activation='sigmoid')(hidden2)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [43]:
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 177853441   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [44]:
train_history = model.fit(
    train_input, train_labels,
    validation_split = 0.2,
    epochs=2,
    batch_size=16
)

Epoch 1/2
Epoch 2/2


In [0]:
model.save('bert100_50_tocix.h5')

In [47]:
print(train_history)

<tensorflow.python.keras.callbacks.History object at 0x7f37664ee400>


In [0]:
test_pred = model.predict(test_input)

In [0]:
#from sklearn.metrics import confusion_matrix, classification_report
#print(confusion_matrix(test_input, test_pred))
#print(classification_report(y_test, y_pred.round().astype(int)))

In [51]:
#Sample Submission
sub = pd.read_csv('/content/gdrive/My Drive/sample_submission.csv')
print('Submission shape: ', test.shape)
sub.head()

Submission shape:  (63812, 3)


Unnamed: 0,id,toxic
0,0,0.5
1,1,0.5
2,2,0.5
3,3,0.5
4,4,0.5


In [0]:
submission_toxic = sub
submission_toxic['toxic'] = test_pred.round().astype(int)
submission_toxic.to_csv('submission_toxic.csv', index=False)

In [0]:
#submission = pd.read_csv("sample_submission.csv")
#submission['target'] = test_pred.round().astype(int)
#submission.to_csv('submission.csv', index=False)