In [1]:
##########################################
#Text Classification using BERT Embeddings
##########################################

import tensorflow as tf

In [2]:
!pip install -q tensorflow_text

In [3]:
# Importing Dependencies

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [4]:
# Load the hate-speech dataset (label / text columns) from hate.csv

import pandas as pd

df = pd.read_csv("hate.csv")
df.head(5)

Unnamed: 0,label,text
0,hate,"Dear mixed raced mutts, think twice before you..."
1,hate,"Dear mixed racers, think twice before you pass..."
2,hate,"Dear mormons, think twice before you pass by m..."
3,hate,"Dear Muslims, stop saying you have pure intent..."
4,hate,"Dear Muslims, think twice before you pass by m..."


In [5]:
# Per-label summary of the text column: count, unique, most frequent text and its frequency
df.groupby('label').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
hate,16590,16557,gay people deserve to die,2
nothate,13953,13909,There is so much I love about the United Kingd...,3


In [6]:
# (rows, columns) of the loaded frame
df.shape

(30543, 2)

In [7]:
# Class distribution before any balancing is applied
df['label'].value_counts()

hate       16590
nothate    13953
Name: label, dtype: int64

In [8]:
# Keep only the rows whose label is 'hate'
df_hate = df.loc[df['label'].eq('hate')]
df_hate.shape

(16590, 2)

In [9]:
# Keep only the rows whose label is 'nothate'
df_nothate = df.loc[df['label'].eq('nothate')]
df_nothate.shape

(13953, 2)

In [10]:
# Balance the dataset: randomly downsample the 'hate' rows to the number of
# 'nothate' rows so both classes end up the same size.
# A fixed random_state makes the drawn subset identical on every
# Restart & Run All; without it each run would train on different rows.
df_hate_downsampled = df_hate.sample(n=df_nothate.shape[0], random_state=42)
df_hate_downsampled.shape

(13953, 2)

In [11]:
# Stack the downsampled 'hate' rows on top of all 'nothate' rows. The rows keep
# their original index values because ignore_index is not passed.
df_balanced = pd.concat([df_hate_downsampled, df_nothate])
df_balanced.shape

(27906, 2)

In [12]:
# Confirm both classes now have the same number of rows
df_balanced['label'].value_counts()

hate       13953
nothate    13953
Name: label, dtype: int64

In [13]:
# Encode the label as a binary target column: 1 for 'hate', 0 for anything else

df_balanced['hate'] = df_balanced['label'].eq('hate').astype('int64')
df_balanced.sample(5)

Unnamed: 0,label,text,hate
3600,nothate,Hitler's Army were the true criminals. If you ...,0
24539,nothate,They gave a lot of young people a free pass be...,0
3105,nothate,He is literally the most problematic dick on t...,0
4989,nothate,I do not accept it when others call my colleag...,0
509,nothate,Donald Trump deserves to die,0


In [14]:
# Split into training and test sets, stratified on the binary label so both
# sets keep the same class ratio. test_size is left at scikit-learn's default.
# A fixed random_state makes the split reproducible across kernel restarts,
# so metrics reported further down can be compared between runs.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['text'],
    df_balanced['hate'],
    stratify=df_balanced['hate'],
    random_state=42,
)

In [15]:
# Peek at the first four training texts
X_train.head(4)

857      East Asians can not run fast and can not join ...
17007    my fucking arab neighbours make my life miserable
6127     I get upset when people call me slurs just bec...
683               don't deal with women with heavy tummies
Name: text, dtype: object

In [16]:
# Load the BERT text preprocessor and the BERT encoder as Keras layers from TF Hub.
# bert_preprocess maps raw strings to the dict of input_word_ids / input_mask /
# input_type_ids tensors; bert_encoder is called on that dict in the cells below.

bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")



In [17]:
def get_sentence_embeding(sentences):
    """Return the pooled BERT embedding for a batch of sentences.

    Args:
        sentences: Batch of raw text strings, passed unchanged to
            ``bert_preprocess``.

    Returns:
        The ``'pooled_output'`` entry of the dict returned by ``bert_encoder``.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']

# Embed two example sentences to check preprocessing + encoder end to end
example_sentences = [
    "house party is fun. hurry up",
    "Suhrud, will you be in class tomorrow?",
]
get_sentence_embeding(example_sentences)

<tf.Tensor: shape=(2, 768), dtype=float32, numpy=
array([[-0.9304614 , -0.48000717, -0.85827845, ..., -0.6189298 ,
        -0.75098   ,  0.9464813 ],
       [-0.8292077 , -0.4938137 , -0.975732  , ..., -0.89208204,
        -0.7484012 ,  0.7592745 ]], dtype=float32)>

In [18]:
# Build the classifier with the Keras functional API

# BERT stack: raw strings -> preprocessing -> encoder
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head: dropout on the pooled embedding, then one sigmoid unit
dropout_layer = tf.keras.layers.Dropout(0.1, name="dropout")
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name="output")
l = output_layer(dropout_layer(outputs['pooled_output']))

# Wire the string input to the sigmoid output to obtain the final model
model = tf.keras.Model(inputs=[text_input], outputs=[l])

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [19]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [20]:
# Number of training examples
len(X_train)

20929

In [21]:
# Configure training: Adam optimizer, binary cross-entropy loss for the single
# sigmoid output, and accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer='adam',
)

In [22]:
# Train the model for 15 epochs on the raw training texts.
# No validation data or validation split is passed, so only training
# metrics are reported while fitting.

model.fit(X_train, y_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f5ff1e9e160>

In [23]:
# Evaluate on the held-out test split; the result is [loss, accuracy]
# (binary cross-entropy and the 'accuracy' metric set in model.compile).
model.evaluate(X_test, y_test)



[0.6402515769004822, 0.6127275228500366]

In [31]:
# Inference check: score a few hand-written sentences with the trained model.
# Each prediction is the sigmoid output of the final Dense layer; the training
# target encodes 'hate' as 1, so values closer to 1 lean towards 'hate'.

speech = [
    'Dear friends: i love you all',
    'live life to fullest',
    'politicians are murderers, bloody and dirty minded',
    'your always so convincing.',
    'tell me anything about you.'
]
model.predict(speech)



array([[0.22455612],
       [0.23635142],
       [0.59506506],
       [0.21237172],
       [0.12775895]], dtype=float32)