### Universal Sentence Encoder ###

----- Tested on Google Colab: place the cleaned data files in the 'Files' section.

Note: this notebook is version-specific; it relies on TensorFlow v1 behavior (enabled through `tensorflow.compat.v1`). -----

In [None]:
import tensorflow.compat.v1 as tfv1

"""tf v1 compatible settings"""
# Switch TensorFlow to v1-style execution up front: the rest of the notebook
# uses hub.Module and explicit tfv1.Session blocks.
tfv1.disable_eager_execution()
tfv1.disable_v2_behavior()



import tensorflow as tf
import tensorflow_hub as hub
# TF-Hub handle of the Universal Sentence Encoder (large, version 3).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
# Instantiated once at notebook level; UniversalEmbedding() calls this object.
module = hub.Module(module_url)

- Read the train and test data

- Split the train data into train and validation sets in a 90%-10% ratio

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation split is identical on every
# Restart & Run All; an unseeded train_test_split shuffles differently each run.
SPLIT_SEED = 42

# Read the train and test CSV files from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Hold out 10% of the training rows for validation; `train` is rebound to the
# remaining 90%.
train, val = train_test_split(train, test_size=0.1, random_state=SPLIT_SEED)

- Prepare the lower-cased comment text and label arrays, and define the Universal Sentence Encoder embedding function applied to the sequences

In [None]:
def get_sentence_embedding(train, val, test):
    """Extract lower-cased comment text and label arrays from the data frames.

    Despite its name, this function does not compute embeddings itself; it only
    prepares the text and label inputs.

    Args:
        train: Frame with a 'comment_text' column and the six label columns.
        val: Frame with a 'comment_text' column and the six label columns.
        test: Frame with a 'comment_text' column.

    Returns:
        Tuple ``(train_x, val_x, test_x, train_y, val_y)``: the three text
        columns lower-cased, followed by the label values of ``train`` and
        ``val`` in the column order listed below.
    """
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # Text inputs first (train, val, test), then label matrices (train, val).
    lowered_text = [frame['comment_text'].str.lower() for frame in (train, val, test)]
    label_values = [frame[label_columns].values for frame in (train, val)]

    return (*lowered_text, *label_values)
    


def UniversalEmbedding(x):
    """Embed a batch of sentences with the Universal Sentence Encoder hub module.

    Args:
        x: Tensor of sentences; it is cast to ``tf.string`` before being passed
            to the module.

    Returns:
        The ``"default"`` entry of the hub module's output dictionary.
    """
    # Flatten to rank 1 with reshape instead of tf.squeeze: a bare squeeze
    # removes *every* size-1 dimension, so a batch holding a single sentence
    # would collapse to a scalar rather than stay a length-1 vector.
    sentences = tf.reshape(tf.cast(x, tf.string), [-1])
    return module(sentences, signature="default", as_dict=True)["default"]


In [None]:
# Despite the helper's name, this yields lower-cased comment text (the *_x
# values) and label arrays (the *_y values), not embeddings.
train_x, val_x, test_x, train_y, val_y = get_sentence_embedding(train, val, test)  

#### Model for Universal Sentence Encoder ####

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Lambda, Dense
from keras import backend as K

In [None]:
MAX_LEN = 100  # NOTE(review): not referenced by any code visible in this notebook - confirm before removing
MAX_WORDS = 50000  # NOTE(review): likewise unreferenced in the visible cells
EMBED_SIZE = 512  # USE embedding size; passed as the Lambda layer's output_shape in the USE model

class USE(Model):
    """Classifier that feeds Universal Sentence Encoder embeddings to a sigmoid layer.

    The model has two stages: a Lambda layer wrapping ``UniversalEmbedding``
    and a Dense layer with six sigmoid units.
    """

    def __init__(self):
        super().__init__()

        # Sentence tensor -> embedding of width EMBED_SIZE.
        self.embedding = Lambda(UniversalEmbedding, output_shape=(EMBED_SIZE,))
        # Six sigmoid outputs.
        self.l2 = Dense(6, activation='sigmoid')

    def call(self, inp):
        """Embed ``inp`` and return the six sigmoid activations."""
        return self.l2(self.embedding(inp))


- Start session and fit model
- Optimizer: Adam, Criterion: BCE

In [None]:
BATCH_SIZE = 128

model = USE()
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

with tfv1.Session() as session:
    # Hand this session to the Keras backend before running anything in it.
    K.set_session(session)
    # Run the variable and table initializers in this session before fitting.
    session.run(tfv1.global_variables_initializer())
    session.run(tfv1.tables_initializer())
    # The text Series are converted to NumPy arrays before being passed to Keras.
    history = model.fit(np.array(train_x), 
            train_y,
            validation_data=(np.array(val_x), val_y),
            epochs=1, 
            batch_size=BATCH_SIZE)
    # Save while the session is still open; the prediction cell runs in a
    # separate session.
    model.save_weights('./model.h5')

- Make predictions in session
- Match format of sample_submission

In [None]:
BATCH_SIZE = 1024
with tfv1.Session() as session:
    K.set_session(session)
    session.run(tfv1.global_variables_initializer())
    session.run(tfv1.tables_initializer())
    # This is a new session and the initializer above resets the model's
    # variables, so restore the weights saved after training; otherwise the
    # predictions would come from a freshly initialized classification layer.
    model.load_weights('./model.h5')
    # np.array(...) mirrors how the text inputs are passed to model.fit.
    predictions = model.predict(np.array(test_x), batch_size=BATCH_SIZE, verbose=1)
    # NOTE(review): the sample submission is read from 'input/' while the
    # train/test CSVs are read from the working directory - confirm this path.
    submission = pd.read_csv('input/sample_submission.csv')
    # Fill the six label columns with the predicted probabilities, then write
    # the file (these were fused into one invalid statement before).
    submission[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']] = predictions
    submission.to_csv('submission.csv', index=False)