## Here we are treating it as a Classification problem.

## The final score is derived from the probability predicted by the model (the predictions are converted to ranks before submission).

In [None]:
# Importing libraries

import math
import os
import random
import numpy as np
import pandas as pd
import re
import unidecode
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from imblearn.under_sampling import RandomUnderSampler

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, GRU, Embedding, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Defining constants

# Vocabulary cap: passed to the Keras Tokenizer as `num_words` and used as the
# upper bound on the number of embedding rows.
voc_size = 50000
# `maxlen` given to `pad_sequences` for both the train and the test comments.
max_sequence_length = 200
# Width of one word vector; the embedding matrix is allocated with this many
# columns, so it has to match the vectors stored in EMBEDDING_FILE.
embedding_dim = 300
# Mini-batch size handed to `model.fit`.
Batch_size = 32


# Paths of the previous-competition datasets; only used by the commented-out
# cells further down.
# train_prev_comp_2 = '../input/toxic-comment/jigsaw-unintended-bias-train.csv'
# train_prev_comp = "../input/toxic-comment/jigsaw-toxic-comment-train.csv"
# test_cur_comp = "../input/jigsaw-toxic-severity-rating/comments_to_score.csv"

# Training data and the comments that have to be scored for the submission.
train = pd.read_csv('../input/ruddit-jigsaw-dataset-combined-cleaned/toxic_train.csv')
test = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
# Pre-trained GloVe vectors in plain-text format (one token per line).
EMBEDDING_FILE = "../input/glove840b300dtxt/glove.840B.300d.txt"

def seed_everything(seed=123):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Args:
        seed: Integer seed applied to every generator. Defaults to 123, the
            value this notebook has always used, so existing calls without
            arguments behave exactly as before.
    """
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
    # Reduce the verbosity of TensorFlow's native (C++) logging.
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment cannot change hashing in the already-running kernel; it is
    # only inherited by processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything()

In [None]:
# Function for cleaning comments

def clean_data(data):
    """Normalise raw comments into lowercase, stop-word-free token strings.

    For every comment the function flattens newlines/tabs/backslashes, strips
    HTML markup, removes URLs and bare ``.com`` hosts, transliterates to
    ASCII, lowercases, replaces every non-alphanumeric run with a space,
    tokenises with NLTK and drops English stop words.

    Args:
        data: Iterable of comments. Entries are expected to be strings;
            ``None`` and float NaN are treated as empty text and any other
            non-string value is converted with ``str()``.

    Returns:
        A list with one cleaned, space-joined string per input entry, in the
        same order as ``data``.
    """
    # Loop-invariant work, done once: the stop-word corpus is loaded a single
    # time and stored as a set so each membership test is O(1).
    stoplist = set(stopwords.words("english"))
    url_re = re.compile(r'http\S+')
    dotcom_re = re.compile(r"\ [A-Za-z]*\.com")
    # Inside these classes `$-,` is a character RANGE ('$' through ','), i.e.
    # it also covers % & ' ( ) * + — the patterns are kept exactly as they
    # were so the output is unchanged.
    disallowed_re = re.compile(r"[^a-zA-Z0-9:$-,()%.?!]+")
    punctuation_re = re.compile(r"[:$-,()%.?!]+")

    final = []
    for sent in data:
        if not isinstance(sent, str):
            # Missing values would otherwise fail on `.replace` below.
            is_missing = sent is None or (isinstance(sent, float) and math.isnan(sent))
            sent = "" if is_missing else str(sent)

        # Literal "\n" sequences, real newlines, tabs and stray backslashes
        # become spaces; ". com" is glued back together so the host regex
        # below can match it.
        sent = sent.replace('\\n', ' ').replace('\n', ' ').replace('\t',' ').replace('\\', ' ').replace('. com', '.com')
        soup = BeautifulSoup(sent, "html.parser")
        sent = soup.get_text(separator=" ")
        remove_https = url_re.sub('', sent)
        sent = dotcom_re.sub(" ", remove_https)
        sent = unidecode.unidecode(sent)
        sent = sent.lower()
        sent = disallowed_re.sub(' ', sent)
        sent = punctuation_re.sub(' ', sent)
        tokens = [word for word in word_tokenize(sent) if word not in stoplist]
        final.append(" ".join(tokens))

    return final

In [None]:
# Remove the columns that are not used below. `df` is bound to the same
# DataFrame object as `train` (an alias, not a copy).
train.drop(columns=['txt', 'isOffensive'], inplace=True)
train.head()
df = train

In [None]:
# Display the first five rows of the training frame.
train.head(n=5)

In [None]:
# Reading train file from previous competition

# df1 = pd.read_csv(train_prev_comp)
# df2 = pd.read_csv(train_prev_comp_2)
# df2 = df2[['id', 'comment_text', 'toxic', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack']]
# df2.columns = ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# df = pd.concat([df1, df2])

# df["y"] = (df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].sum(axis=1) > 0).astype(int)
# df.drop(["id","toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"], axis=1, inplace = True)
# df.head()



In [None]:
# Seeing that dataset is imbalanced

# df["y"].value_counts()

In [None]:
# Balancing dataset

# X = np.array(df["processed"].values)
# X = X.reshape(-1,1)
# y = np.array(df["offensiveness_score"].values)
# rus = RandomUnderSampler(random_state=0)
# x, y = rus.fit_resample(X, y)

# x = x.flatten()
# df = pd.DataFrame()
# df["text"] = x
# df["target"] = y


# # Now its balanced

# df["target"].value_counts()

In [None]:
# Creating column clean_text for cleaned comments

#df["processed"] = clean_data(df["text"])

In [None]:
# Defining keras Model with GRU units

class GRU_model(tf.keras.Model):
    """Two-layer bidirectional GRU over pre-trained word embeddings.

    Layer stack: Embedding -> BiGRU(128, full sequence) -> Dropout(0.25)
    -> BiGRU(64, last output only) -> Dropout(0.25) -> Dense(64, relu)
    -> Dropout(0.2) -> Dense(1, sigmoid).

    The constructor reads the module-level `embedding_matrix`,
    `embedding_dim` and `max_sequence_length`, so those must be defined
    before the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        # `embedding_matrix` is allocated with
        # min(voc_size, len(word_index) + 1) rows, which is smaller than
        # `voc_size` whenever the corpus vocabulary is small. Sizing the layer
        # from the matrix itself keeps the initial weights and the layer shape
        # in agreement in both cases.
        vocab_rows = embedding_matrix.shape[0]
        self.Embedding = Embedding(vocab_rows, embedding_dim, weights=[embedding_matrix], input_length = max_sequence_length)
        # First recurrent layer keeps the whole sequence so that the second
        # GRU has per-timestep input.
        self.GRU1 = Bidirectional(GRU(128, return_sequences=True))
        self.Dropout1 = Dropout(0.25)
        # Second recurrent layer collapses the sequence to a single vector.
        self.GRU2 = Bidirectional(GRU(64, return_sequences = False))
        self.Dropout2 = Dropout(0.25)
        self.Dense1 = Dense(64, activation="relu")
        self.Dropout3 = Dropout(0.2)
        # Single sigmoid unit: one value in (0, 1) per comment.
        self.Dense2 = Dense(1, activation="sigmoid")

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Batch of padded token-id sequences, as produced by
                `pad_sequences` in this notebook.

        Returns:
            The output of the final sigmoid layer, one value per sequence.
        """
        x = self.Embedding(inputs)
        x = self.GRU1(x)
        x = self.Dropout1(x)
        x = self.GRU2(x)
        x = self.Dropout2(x)
        x = self.Dense1(x)
        x = self.Dropout3(x)
        x = self.Dense2(x)

        return x

In [None]:
# Early-stopping callback.
# It watches the validation loss (the callback's default `monitor`, spelled
# out here), stops after 5 epochs without improvement and puts back the
# weights of the best epoch seen.

early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    restore_best_weights=True,
)

In [None]:
# Fit the tokenizer on the pre-processed training comments, then turn every
# comment into a fixed-length sequence of token ids.

tokenizer = Tokenizer(num_words=voc_size)
tokenizer.fit_on_texts(df["processed"].values)
X = pad_sequences(
    tokenizer.texts_to_sequences(df["processed"].values),
    maxlen=max_sequence_length,
)

In [None]:
# tok=text.Tokenizer(num_words=voc_size,lower=True)
# tok.fit_on_texts(list(df['processed'])+list(test['text']))
# X_train=tok.texts_to_sequences(X_train)
# X_test=tok.texts_to_sequences(X_test)
# x_train=sequence.pad_sequences(X_train,maxlen=maxlen)
# x_test=sequence.pad_sequences(X_test,maxlen=maxlen)

In [None]:
# Load the GloVe vectors into a dictionary: token -> float32 vector.

embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in tqdm(f):
        # Split from the right at most `embedding_dim` times. The last
        # `embedding_dim` fields are always the coefficients, so a token that
        # itself contains spaces (the 840B GloVe file is known to have some)
        # stays intact in values[0] instead of leaking into the float
        # conversion and raising ValueError.
        values = line.rstrip().rsplit(' ', embedding_dim)
        if len(values) != embedding_dim + 1:
            # Blank, header or truncated line: no usable vector.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Build the embedding matrix: row i holds the pre-trained vector of the token
# whose tokenizer index is i. Rows stay all-zero for index 0 and for tokens
# that have no entry in `embeddings_index`.
word_index = tokenizer.word_index
num_words = min(voc_size, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if i < voc_size:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [None]:
# Build, compile and train the GRU model.
# NOTE(review): binary cross-entropy expects targets in [0, 1] — confirm that
# `offensiveness_score` is scaled to that range in this dataset.
model = GRU_model()
model.compile(
    optimizer="Adam",
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["mse"],
)

# NOTE(review): `validation_split` holds out the trailing 20% of the rows as
# given; verify the frame is not ordered in a way that skews validation.
model.fit(
    x=X,
    y=df["offensiveness_score"],
    batch_size=Batch_size,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
# Reading given test dataset 

#test = pd.read_csv(test_cur_comp)

# Clean the comments to score (overwriting the raw text in place), encode them
# with the tokenizer fitted on the training data and run the model on them.
test["text"] = clean_data(test["text"])
x_test = pad_sequences(
    tokenizer.texts_to_sequences(test["text"].values),
    maxlen=max_sequence_length,
)

pred = model.predict(x_test)

In [None]:
# Making submission file

# One row per test comment: its id and the model output.
final = pd.DataFrame({"comment_id": test["comment_id"]})
final["score"] = pred

# Replace the raw predictions by their rank; method='first' breaks ties by
# order of appearance, so every comment receives a distinct score.
final["score"] = final["score"].rank(method='first')

print(df.shape)
print()

final.to_csv("submission.csv", index=False)

In [None]:
# Display the first five rows of the submission frame.
final.head(n=5)