In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [3]:
train_df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip', encoding="ISO-8859-1")
train_df.head()


In [4]:
X_test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip', encoding="ISO-8859-1")
X_test.head()

In [5]:
y_test = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')
y_test.head()

# sample text to visualize

In [6]:
train_df.sample(1)['comment_text'].values[0]

In [7]:
#remove hyperlinks
#remove contractions
#remove punctuation
#lemmatization

In [8]:
import nltk

In [9]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('english')

In [10]:
!pip install contractions

In [11]:
import contractions
import string

In [12]:
def remove_contractions(sent):
    # creating an empty list
    expanded_words = []   
    for word in sent.split(" "):
      # using contractions.fix to expand the shortened words
        expanded_words.append(contractions.fix(word)) 

    return ' '.join(expanded_words)


def to_lowercase(text):
    return text.lower()

# Remove website links
def remove_links(text):
    template = re.compile(r'https?://\S+|www\.\S+') 
    text = template.sub(r'', text)
    return text

# Remove HTML tags
def remove_html(text):
    template = re.compile(r'<[^>]*>') 
    text = template.sub(r'', text)
    return text


# Remove stopwords
def remove_stopwords(words, stop_words):
    return [word for word in words if word not in stop_words]

# Remove none ascii characters
def remove_non_ascii(text):
    template = re.compile(r'[^\x00-\x7E]+') 
    text = template.sub(r'', text)
    return text

# Replace none printable characters
def remove_non_printable(text):
    template = re.compile(r'[\x00-\x0F]+') 
    text = template.sub(r' ', text)
    return text

# Remove special characters
def remove_special_chars(text):
        text = re.sub("'s", '', text)
        template = re.compile('["#$%&\'()\*\+-/:;<=>@\[\]\\\\^_`{|}~]') 
        text = template.sub(r' ', text)
        return text

# Replace multiple punctuation 
def replace_multiplt_punc(text):
        text = re.sub('[.!?]{2,}', '.', text)
        text = re.sub(',+', ',', text) 
        return text

    # Remove numbers
def remove_numbers(text):
        text = re.sub('\d+', ' ', text)
        return text

def handle_spaces(text):
    # Remove extra spaces
    text = re.sub('\s+', ' ', text)
    
    # Remove spaces at the beginning and at the end of string
    text = text.strip() 
    
    return text

def remove_punctuation(text):
    """Remove punctuation from list of tokenized words"""
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# def stem_words(words):
#     """Stem words in text"""
#     stemmer = PorterStemmer()
#     return [stemmer.stem(word) for word in words]

def text2words(text):
      return word_tokenize(text)
    
def lemmatize_words(words):
    """Lemmatize words in text"""

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]

def lemmatize_verbs(words):
    """Lemmatize verbs in text"""

    lemmatizer = WordNetLemmatizer()
    return ([lemmatizer.lemmatize(word, pos='v') for word in words])

def remove_pattern(text): 
    # remove hi moron 
    text= re.sub(r'(hi)(.*)\1', r'\1', text)
    # remove duplicate words
    text= re.sub(r"\b(\w+)(?:\W+\1\b)+",r'\1', text,flags=re.IGNORECASE)
    # remove [User:Cirt]] 
    text= re.sub(r"\[.*?\]", ' ', text)
    # remove \n\n
    text= re.sub(r"\n", ' ', text)
    return text

def clean_text( text):
    text = remove_contractions(text)
    text = remove_pattern(text)
    text = remove_links(text)
    text = remove_html(text)
    text = remove_special_chars(text)
    text = remove_non_ascii(text)
    text = remove_non_printable(text)
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = to_lowercase(text)
    text = handle_spaces(text)
    words = text2words(text)
    words = remove_stopwords(words, stop_words)
    #words = stem_words(words) #either stem or lemmatize
    words = lemmatize_words(words)
    words = lemmatize_verbs(words)

    return ' '.join(words)

In [13]:
train_df['comment_text'] = train_df['comment_text'].apply(lambda x: clean_text(x))

In [14]:
train_df.head()

In [16]:
X_test['comment_text'] = X_test['comment_text'].apply(lambda x: clean_text(x))

In [17]:
# import string
# string.punctuation

In [18]:
# def clean_text(text):
#     cleaned_text = text.translate(str.maketrans('', '', string.punctuation))
#     return cleaned_text

In [19]:
# train_df['comment_text'] = train_df['comment_text'].apply(clean_text)
# X_test['comment_text'] = X_test['comment_text'].apply(clean_text)

In [20]:
# import re
# train_df['comment_text'] = train_df['comment_text'].apply(lambda s: re.sub(r'[0-9]+', '', s) )
# train_df['comment_text'] = train_df['comment_text'].apply(lambda s: re.sub(r'[0-9]+', '', s) )

In [21]:
from keras.preprocessing.text import Tokenizer

# We create a tokenizer, configured to only take
# into account the top-1000 most common words
tokenizer = Tokenizer(num_words=1000, oov_token='UNK')
# This builds the word index
tokenizer.fit_on_texts(train_df['comment_text'])

# This turns strings into lists of integer indices.
sequences = tokenizer.texts_to_sequences(train_df['comment_text'])

# You could also directly get the one-hot binary representations.
# Note that other vectorization modes than one-hot encoding are supported!
one_hot_results = tokenizer.texts_to_matrix(train_df['comment_text'], mode='binary')
#tfidf = tokenizer.texts_to_matrix(train_df['comment_text'], mode='tfidf')
# This is how you can recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



In [22]:
one_hot_results.shape

In [23]:
# This turns strings into lists of integer indices.
sequences_test = tokenizer.texts_to_sequences(X_test['comment_text'])

# You could also directly get the one-hot binary representations.
# Note that other vectorization modes than one-hot encoding are supported!
one_hot_results_test = tokenizer.texts_to_matrix(X_test['comment_text'], mode='binary')



In [24]:
from tensorflow.keras import models
from tensorflow.keras import layers

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(1000,)))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(6, activation='sigmoid'))


In [25]:
from tensorflow.keras import optimizers

model.compile(optimizer=optimizers.RMSprop(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [26]:
history = model.fit(one_hot_results,
                    train_df[['toxic' ,'severe_toxic' ,'obscene' ,'threat' ,'insult' ,'identity_hate']],
                    epochs=4,
                    batch_size=256,
                    validation_split=0.2)



In [27]:
history_dict = history.history
history_dict.keys()

In [28]:
import matplotlib.pyplot as plt

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()



In [29]:
plt.clf()   # clear figure
acc_values = history_dict['accuracy']
val_acc_values = history_dict['accuracy']

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [31]:
submit = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')
y_pred = model.predict(one_hot_results_test)
y_pred = pd.DataFrame(y_pred)#.applymap(lambda x: 1 if x>0.5 else 0)
y_pred.columns = ['toxic' ,'severe_toxic' ,'obscene' ,'threat' ,'insult' ,'identity_hate']
submit[['toxic' ,'severe_toxic' ,'obscene' ,'threat' ,'insult' ,'identity_hate']] = y_pred
submit.to_csv('submit_file.csv', index=None)
#Score: 0.91

# LSTM

In [32]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

In [33]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
lengths = [len(sequence) for sequence in sequences]
max_length = max(lengths)
sequences_pad = pad_sequences(sequences, maxlen=50)

In [34]:
mean_lengths = np.mean(lengths)
mean_lengths

In [35]:
import matplotlib.pyplot as plt
plt.hist(lengths, bins=100);
plt.xlim([min(lengths), max(lengths)-1000]);

In [36]:
seq_length = sequences_pad.shape[1]
seq_length

In [37]:
max_length = 40

In [38]:
model = models.Sequential()
model.add(layers.Embedding(vocab_size, 50, input_length=max_length))
model.add(layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(layers.LSTM(64,dropout=0.2, recurrent_dropout=0.2,))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(6, activation='sigmoid'))

In [39]:
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.01),
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(sequences_pad,
                    train_df[['toxic' ,'severe_toxic' ,'obscene' ,'threat' ,'insult' ,'identity_hate']],
                    epochs=4,
                    batch_size=256,
                    validation_split=0.2)


In [40]:
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [41]:
plt.clf()   # clear figure
acc_values = history_dict['accuracy']
val_acc_values = history_dict['accuracy']

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [42]:
submit = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')
submit.head()

In [43]:
sequences_test_pad =  pad_sequences(sequences_test, maxlen=max_length)

In [45]:
y_pred = model.predict(sequences_test_pad)

In [49]:
y_pred

In [50]:
y_pred = pd.DataFrame(y_pred)#.applymap(lambda x: 1 if x>0.5 else 0)
y_pred.columns = ['toxic' ,'severe_toxic' ,'obscene' ,'threat' ,'insult' ,'identity_hate']

In [51]:
submit[['toxic' ,'severe_toxic' ,'obscene' ,'threat' ,'insult' ,'identity_hate']] = y_pred

In [48]:
submit.to_csv('submit_lstm.csv', index=None)#Score: 0.65039

# Bidirectional GRU

In [52]:
model = models.Sequential()
model.add(layers.Embedding(vocab_size, 100, input_length=seq_length))
model.add(layers.Bidirectional(layers.GRU(256, return_sequences=True)))
model.add(layers.Bidirectional(layers.GRU(128)))
model.add(layers.Dropout(0.4))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(6, activation='sigmoid'))

In [53]:
model.compile(optimizer=optimizers.RMSprop(learning_rate=0.01),
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(sequences_pad,
                    train_df[['toxic' ,'severe_toxic' ,'obscene' ,'threat' ,'insult' ,'identity_hate']],
                    epochs=5,
                    batch_size=256,
                    validation_split=0.2)


In [54]:
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [55]:
plt.clf()   # clear figure
acc_values = history_dict['accuracy']
val_acc_values = history_dict['accuracy']

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()


In [56]:
sequences_test_pad =  pad_sequences(sequences_test, maxlen=max_length)
y_pred = model.predict(sequences_test_pad)
y_pred = pd.DataFrame(y_pred)#.applymap(lambda x: 1 if x>0.5 else 0)
y_pred.columns = ['toxic' ,'severe_toxic' ,'obscene' ,'threat' ,'insult' ,'identity_hate']
submit[['toxic' ,'severe_toxic' ,'obscene' ,'threat' ,'insult' ,'identity_hate']] = y_pred
submit.to_csv('submit_gru.csv', index=None)

In [57]:
y_pred