In [1]:
#loading required libraries
#basics
import pandas as pd 
import numpy as np

#misc
import gc
import time
import warnings

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec 
import seaborn as sns

#nlp
import string
import re    #for regex
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
# TweetTokenizer does not split at apostrophes, which is what we want
from nltk.tokenize import TweetTokenizer

#FeatureEngineering
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# English stopword set from the NLTK corpus downloaded above.
eng_stopwords = set(stopwords.words("english"))

# Silence every warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

# Shared NLP helpers. Note that `tokenizer` is rebound to a Keras Tokenizer
# in a later cell, so this TweetTokenizer instance is only live until then.
tokenizer = TweetTokenizer()
lem = WordNetLemmatizer()

%matplotlib inline

[nltk_data] Downloading package stopwords to /home/nilesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from tqdm import tqdm
from gensim.models import KeyedVectors

In [3]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.optimizers import Adam
from gensim.models import KeyedVectors


Using TensorFlow backend.


In [4]:
# Path of the saved embeddings file; passed to load_embeddings, which hands it
# to KeyedVectors.load.
EMB_PATH = "crawl-300d-2M.gensim"

In [5]:
!ls

crawl-300d-2M.gensim
crawl-300d-2M.gensim.vectors.npy
crawl-300d-2M.gensim.vectors.npy.zip
crawl-300d-2M.gensim.zip
data
jigsaw-unintended-bias-in-toxicity-classification.zip
jigsaw-unintended-bias.ipynb


In [6]:
#!unzip jigsaw-unintended-bias-in-toxicity-classification.zip -d data/

In [7]:
#!unzip crawl-300d-2M.gensim.vectors.npy.zip
#!unzip crawl-300d-2M.gensim.zip 

In [8]:
# Read the train/test comments and the sample submission into DataFrames.
train, test, sample_submission = map(
    pd.read_csv,
    ("data/train.csv", "data/test.csv", "data/sample_submission.csv"),
)

In [9]:
# --- Training schedule -------------------------------------------------------
NUM_MODELS = 1
EPOCHS = 4
BATCH_SIZE = 512  # NOTE: reassigned to 128 in the training cell further down

# --- Network dimensions ------------------------------------------------------
LSTM_UNITS = 60  # 128 was noted as an alternative value
# Width of the dense layer in build_model; tied to LSTM_UNITS.
DENSE_HIDDEN_UNITS = LSTM_UNITS * 4
EMBED_SIZE = 300  # output dimension of the Embedding layer

# --- Text preprocessing ------------------------------------------------------
MAX_LEN = 220  # sequence length handed to pad_sequences and the model input
MAX_FEATURES = 30000

# --- Identity columns of the training data -----------------------------------
IDENTITY_COLUMNS = [
    'male',
    'female',
    'homosexual_gay_or_lesbian',
    'christian',
    'jewish',
    'muslim',
    'black',
    'white',
    'psychiatric_or_mental_illness',
]

In [10]:
# Label columns read into y_aux_train (the main target plus its sub-types).
AUX_COLUMNS = ['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']
# Column holding the raw comment text.
TEXT_COLUMN = 'comment_text'
# Column holding the label read into y_train.
TARGET_COLUMN = 'target'
# Characters passed as `filters` to the Keras Tokenizer.
# The original literal contained the invalid escape sequence "\×", which only
# contributed a second backslash (already present via "\\") and triggers a
# SyntaxWarning on recent Python versions. The set of filtered characters is
# unchanged by removing it.
CHARS_TO_REMOVE = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n“”’\'∞θ÷α•à−β∅³π‘₹´°£€×™√²—'

In [11]:
# Raw comment text for both splits, coerced to str.
x_train, x_test = (frame[TEXT_COLUMN].astype(str) for frame in (train, test))

# Labels as NumPy arrays: the main target and the auxiliary label columns.
y_aux_train = train[AUX_COLUMNS].values
y_train = train[TARGET_COLUMN].values

In [None]:
# Keras tokenizer: keeps the original casing and filters the characters listed
# in CHARS_TO_REMOVE.
tokenizer = text.Tokenizer(lower=False, filters=CHARS_TO_REMOVE)
# Build the word vocabulary over the train and test comments together.
tokenizer.fit_on_texts([*x_train, *x_test])

In [None]:
# Dictionary mapping each word of the fitted vocabulary to its integer index.
word_index = tokenizer.word_index

In [None]:
# Replace each comment string by its list of vocabulary indices.
# NOTE: this rebinds x_train / x_test, so the cell is not safe to run twice.
x_train, x_test = (
    tokenizer.texts_to_sequences(list(texts)) for texts in (x_train, x_test)
)

In [None]:
#pad sequences
# Bring every index sequence to exactly MAX_LEN entries.
x_train, x_test = (
    sequence.pad_sequences(seqs, maxlen=MAX_LEN) for seqs in (x_train, x_test)
)

In [None]:
def load_embeddings(embed_dir=EMB_PATH):
    """Load saved gensim KeyedVectors from ``embed_dir``.

    The file is opened with ``mmap='r'`` (read-only memory mapping).

    Args:
        embed_dir: path of the saved KeyedVectors file; defaults to EMB_PATH.

    Returns:
        The object returned by ``KeyedVectors.load``.
    """
    return KeyedVectors.load(embed_dir, mmap='r')

In [None]:
# Load the embeddings from EMB_PATH (read-only memory-mapped, see load_embeddings).
embeddings_index = load_embeddings(EMB_PATH)

In [None]:
# Display the number of entries in the tokenizer vocabulary.
len(word_index)

In [None]:
def build_matrix(word_index=word_index):
    """Build the embedding weight matrix for the tokenizer vocabulary.

    Row ``i`` holds the vector of the word whose index is ``i``. Words missing
    from the module-level ``embeddings_index`` receive the vector of the
    literal token ``"unknown"``. Row 0 stays all-zero because no word in
    ``word_index`` is assigned to it here.

    Args:
        word_index: mapping of word -> integer row index. The default is bound
            to the module-level ``word_index`` at definition time.

    Returns:
        A NumPy array with ``len(word_index) + 1`` rows and one column per
        embedding dimension.

    Raises:
        KeyError: if ``"unknown"`` itself is absent from ``embeddings_index``.
    """
    # Resolve the fallback vector once instead of on every miss, and take the
    # matrix width from it rather than hard-coding 300.
    unknown_vector = embeddings_index["unknown"]
    embedding_matrix = np.zeros((len(word_index) + 1, len(unknown_vector)))
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embeddings_index[word]
        except KeyError:
            # Only a vocabulary miss triggers the fallback; any other error
            # (e.g. a shape mismatch or KeyboardInterrupt) now propagates.
            embedding_matrix[i] = unknown_vector
    return embedding_matrix

In [None]:
# Embedding weight matrix aligned with the tokenizer's word indices.
embedding_matrix = build_matrix(word_index)

In [None]:
# The matrix has been built, so drop the reference to the loaded embeddings
# and trigger a garbage collection. build_matrix reads embeddings_index, so it
# cannot be called again after this cell without reloading the embeddings.
del embeddings_index
gc.collect()

In [None]:
def build_model(embedding_matrix=embedding_matrix, verbose=0, compile_model=True):
    """Build the bidirectional-LSTM classifier.

    Architecture: frozen Embedding -> SpatialDropout1D(0.2) ->
    Bidirectional(CuDNNLSTM) -> concatenated global average/max pooling ->
    Dense(relu) added back onto the pooled features -> Dense(1, sigmoid).

    Args:
        embedding_matrix: 2-D array of pretrained embedding weights. Its shape
            sets the Embedding layer's input and output dimensions. The default
            is bound to the module-level ``embedding_matrix`` at definition
            time.
        verbose: when truthy, print ``model.summary()``.
        compile_model: when truthy (the default), compile the model with
            binary cross-entropy, Adam(0.005) and accuracy. The original code
            tested the builtin ``compile``, which is always truthy, so the
            default preserves the previous behaviour.

    Returns:
        The Keras ``Model``.
    """
    # Take the dimensions from the matrix that was actually passed in, so the
    # layer always agrees with its weights.
    vocab_size, embed_dim = embedding_matrix.shape

    # Placeholder for a batch of padded index sequences.
    input_sequences = Input(shape=(MAX_LEN,), dtype='int32')
    x = Embedding(vocab_size, embed_dim, weights=[embedding_matrix], trainable=False)(input_sequences)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    avg_pool1 = GlobalAveragePooling1D()(x)
    max_pool1 = GlobalMaxPooling1D()(x)
    pooled = concatenate([avg_pool1, max_pool1])

    # Residual connection: `add` needs both inputs to share the same width,
    # i.e. DENSE_HIDDEN_UNITS must equal the width of `pooled` (two pooled
    # copies of the bidirectional output, 4 * LSTM_UNITS).
    dense = Dense(DENSE_HIDDEN_UNITS, activation='relu')(pooled)
    x = add([pooled, dense])

    preds = Dense(1, activation='sigmoid')(x)

    model = Model(input_sequences, preds)
    if verbose:
        model.summary()
    if compile_model:
        model.compile(loss='binary_crossentropy', optimizer=Adam(0.005), metrics=['acc'])
    return model

In [None]:
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K
import numpy as np

# NOTE: BATCH_SIZE here overrides the value set in the configuration cell.
BATCH_SIZE = 128
NUM_EPOCHS = 10
# Single source of truth for the fold count. The original built 2 splits but
# indexed folds 0..4 and divided by 5, which raised IndexError at fold 2.
NUM_FOLDS = 5

splits = list(KFold(n_splits=NUM_FOLDS).split(x_train, y_train))

# Out-of-fold predictions for the training rows and the fold-averaged
# predictions for the test rows.
oof_preds = np.zeros((x_train.shape[0]))
test_preds = np.zeros((x_test.shape[0]))
for fold, (tr_ind, val_ind) in enumerate(splits):
    # Reset the Keras/TensorFlow session so each fold starts from a fresh graph.
    K.clear_session()
    print(f'Fold {fold}: {len(tr_ind)} train rows, {len(val_ind)} validation rows')

    ckpt_path = f'gru_{fold}.hdf5'
    # save_best_only keeps only the checkpoint with the best monitored
    # quantity (val_loss by default in Keras).
    ckpt = ModelCheckpoint(ckpt_path, save_best_only=True)
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
    model = build_model()
    # NOTE(review): labels are binarised with a strict `> 0.5`; confirm whether
    # the task's positive class is meant to include exactly 0.5.
    model.fit(x_train[tr_ind],
        y_train[tr_ind] > 0.5,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHS,
        validation_data=(x_train[val_ind], y_train[val_ind] > 0.5),
        callbacks=[es, ckpt])

    # Early stopping leaves the model at its last epoch (up to `patience`
    # epochs past the best one), so restore the best checkpoint before
    # predicting.
    model.load_weights(ckpt_path)

    # Each validation index appears in exactly one fold, so plain assignment
    # is equivalent to accumulating into the zero-initialised array.
    oof_preds[val_ind] = model.predict(x_train[val_ind])[:, 0]
    test_preds += model.predict(x_test)[:, 0]

# Average the test predictions over the folds that were actually trained.
test_preds /= len(splits)