In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Alternative (older) input file, kept for reference:
#train = pd.read_csv('../input/processed-data/cleaned_traindata.csv')
train = pd.read_csv('../input/newdata/somenew_data.csv')

# Fixed seed: without it every run draws a different 80/20 split, so the
# validation metrics reported at the end of the notebook are not reproducible.
SPLIT_SEED = 42
xytrain, xyval = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)

# String-cast copies of both parts; str() turns a missing value into the
# literal text 'nan', so every question is a str for the tokenizer.
XYTrain = xytrain.applymap(str)
XYValid = xyval.applymap(str)
X_Train = XYTrain['question_text'].values
X_Valid = XYValid['question_text'].values

# Labels are read from the un-stringified frames: model.fit and the metric
# functions need numbers, not the text produced by applymap(str).
Y_Train = xytrain['target'].values.astype('float32')
Y_Valid = xyval['target'].values.astype('float32')

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Text-encoding hyper-parameters read by the cells below:
#   num_words     - vocabulary cap handed to the Tokenizer
#   seq_length    - maxlen handed to pad_sequences
#   embedding_dim - width of one embedding vector
num_words, seq_length, embedding_dim = 60000, 75, 300

# The vocabulary is fitted on the training questions only; the validation
# questions are encoded with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_Train.tolist())
token_train, token_valid = (
    tokenizer.texts_to_sequences(texts) for texts in (X_Train, X_Valid)
)

# Bring every encoded question to a common length of seq_length.
xtrain, xvalid = (
    pad_sequences(encoded, maxlen=seq_length) for encoded in (token_train, token_valid)
)

In [None]:
# Paragram vectors: token -> float32 array with embedding_dim entries.
embeddings = dict()
with open('../input/quora-insincere-questions-classification/embeddings/paragram_300_sl999/paragram_300_sl999.txt', 'r', errors = 'ignore', encoding='utf8') as vocab:
    for line in vocab:
        # Only trailing whitespace is removed: strip() would also delete a
        # leading token that is itself a whitespace character. Splitting from
        # the right with a split limit keeps exactly embedding_dim trailing
        # fields as coefficients and everything before them as the token.
        indices = line.rstrip().rsplit(' ', embedding_dim)
        # Header lines and lines truncated by errors='ignore' do not carry a
        # full vector; storing them would fail later when the vector is
        # copied into a row of the embedding matrix.
        if len(indices) != embedding_dim + 1:
            continue
        #first column
        word = indices[0]
        coefs = np.asarray(indices[1:], dtype='float32')
        embeddings[word] = coefs

In [None]:
# GloVe vectors: token -> float32 array with embedding_dim entries.
GLembeddings = dict()
with open('../input/quora-insincere-questions-classification/embeddings/glove.840B.300d/glove.840B.300d.txt', 'r', errors = 'ignore', encoding='utf8') as vocab:
    for line in vocab:
        # Only trailing whitespace is removed: strip() would also delete a
        # leading token that is itself a whitespace character. Splitting from
        # the right with a split limit keeps exactly embedding_dim trailing
        # fields as coefficients and everything before them as the token.
        indices = line.rstrip().rsplit(' ', embedding_dim)
        # Lines without a full vector (header, truncated by errors='ignore')
        # are skipped; storing them would fail later when the vector is
        # copied into a row of the embedding matrix.
        if len(indices) != embedding_dim + 1:
            continue
        #first column
        word = indices[0]
        coefs = np.asarray(indices[1:], dtype='float32')
        GLembeddings[word] = coefs

In [None]:
word_index = tokenizer.word_index

# Row count of the lookup tables. Tokenizer indices start at 1, so sizing the
# table with min(num_words, len(word_index)) left it one row short whenever
# the vocabulary was smaller than num_words (IndexError on the last word).
# The Embedding layers in the model builders are declared with
# input_dim=num_words, so the table always gets exactly num_words rows.
words = num_words

# GloVe lookup table; rows for words without a GloVe vector stay all-zero.
embedding_matrix = np.zeros((words, embedding_dim))
for word, i in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the table.
    if i >= words:
        continue
    embedding_vector = GLembeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Second lookup table, filled from the `embeddings` dict (paragram vectors).
# Rows for words missing from that dict stay all-zero.
embedding_matrix_1 = np.zeros((words, embedding_dim))
for token, row in word_index.items():
    # Only indices below the vocabulary cap are written.
    if row <= num_words - 1:
        vector = embeddings.get(token)
        if vector is not None:
            embedding_matrix_1[row] = vector

Model v0: Embedding + bidirectional LSTM + concatenated global max and average pooling

In [None]:
from keras.layers import *
from keras.models import Model
from keras.optimizers import *

def get_model_v0(lr=0.005):
    """Build and compile the single-embedding bidirectional-LSTM classifier.

    Layers, in order: frozen Embedding initialised from ``embedding_matrix``,
    SpatialDropout1D(0.2), Bidirectional CuDNNLSTM(80) returning sequences,
    global max pooling and global average pooling (concatenated),
    Dense(64, relu), Dense(1, sigmoid).

    Args:
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, accuracy
        metric). The layer summary is printed as a side effect.
    """
    inputs = Input(shape=(seq_length,))
    x = Embedding(num_words, embedding_dim, weights = [embedding_matrix], input_length = seq_length, trainable=False)(inputs)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNLSTM(80, return_sequences=True))(x)
    # Two pooled views of the same LSTM output sequence, joined into one vector.
    max_pool = GlobalMaxPooling1D()(x)
    avg_pool = GlobalAveragePooling1D()(x)
    conc = concatenate([max_pool, avg_pool])
    outputs = Dense(64, activation="relu")(conc)
    outputs = Dense(1, activation='sigmoid')(outputs)
    model = Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(lr=lr)
    model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

In [None]:
from keras.layers import *
from keras.models import Model
from keras.optimizers import *

def get_model_v2(lr=0.005):
    """Build and compile the dual-embedding bidirectional-LSTM classifier.

    Layers, in order: two frozen Embedding layers on the same input
    (initialised from ``embedding_matrix`` and ``embedding_matrix_1``) that
    are concatenated, SpatialDropout1D(0.2), Bidirectional CuDNNLSTM(80)
    returning sequences, global max pooling, Dense(64, elu),
    Dense(1, sigmoid).

    Args:
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, accuracy
        metric). The layer summary is printed as a side effect.
    """
    inputs = Input(shape=(seq_length,))
    # Both lookups read the same token ids; their vectors are joined per token.
    a = Embedding(num_words, embedding_dim, weights = [embedding_matrix], input_length = seq_length, trainable=False)(inputs)
    b = Embedding(num_words, embedding_dim, weights = [embedding_matrix_1], input_length = seq_length, trainable = False)(inputs)
    x = concatenate([a,b])
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(CuDNNLSTM(80, return_sequences=True))(x)
    max_pool = GlobalMaxPooling1D()(x)
    outputs = Dense(64, activation="elu")(max_pool)
    outputs = Dense(1, activation='sigmoid')(outputs)
    model = Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(lr=lr)
    model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

In [None]:
from keras.layers import *
from keras.models import Model
from keras.optimizers import *

def get_model_v1(lr=0.005):
    """Build and compile the dual-embedding multi-kernel CNN classifier.

    Layers, in order: two frozen Embedding layers on the same input
    (initialised from ``embedding_matrix`` and ``embedding_matrix_1``) that
    are concatenated, Dropout(0.2), four parallel Conv1D(128) branches with
    kernel sizes 5, 4, 3 and 2 (relu, "same" padding) that are concatenated,
    global max pooling and global average pooling (concatenated),
    Dense(64, elu), Dense(1, sigmoid).

    Args:
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, accuracy
        metric). The layer summary is printed as a side effect.
    """
    inputs = Input(shape=(seq_length,))
    # Both lookups read the same token ids; their vectors are joined per token.
    a = Embedding(num_words, embedding_dim, weights = [embedding_matrix], input_length = seq_length, trainable=False)(inputs)
    b = Embedding(num_words, embedding_dim, weights = [embedding_matrix_1], input_length = seq_length, trainable = False)(inputs)
    x = concatenate([a,b])
    x = Dropout(0.2)(x)
    # Every branch convolves the same tensor; "same" padding keeps their
    # sequence lengths equal so the branch outputs can be concatenated.
    conv_1 = Conv1D(128, 5, activation = "relu", padding = "same")(x)
    conv_2 = Conv1D(128, 4, activation = "relu", padding = "same")(x)
    conv_3 = Conv1D(128, 3, activation = "relu", padding = "same")(x)
    conv_4 = Conv1D(128, 2, activation = "relu", padding = "same")(x)
    z = concatenate([conv_1, conv_2, conv_3, conv_4])
    max_pool = GlobalMaxPooling1D()(z)
    avg_pool = GlobalAveragePooling1D()(z)
    y = concatenate([max_pool, avg_pool])
    outputs = Dense(64, activation="elu")(y)
    outputs = Dense(1, activation='sigmoid')(outputs)
    model = Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(lr=lr)
    model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

In [None]:
def get_model_v3(lr=0.005):
    """Build and compile the dual-embedding stacked-LSTM classifier.

    Layers, in order: two frozen Embedding layers on the same input
    (initialised from ``embedding_matrix`` and ``embedding_matrix_1``) that
    are concatenated, SpatialDropout1D(0.2), CuDNNLSTM(128) then
    CuDNNLSTM(64), both returning sequences, global max pooling and global
    average pooling (concatenated), Dense(64, relu), Dropout(0.1),
    Dense(1, sigmoid).

    Args:
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, accuracy
        metric). The layer summary is printed as a side effect.
    """
    inputs = Input(shape=(seq_length,))
    # Both lookups read the same token ids; their vectors are joined per token.
    a = Embedding(num_words, embedding_dim, weights = [embedding_matrix], input_length = seq_length, trainable=False)(inputs)
    b = Embedding(num_words, embedding_dim, weights = [embedding_matrix_1], input_length = seq_length, trainable = False)(inputs)
    x = concatenate([a,b])
    x = SpatialDropout1D(0.2)(x)
    x = CuDNNLSTM(128, return_sequences=True)(x)
    x = CuDNNLSTM(64, return_sequences= True)(x)
    max_pool = GlobalMaxPooling1D()(x)
    avg_pool = GlobalAveragePooling1D()(x)
    y = concatenate([max_pool, avg_pool])
    outputs = Dense(64, activation='relu')(y)
    outputs = Dropout(0.1)(outputs)
    outputs = Dense(1, activation='sigmoid')(outputs)
    model = Model(inputs=inputs, outputs=outputs)
    optimizer = Adam(lr=lr)
    model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

In [None]:
# Build the network that the following cells train, save and evaluate; the
# learning rate is left at get_model_v2's default.
model = get_model_v2()

In [None]:
# Validation labels as a plain list of Python floats.
yvalid = list(map(float, Y_Valid))

In [None]:
# Y_Train is taken from a frame that was passed through applymap(str), so its
# entries may be text. Convert to a float32 array explicitly instead of
# relying on an implicit conversion inside fit(); this is a no-op when the
# labels are already numeric.
history = model.fit(
    xtrain,
    np.asarray(Y_Train, dtype='float32'),
    batch_size=370,
    epochs=15,
    verbose=1,
)

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
# NOTE(review): the file name says "CNN ... v1", but `model` is built by
# get_model_v2() in the visible cells — confirm the name before reusing the file.
model.save('model-CNN-para-v1.h5')

In [None]:
# Run the trained model on the padded validation sequences; the result is
# rounded to class labels in the next cell.
pred = model.predict(xvalid, verbose = True)


In [None]:
# Round each predicted score to the nearest whole number to get hard labels.
binary = np.around(pred, decimals=0)
# Validation labels as Python floats so they compare against `binary`.
Y_Valid = list(map(float, Y_Valid))

In [None]:
from sklearn.metrics import confusion_matrix,classification_report, f1_score

# Print the confusion matrix first, then the per-class report, both computed
# from the validation labels and the rounded predictions.
for make_report in (confusion_matrix, classification_report):
    print(make_report(Y_Valid, binary))