In [None]:
# I am starting with a baseline model.
# Then I will use Keras Deep Learning Algorithms to create a simple NN Model.
# Then I will use CNN and LSTM models to try to improve on the Simple NN Model.

In [None]:
#Importing Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("../input"))
import time
from tqdm import tqdm
import math
import matplotlib.pyplot as plt
import seaborn as sns

#Wordcloud
from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

#Keras
from keras.models import Sequential
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D,CuDNNLSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Read train.csv and test.csv from ../input into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)
# Report the (rows, columns) shape of each frame.
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

In [None]:
# Preview the first five rows of the training data.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test data.
test_df.head(n=5)

In [None]:
# Count missing values per column of the training data.
train_df.isna().sum(axis="index")

In [None]:
# Shapes of the training and test frames, one per line.
print(train_df.shape, test_df.shape, sep="\n")

In [None]:
# Hold out 15% of the labelled questions as a validation set.
questions, target = train_df['question_text'], train_df['target']

# random_state is fixed so the split is the same on every run.
questions_train, questions_val, target_train, target_val = train_test_split(
    questions,
    target,
    test_size=0.15,
    random_state=1000,
)


In [None]:
#Wordcloud of Questions just for fun
# Figure defaults (matplotlib's usual values noted on the right).
plt.rcParams['figure.figsize'] = (10.0, 8.0)    #(6.0,4.0)
plt.rcParams['font.size'] = 16                  #10
plt.rcParams['savefig.dpi'] = 300               #72
plt.rcParams['figure.subplot.bottom'] = .1

stopwords = set(STOPWORDS)

# str(Series) returns pandas' truncated repr (a handful of rows plus index labels
# and the "Name/Length/dtype" footer), so the cloud would not reflect the data.
# Build the text from the questions themselves instead; a fixed-size, seeded
# random sample keeps the generation time bounded and reproducible.
WORDCLOUD_SAMPLE_SIZE = 50000
wordcloud_sample = questions.sample(
    n=min(WORDCLOUD_SAMPLE_SIZE, len(questions)),
    random_state=42,
)
wordcloud_text = " ".join(wordcloud_sample.astype(str))

wordcloud = WordCloud(
    background_color='white',
    stopwords=stopwords,
    max_words=100,
    max_font_size=40,
    random_state=42,
).generate(wordcloud_text)

fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Baseline features: learn a bag-of-words vocabulary from the training questions.
vectorizer = CountVectorizer()
vectorizer.fit(raw_documents=questions_train)

In [None]:
# Turn the training and validation questions into count matrices using the
# vocabulary fitted on the training questions.
X_train, X_val = (
    vectorizer.transform(texts)
    for texts in (questions_train, questions_val)
)
X_train

In [None]:
# Baseline model 1: logistic regression on the count features.
classifier = LogisticRegression().fit(X_train, target_train)

# score() reports mean accuracy on the validation split.
score = classifier.score(X=X_val, y=target_val)

print("Accuracy:", score)

In [None]:
# Classification report for the logistic regression baseline on the validation split.
predictions = classifier.predict(X_val)
print(classification_report(y_true=target_val, y_pred=predictions))

In [None]:
#SVM Algorithm - Baseline 2
# Linear SVM (hinge loss) trained with SGD on the count features.
# `n_iter` has been replaced by `max_iter` in scikit-learn; `tol=None` disables
# early stopping so exactly 10 passes over the data are made, as before.
text_clf_svm = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    max_iter=10,
    tol=None,
    random_state=42,
)
text_clf_svm.fit(X_train, target_train)
predicted_svm = text_clf_svm.predict(X_val)
# Validation accuracy: fraction of predictions equal to the true label.
np.mean(predicted_svm == target_val)

In [None]:
# Classification report for the SGD-trained SVM baseline on the validation split.
print(classification_report(y_true=target_val, y_pred=predicted_svm))

In [None]:
#Tokenizing using Keras
# Fit the word index on the training questions only; num_words=5000 caps the
# word ids that texts_to_sequences will emit.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(questions_train)

# NOTE: X_train is rebound here from the CountVectorizer matrix to lists of
# word ids, and X_test holds the encoded *validation* questions.
X_train = tokenizer.texts_to_sequences(questions_train)
X_test = tokenizer.texts_to_sequences(questions_val)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# questions_train still carries the shuffled row labels of train_df, so
# questions_train[2] would be a label lookup (possibly a KeyError, and not the
# row encoded in X_train[2]). Use positional access to show the matching pair.
print(questions_train.iloc[2])
print(X_train[2])

In [None]:
# Show the word id the tokenizer assigned to a few sample words.
for word in ('the', 'all', 'happy', 'sad'):
    token_id = tokenizer.word_index[word]
    print('{}: {}'.format(word, token_id))

In [None]:
# Pad (at the end) or truncate every encoded question to a fixed length.
maxlen = 100
X_train, X_test = (
    pad_sequences(encoded, maxlen=maxlen, padding='post')
    for encoded in (X_train, X_test)
)
# First padded training sequence.
print(X_train[0, :])

In [None]:
# First Keras model: one hidden dense layer on top of the padded word-id sequences.
input_dim = X_train.shape[1]  # length of each padded sequence

model = Sequential([
    layers.Dense(10, input_dim=input_dim, activation='relu'),
    layers.Dense(1, activation='sigmoid'),  # single probability output
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for two epochs, validating on the held-out split after each epoch.
history = model.fit(
    X_train,
    target_train,
    validation_data=(X_test, target_val),
    epochs=2,
    batch_size=600,
    verbose=False,
)

In [None]:
# Accuracy on the training data.
loss, accuracy = model.evaluate(x=X_train, y=target_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
# Accuracy on the validation split (X_test holds the encoded validation questions).
loss, accuracy = model.evaluate(x=X_test, y=target_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
# F1 score of the first Keras model on the validation split, swept over
# decision thresholds from 0.10 to 0.50 in steps of 0.01.
pred_paragram_target_val = model.predict([X_test], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    # Predicted probabilities above the threshold count as the positive class.
    labels_at_thresh = (pred_paragram_target_val > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(target_val, labels_at_thresh)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

In [None]:
# Use matplotlib's 'ggplot' style sheet for the figures drawn from here on.
plt.style.use('ggplot')

def plot_history(history):
    """Plot training vs. validation accuracy and loss, one point per epoch.

    Parameters
    ----------
    history : object with a ``history`` dict attribute
        The value returned by ``model.fit``. The dict must contain 'loss' and
        'val_loss', plus an accuracy series stored under either
        'acc'/'val_acc' or 'accuracy'/'val_accuracy'.

    Raises
    ------
    KeyError
        If one of the required series is missing.
    """
    logs = history.history
    # The accuracy metric is logged as 'acc' by some Keras versions and as
    # 'accuracy' by others; accept both so the helper works with either.
    acc_key = 'acc' if 'acc' in logs else 'accuracy'
    acc = logs[acc_key]
    val_acc = logs['val_' + acc_key]
    loss = logs['loss']
    val_loss = logs['val_loss']
    x = range(1, len(acc) + 1)  # epochs numbered from 1

    plt.figure(figsize=(12, 5))
    # Left panel: accuracy.
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    # Right panel: loss.
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [None]:
# Accuracy / loss curves for the model trained above.
plot_history(history=history)

In [None]:
# Configuration for the pre-trained embedding experiments:
#   embed_size   - how big each word vector is
#   max_features - how many unique words to use (i.e. rows in the embedding matrix)
#   maxlen       - max number of words of a question to use
embed_size, max_features, maxlen = 300, 50000, 100

# Pretrained Embeddings Glove

In [None]:
## some config values
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Return ``(word, vector)`` for one space-split line of the embedding file.

    ``word`` is the first field of the line; the remaining fields are
    converted to a float32 NumPy array.
    """
    return word, np.asarray(arr, dtype='float32')


# Read the file inside a context manager so the handle is closed once the
# dictionary is built, and state the encoding explicitly instead of relying
# on the platform default.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.split(" ")) for line in embedding_file)

# Stack every loaded vector to get the overall mean / std of the embedding values.
# np.stack needs a real sequence, so materialise the dict view as a list.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))

# Start from random vectors with the same mean / std as the loaded embeddings ...
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# ... then overwrite the row of every word that has a pre-trained vector.
# Without this step the matrix stays purely random and nothing pre-trained is used.
for vocab_word, vocab_id in word_index.items():
    if vocab_id >= nb_words:
        continue  # id falls outside the matrix
    pretrained_vector = embeddings_index.get(vocab_word)
    if pretrained_vector is not None:
        embedding_matrix[vocab_id] = pretrained_vector

In [None]:
# Count the rows of the embedding matrix that contain at least one non-zero
# value, then express that count as a fraction of vocab_size.
nonzero_per_row = np.count_nonzero(embedding_matrix, axis=1)
nonzero_elements = np.count_nonzero(nonzero_per_row)
nonzero_elements / vocab_size

In [None]:
#Keras Model With Pretrained Glove Text
# NOTE: the model definition below is wrapped in a triple-quoted string, so it is
# only a string literal and is never executed. It describes the variant whose
# embedding layer is frozen (trainable=False).
"""
model = Sequential()
model.add(layers.Embedding(max_features, embed_size, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=False))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
"""

In [None]:
#Trainable = False
# NOTE: the training / evaluation code below is wrapped in a triple-quoted string,
# so it is only a string literal and is never executed.
"""
history = model.fit(X_train, target_train,
                    epochs=2,
                    verbose=False,
                    validation_data=(X_test, target_val),
                    batch_size=800)
loss, accuracy = model.evaluate(X_train, target_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, target_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)
"""

In [None]:
#Trainable = True
# Embedding layer initialised from embedding_matrix and fine-tuned during training,
# followed by max-pooling over the sequence and a small dense classifier.
# The layer dimensions are read from the matrix itself so the layer and its
# initial weights always agree on shape (the matrix has nb_words rows, which
# can be smaller than max_features).
embedding_rows, embedding_cols = embedding_matrix.shape

model = Sequential()
model.add(layers.Embedding(embedding_rows, embedding_cols,
                           weights=[embedding_matrix],
                           input_length=maxlen,
                           trainable=True))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
# Train for two epochs, validating on the held-out split after each epoch.
history = model.fit(
    X_train,
    target_train,
    validation_data=(X_test, target_val),
    epochs=2,
    batch_size=800,
    verbose=False,
)

# Accuracy on the training data.
loss, accuracy = model.evaluate(x=X_train, y=target_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

# Accuracy on the validation split (X_test holds the encoded validation questions).
loss, accuracy = model.evaluate(x=X_test, y=target_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

plot_history(history)

In [None]:
# CNN: an embedding layer learned from scratch, one 1-D convolution
# (128 filters, kernel size 5), global max-pooling and a small dense classifier.
embedding_dim = 100

model = Sequential([
    layers.Embedding(max_features, embedding_dim, input_length=maxlen),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the CNN for two epochs, validating on the held-out split after each epoch.
history = model.fit(
    X_train,
    target_train,
    validation_data=(X_test, target_val),
    epochs=2,
    batch_size=600,
    verbose=False,
)

# Accuracy on the training data.
loss, accuracy = model.evaluate(x=X_train, y=target_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

# Accuracy on the validation split (X_test holds the encoded validation questions).
loss, accuracy = model.evaluate(x=X_test, y=target_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

plot_history(history)

In [None]:
# F1 score of the CNN model on the validation split, swept over decision
# thresholds from 0.10 to 0.50 in steps of 0.01.
pred_paragram_target_val = model.predict([X_test], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    # Predicted probabilities above the threshold count as the positive class.
    labels_at_thresh = (pred_paragram_target_val > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(target_val, labels_at_thresh)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

In [None]:
# Recurrent model: an embedding layer learned from scratch, a bidirectional
# CuDNNLSTM returning the full sequence, global max-pooling and a dense classifier.
embedding_dim = 100

model = Sequential([
    layers.Embedding(max_features, embedding_dim, input_length=maxlen),
    Bidirectional(CuDNNLSTM(64, return_sequences=True)),
    layers.GlobalMaxPooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()


In [None]:
# Train the recurrent model for ten epochs, validating on the held-out split
# after each epoch.
history = model.fit(
    X_train,
    target_train,
    validation_data=(X_test, target_val),
    epochs=10,
    batch_size=800,
    verbose=False,
)

# Accuracy on the training data.
loss, accuracy = model.evaluate(x=X_train, y=target_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

# Accuracy on the validation split (X_test holds the encoded validation questions).
loss, accuracy = model.evaluate(x=X_test, y=target_val, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

plot_history(history)

In [None]:
# F1 score of the LSTM model (trained for 10 epochs) on the validation split,
# swept over decision thresholds from 0.10 to 0.50 in steps of 0.01.
pred_paragram_target_val = model.predict([X_test], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    # Predicted probabilities above the threshold count as the positive class.
    labels_at_thresh = (pred_paragram_target_val > thresh).astype(int)
    f1_at_thresh = metrics.f1_score(target_val, labels_at_thresh)
    print("F1 score at threshold {0} is {1}".format(thresh, f1_at_thresh))

In [None]:
# Question text of the test set; show the entry labelled 0 as a sanity check.
questions_test = test_df['question_text']
questions_test.loc[0]

In [None]:
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

# Encode the test questions with the tokenizer that was fitted on
# questions_train. Fitting a fresh Tokenizer on the test set would assign
# different ids to the same words, so the trained model would receive inputs
# that do not correspond to what it was trained on.
test = tokenizer.texts_to_sequences(questions_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Sanity check: a raw test question next to its encoded form.
print(questions_test[2])
print(test[2])

In [None]:
# Pad (at the end) or truncate every encoded test question to maxlen entries.
maxlen = 100
test = pad_sequences(sequences=test, maxlen=maxlen, padding='post')
# First padded test sequence.
print(test[0, :])

In [None]:
# Predict on the padded test sequences with the most recently trained model.
predicted_test_y = model.predict(
    [test],
    batch_size=1024,
    verbose=1,
)

In [None]:
# Turn the predictions into 0/1 labels with a 0.33 decision threshold and
# write them, keyed by qid, to submission.csv.
predicted_test_y = np.greater(predicted_test_y, 0.33).astype(int)

output = pd.DataFrame({"qid": test_df["qid"].values})
output['prediction'] = predicted_test_y

output.to_csv("submission.csv", index=False)