In [1]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler

import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

Using TensorFlow backend.


In [2]:
########################################
## set directories and parameters
########################################
BASE_DIR = '../kaggle-quora/data/'
EMBEDDING_FILE = '../glove.840B.300d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = False # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

In [3]:
########################################
## process texts in datasets
########################################

print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

Processing text dataset


In [12]:
%%time

## process texts in datasets continued... ##

texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1) # Question_1, train
sequences_2 = tokenizer.texts_to_sequences(texts_2) # Question_2, train
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1) # Question_1, test
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2) # Question_2, test

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH) # Question_1, train
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH) # Question_2, train
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH) # Question_1, test
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH) # Question_2, test
test_ids = np.array(test_ids)

Found 404290 texts in train.csv
Found 2345796 texts in test.csv
Found 120500 unique tokens
('Shape of data tensor:', (404290, 30))
('Shape of label tensor:', (404290,))
CPU times: user 7min 33s, sys: 8.93 s, total: 7min 42s
Wall time: 7min 49s


In [17]:
pd.DataFrame(test_data_1).to_csv('./test_data_1.csv')
pd.DataFrame(test_data_2).to_csv('./test_data_2.csv')

In [4]:
# TODO: Delete after use

train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

ques = pd.concat([train_df[['question1', 'question2']], \
        test_df[['question1', 'question2']]], axis=0).reset_index(drop='index')
q_dict = defaultdict(set)

In [10]:
# TODO: Delete after use

for i in range(ques.shape[0]):
    q_dict[ques.question1[i]].add(ques.question2[i])
    q_dict[ques.question2[i]].add(ques.question1[i])
type(q_dict)

collections.defaultdict

In [7]:
%%time

########################################
## generate leaky features
########################################

train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)

ques = pd.concat([train_df[['question1', 'question2']], \
        test_df[['question1', 'question2']]], axis=0).reset_index(drop='index')
q_dict = defaultdict(set)
for i in range(ques.shape[0]):
        q_dict[ques.question1[i]].add(ques.question2[i])
        q_dict[ques.question2[i]].add(ques.question1[i])

def q1_freq(row):
    return(len(q_dict[row['question1']]))
    
def q2_freq(row):
    return(len(q_dict[row['question2']]))
    
def q1_q2_intersect(row):
    return(len(set(q_dict[row['question1']]).intersection(set(q_dict[row['question2']]))))

train_df['q1_q2_intersect'] = train_df.apply(q1_q2_intersect, axis=1, raw=True)
train_df['q1_freq'] = train_df.apply(q1_freq, axis=1, raw=True)
train_df['q2_freq'] = train_df.apply(q2_freq, axis=1, raw=True)

test_df['q1_q2_intersect'] = test_df.apply(q1_q2_intersect, axis=1, raw=True)
test_df['q1_freq'] = test_df.apply(q1_freq, axis=1, raw=True)
test_df['q2_freq'] = test_df.apply(q2_freq, axis=1, raw=True)

leaks = train_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]
test_leaks = test_df[['q1_q2_intersect', 'q1_freq', 'q2_freq']]

ss = StandardScaler()
ss.fit(np.vstack((leaks, test_leaks)))
leaks = ss.transform(leaks)
test_leaks = ss.transform(test_leaks)

CPU times: user 8min 53s, sys: 9.82 s, total: 9min 3s
Wall time: 9min 7s


In [76]:
print train_df.shape, test_df.shape
train_df.head()

(404290, 9) (2345796, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,q1_q2_intersect,q1_freq,q2_freq
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0,1,2
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,0,8,3
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,0,2,1
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,0,1,1
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,0,3,1


In [16]:
%%time

########################################
## index word vectors
########################################
print('Indexing word vectors')

embeddings_index = {}
f = open(EMBEDDING_FILE)
count = 0
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %d word vectors of glove.' % len(embeddings_index))

########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Indexing word vectors
Found 2196016 word vectors of glove.
Preparing embedding matrix
Null word embeddings: 33233
CPU times: user 1min 58s, sys: 9.73 s, total: 2min 8s
Wall time: 2min 12s


In [7]:
%%time
# Don't use this for now, use my own customized oversampling

########################################
## sample train/validation data
########################################

#np.random.seed(1234)
# perm = np.random.permutation(len(data_1))
# idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
# idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

# data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
# data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
# leaks_train = np.vstack((leaks[idx_train], leaks[idx_train]))
# labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

# data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
# data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
# leaks_val = np.vstack((leaks[idx_val], leaks[idx_val]))
# labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

# weight_val = np.ones(len(labels_val))
# if re_weight:
#     weight_val *= 0.472001959
#     weight_val[labels_val==0] = 1.309028344

CPU times: user 371 ms, sys: 305 ms, total: 675 ms
Wall time: 804 ms


# Oversample the Data

In [17]:
# Data are: data_1, data_2, leaks, labels

def oversampling(x_train, y_train):
    # Now we oversample the negative class - on your own risk of overfitting!
    pos_train = x_train[y_train == 1]
    neg_train = x_train[y_train == 0]

    print("Oversampling started for proportion: {}".format(1.0*len(pos_train) / (1.0*len(pos_train) + 1.0*len(neg_train))))
    p = 0.165
    scale = ((1.0*len(pos_train) / (1.0*len(pos_train) + 1.0*len(neg_train))) / p) - 1
    while scale > 1:
        neg_train = pd.concat([neg_train, neg_train])
        scale -=1
    neg_train = pd.concat([neg_train, neg_train[:int(scale * len(neg_train))]])
    print("Oversampling done, new proportion: {}".format(1.0*len(pos_train) / (1.0*len(pos_train) + 1.0*len(neg_train))))

    x_train = pd.concat([pos_train, neg_train])
    y_train = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
    del pos_train, neg_train
    
    return (x_train, y_train)

In [68]:
data_1, labels_temp = oversampling(pd.DataFrame(data_1), labels)
data_2, labels_temp = oversampling(pd.DataFrame(data_2), labels)
leaks, labels = oversampling(pd.DataFrame(leaks), labels)
labels = pd.Series(labels)
print data_1.shape, data_2.shape, leaks.shape, labels.shape

 (780486, 30) (780486, 30) (780486, 3) (780486,)


In [27]:
%%time

########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

leaks_input = Input(shape=(leaks.shape[1],))
leaks_dense = Dense(num_dense/2, activation=act)(leaks_input)

merged = concatenate([x1, y1, leaks_dense])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

########################################
## add class weight
########################################
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None

CPU times: user 1.94 s, sys: 443 ms, total: 2.38 s
Wall time: 2.05 s


In [31]:
print class_weight

None


# Generate 5 Fold Train for Stacking

In [65]:
print data_1.shape, data_2.shape, leaks.shape, len(labels)
print type(data_1)
print type(labels)

(780486, 30) (780486, 30) (780486, 3) 780486
<class 'pandas.core.frame.DataFrame'>
<type 'list'>


In [29]:
########################################
## prepare to train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
# model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

lstm_217_103_0.25_0.18


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss


def five_fold_train(model, name='model', folds=5):
    ''' Function for Generate Five Fold Stacking Features '''
    skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
    fold_count = 1
    for idx_train, idx_val in skf.split(data_1, labels):
        print('Folds: ', fold_count)
        
        # Generate data...
        data_1_train = data_1.iloc[idx_train]
        data_2_train = data_2.iloc[idx_train]
        leaks_train = leaks.iloc[idx_train]
        labels_train = labels.iloc[idx_train]

        data_1_val = data_1.iloc[idx_val]
        data_2_val = data_2.iloc[idx_val]
        leaks_val = leaks.iloc[idx_val]
        labels_val = labels.iloc[idx_val]
        
        
        # Training...
        hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
                         validation_data=([data_1_val, data_2_val, leaks_val], labels_val), \
                         epochs=200, \
                         batch_size=2048, \
                         shuffle=True, \
                         callbacks=[early_stopping, model_checkpoint])
        
        model.load_weights(bst_model_path)
        bst_val_score = min(hist.history['val_loss'])
        print "Best Validation Score: ", bst_val_score
        
        # Validating. Layer 2 training features...
        val = model.predict([data_1_val, data_2_val, leaks_val], batch_size=8192, verbose=1)
        
        val = model.predict_proba(x_train_scaled[val_index])
        val = pd.DataFrame(val)

        val.to_csv('./{}_fold_{}.csv'.format(name, fold_count))
        pd.DataFrame(labels_val).to_csv('./{}_yval_fold_{}.csv'.format(name, fold_count)) # For sanity check.
        print('Validation Error: ', log_loss(labels_val, val))
        fold_count = fold_count + 1

        break
#     return model

In [None]:
### Train ###

trained_model = five_fold_train(model, name='lstmGlove', folds=5)

# hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
#         validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
#         epochs=200, batch_size=2048, shuffle=True, \
#         class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

# model.load_weights(bst_model_path)
# bst_val_score = min(hist.history['val_loss'])

In [None]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

preds = model.predict([test_data_1, test_data_2, test_leaks], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1, test_leaks], batch_size=8192, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)