In [None]:
import sys
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

import os
import re
import codecs
import csv
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# Ref: https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = str(text).lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [3]:
BASE_DIR = './'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
# EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'

# Preprocessing Training Data

In [4]:
df = pd.read_csv(TRAIN_DATA_FILE, delimiter=',', encoding='utf-8')
print df.head()
print len(df.iloc[:, 3])

   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  
404290


In [5]:
texts_1 = map(text_to_wordlist, df.iloc[:, 3])
texts_2 = map(text_to_wordlist, df.iloc[:, 4])
labels = map(text_to_wordlist, df.iloc[:, 5])
print type(texts_1)
print 'Number of questions in train.csv: {}'.format(len(texts_1))

<type 'list'>
Number of questions in train.csv: 404290


# Preprocessing Test Data

In [16]:
test_df = pd.read_csv(TEST_DATA_FILE, delimiter=',', encoding='utf-8', quoting=3, error_bad_lines=False)
print test_df.head()
print len(test_df.iloc[:, 1])

  "test_id"                                        "question1"  \
0         0  "How does the Surface Pro himself 4 compare wi...   
1         1  "Should I have a hair transplant at age 24? Ho...   
2         2  "What but is the best way to send money from C...   
3         3                      "Which food not emulsifiers?"   
4         4               "How ""aberystwyth"" start reading?"   

                                         "question2"  
0  "Why did Microsoft choose core m3 and not core...  
1      "How much cost does hair transplant require?"  
2                    "What you send money to China?"  
3                                "What foods fibre?"  
4                   "How their can I start reading?"  
1566062


In [17]:
test_texts_1 = map(text_to_wordlist, test_df.iloc[:, 1])
test_texts_2 = map(text_to_wordlist, test_df.iloc[:, 2])
test_ids = map(text_to_wordlist, test_df.iloc[:, 0])
print type(test_texts_1)
print 'Number of questions in test.csv: {}'.format(len(test_texts_1))

<type 'list'>
Number of questions in test.csv: 1566062


# Tokenizing, sequencing data

In [22]:
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200 # Default 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

In [23]:
print 'Tokenizing...'
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

print 'Sequencing...'
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

print sequences_1

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [34]:
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

Found 118523 unique tokens
('Shape of data tensor:', (404290, 30))
('Shape of label tensor:', (404290,))


# Prepare Embeddings

In [35]:
nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

NameError: name 'word2vec' is not defined