# Quora Question Pair - Process Raw Data

 - created in 2021/5/9, Zhu Zhongbo, first trial with kaggle solutions

## Modules and global variables

In [24]:
import os
import sys
import re
import csv,json
import codecs
import numpy as np
import pandas as pd
from zipfile import ZipFile
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from os.path import expanduser, exists

 - codecs – String encoding and decoding
Purpose:	Encoders and decoders for converting text between different representations.
Available In:	2.1 and later
The codecs module provides stream and file interfaces for transcoding data in your program. It is most commonly used to work with Unicode text, but other encodings are also available for other purposes.

- keras - api based on tensorflow

In [37]:
BASE_DIR = './data/'
KERAS_DATASETS_DIR = expanduser('~/.keras/datasets/')
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
GLOVE_FILE = 'glove.840B.300d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
Q1_TEST_DATA_FILE = 'q1_test.npy'
Q2_TEST_DATA_FILE = 'q2_test.npy'
TEST_ID_FILE = 'test_ids.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'

## Data Prepare

In [3]:
# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

Found 404290 texts in train.csv


In [4]:
print(texts_1[0:10])

['what is the step by step guide to invest in share market in india ', 'what is the story of kohinoor koh - i - noor diamond ', 'how can i increase the speed of my internet connection while using a vpn ', 'why am i mentally very lonely how can i solve it ', 'which one dissolve in water quikly sugar salt methane and carbon di oxide ', 'astrology : i am a capricorn sun cap moon and cap rising what does that say about me ', 'should i buy tiago ', 'how can i be a good geologist ', 'when do you use instead of ', 'motorola company : can i hack my charter motorolla dcx3400 ']


In [5]:
print(texts_2[0:10])

['what is the step by step guide to invest in share market ', 'what would happen if the indian government stole the kohinoor koh - i - noor diamond back ', 'how can internet speed be increased by hacking through dns ', 'find the remainder when math 23 ^ 24 math is divided by 24 23 ', 'which fish would survive in salt water ', 'i am a triple capricorn sun moon and ascendant in capricorn what does this say about me ', 'what keeps childern active and far from phone and video games ', 'what should i do to be a great geologist ', 'when do you use instead of and ', 'how do i hack motorola dcx3400 for free internet ']


In [6]:
print(labels[0:10])

[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]


In [7]:
test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

Found 2345796 texts in test.csv


In [8]:
print(test_texts_1[0:10])

['how does the surface pro himself 4 compare with ipad pro ', 'should i have a hair transplant at age 24 how much would it cost ', 'what but is the best way to send money from china to the us ', 'which food not emulsifiers ', 'how aberystwyth start reading ', 'how are the two wheeler insurance from bharti axa insurance ', 'how can i reduce my belly fat through a diet ', 'by scrapping the 500 and 1000 rupee notes how is rbi planning to fight against issue black money ', 'what are the how best books of all time ', 'after 12th years old boy and i had sex with a 12 years old girl with her consent is there anything wrong ']


In [9]:
print(test_ids[0:10])

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


## Tokenization

In [10]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

In [11]:
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)


In [21]:
print(sequences_1[0:10])

[[2, 3, 1, 1254, 61, 1254, 2921, 8, 578, 7, 759, 370, 7, 35], [2, 3, 1, 532, 10, 16563, 11914, 4, 22978, 4743], [5, 13, 4, 293, 1, 423, 10, 18, 334, 1724, 153, 127, 6, 2885], [15, 47, 4, 3396, 278, 3289, 5, 13, 4, 682, 17], [23, 48, 5750, 7, 204, 55553, 1593, 2208, 10718, 12, 1892, 7839, 5204], [2922, 4, 47, 6, 9205, 921, 4796, 825, 12, 4796, 5013, 2, 21, 30, 206, 50, 54], [29, 4, 122, 17729], [5, 13, 4, 28, 6, 42, 29375], [37, 9, 16, 71, 466, 10], [7208, 173, 13, 4, 549, 18, 13182, 98267, 93282]]


In [12]:
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Found 120499 unique tokens


In [15]:
print(list(word_index.items())[0:30])

[('the', 1), ('what', 2), ('is', 3), ('i', 4), ('how', 5), ('a', 6), ('in', 7), ('to', 8), ('do', 9), ('of', 10), ('are', 11), ('and', 12), ('can', 13), ('for', 14), ('why', 15), ('you', 16), ('it', 17), ('my', 18), ('best', 19), ('on', 20), ('does', 21), ('or', 22), ('which', 23), ('if', 24), ('have', 25), ('get', 26), ('with', 27), ('be', 28), ('should', 29), ('that', 30)]


In [16]:
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)


In [19]:
print(type(data_1))
print(data_1[0:5])

<class 'numpy.ndarray'>
[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     2     3     1  1254    61  1254  2921     8
    578     7   759   370     7    35]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     2     3     1   532
     10 16563 11914     4 22978  4743]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     5    13     4   293     1   423    10    18
    334  1724   153   127     6  2885]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0    15    47     4  3396   278
   3289     5    13     4   682    17]
 [    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0    23    48  5750     7   204 55553  1593
   2208 10718    12  1892  7839  5204]]


In [20]:
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

In [33]:
print(type(test_data_1[0:10]))
print(test_data_1.shape)

<class 'numpy.ndarray'>
(2345796, 30)


## GLOVE

In [30]:
if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE):
    print("no such zip file, download it")
    zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)
elif exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE) and (not exists(KERAS_DATASETS_DIR + GLOVE_FILE)):
    print("had the zip file, extract it")
    zipfile = ZipFile(KERAS_DATASETS_DIR+GLOVE_ZIP_FILE)
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

print("Processing", GLOVE_FILE)

embeddings_index = {}
with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

had the zip file, extract it
Processing glove.840B.300d.txt
Word embeddings: 2196016


In [31]:
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

Null word embeddings: 33233


In [35]:
print('Shape of question1 data tensor:', data_1.shape)
print('Shape of question2 data tensor:', data_2.shape)
print('Shape of label tensor:', labels.shape)
print('Shape of question1 test data tensor:', test_data_1.shape)
print('Shape of question2 test data tensor:', test_data_2.shape)
print('Shape of label tensor:', test_ids.shape)

Shape of question1 data tensor: (404290, 30)
Shape of question2 data tensor: (404290, 30)
Shape of label tensor: (404290,)
Shape of question1 test data tensor: (2345796, 30)
Shape of question2 test data tensor: (2345796, 30)
Shape of label tensor: (2345796,)


In [38]:
np.save(open(Q1_TRAINING_DATA_FILE, 'wb'), data_1)
np.save(open(Q2_TRAINING_DATA_FILE, 'wb'), data_2)
np.save(open(Q1_TEST_DATA_FILE, 'wb'), test_data_1)
np.save(open(Q2_TEST_DATA_FILE, 'wb'), test_data_2)
np.save(open(TEST_ID_FILE, 'wb'), test_ids)
np.save(open(LABEL_TRAINING_DATA_FILE, 'wb'), labels)
np.save(open(WORD_EMBEDDING_MATRIX_FILE, 'wb'), word_embedding_matrix)
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump({'nb_words': nb_words}, f)