# Quora Question Pair - Process Raw Data

 - created in 2021/5/9, Zhu Zhongbo, first trial with kaggle solutions

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pwd

/content


In [3]:
% cd /content/drive/MyDrive/QuoraQuestionPair/

/content/drive/MyDrive/QuoraQuestionPair


## Modules and global variables

In [2]:
import os
import sys
import re
import csv,json
import codecs
import numpy as np
import pandas as pd
from zipfile import ZipFile
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from os.path import expanduser, exists
from keras.utils.data_utils import get_file

Using TensorFlow backend.


 - codecs – String encoding and decoding
Purpose:	Encoders and decoders for converting text between different representations.
Available In:	2.1 and later
The codecs module provides stream and file interfaces for transcoding data in your program. It is most commonly used to work with Unicode text, but other encodings are also available for other purposes.

- keras - api based on tensorflow

In [3]:
BASE_DIR = './data/'
KERAS_DATASETS_DIR = expanduser('~/.keras/datasets/')
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
GLOVE_FILE = 'glove.840B.300d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
TRAIN_FEAT_FILE = BASE_DIR + 'X_train.csv'
TEST_FEAT_FILE = BASE_DIR + 'X_test.csv'
COMPLETE_TRAIN_FEAT_FILE = BASE_DIR + 'X_train_full.csv'
COMPLETE_TEST_FEAT_FILE = BASE_DIR + 'X_test_full.csv'
# params
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1
# output files
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
Q1_TEST_DATA_FILE = 'q1_test.npy'
Q2_TEST_DATA_FILE = 'q2_test.npy'
TEST_ID_FILE = 'test_ids.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
TRAIN_FEAT_NPY_FILE = 'train_feat_array.npy'
TEST_FEAT_NPY_FILE = 'test_feat_array.npy'

## Data Prepare

In [None]:
# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

Found 404290 texts in train.csv


In [None]:
print(texts_1[0:10])

['what is the step by step guide to invest in share market in india ', 'what is the story of kohinoor koh - i - noor diamond ', 'how can i increase the speed of my internet connection while using a vpn ', 'why am i mentally very lonely how can i solve it ', 'which one dissolve in water quikly sugar salt methane and carbon di oxide ', 'astrology : i am a capricorn sun cap moon and cap rising what does that say about me ', 'should i buy tiago ', 'how can i be a good geologist ', 'when do you use instead of ', 'motorola company : can i hack my charter motorolla dcx3400 ']


In [None]:
print(texts_2[0:10])

['what is the step by step guide to invest in share market ', 'what would happen if the indian government stole the kohinoor koh - i - noor diamond back ', 'how can internet speed be increased by hacking through dns ', 'find the remainder when math 23 ^ 24 math is divided by 24 23 ', 'which fish would survive in salt water ', 'i am a triple capricorn sun moon and ascendant in capricorn what does this say about me ', 'what keeps childern active and far from phone and video games ', 'what should i do to be a great geologist ', 'when do you use instead of and ', 'how do i hack motorola dcx3400 for free internet ']


In [None]:
print(labels[0:10])

[0, 0, 0, 0, 0, 1, 0, 1, 0, 0]


In [None]:
test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

Found 2345796 texts in test.csv


In [None]:
print(test_texts_1[0:10])

['how does the surface pro himself 4 compare with ipad pro ', 'should i have a hair transplant at age 24 how much would it cost ', 'what but is the best way to send money from china to the us ', 'which food not emulsifiers ', 'how aberystwyth start reading ', 'how are the two wheeler insurance from bharti axa insurance ', 'how can i reduce my belly fat through a diet ', 'by scrapping the 500 and 1000 rupee notes how is rbi planning to fight against issue black money ', 'what are the how best books of all time ', 'after 12th years old boy and i had sex with a 12 years old girl with her consent is there anything wrong ']


In [None]:
print(test_ids[0:10])

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


## Tokenization

In [None]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

In [None]:
sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)


In [None]:
print(sequences_1[0:10])

In [None]:
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

In [None]:
print(list(word_index.items())[0:30])

In [None]:
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

In [None]:
print(type(data_1))
print(data_1[0:5])

In [None]:
test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

In [None]:
print(type(test_data_1[0:10]))
print(test_data_1.shape)

## GLOVE

In [None]:
if not exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE):
    print("no such zip file, download it")
    zipfile = ZipFile(get_file(GLOVE_ZIP_FILE, GLOVE_ZIP_FILE_URL))
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)
elif exists(KERAS_DATASETS_DIR + GLOVE_ZIP_FILE) and (not exists(KERAS_DATASETS_DIR + GLOVE_FILE)):
    print("had the zip file, extract it")
    zipfile = ZipFile(KERAS_DATASETS_DIR+GLOVE_ZIP_FILE)
    zipfile.extract(GLOVE_FILE, path=KERAS_DATASETS_DIR)

print("Processing", GLOVE_FILE)

embeddings_index = {}
with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))

had the zip file, extract it
Processing glove.840B.300d.txt
Word embeddings: 2196016


In [None]:
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))

Null word embeddings: 33233


In [None]:
print('Shape of question1 data tensor:', data_1.shape)
print('Shape of question2 data tensor:', data_2.shape)
print('Shape of label tensor:', labels.shape)
print('Shape of question1 test data tensor:', test_data_1.shape)
print('Shape of question2 test data tensor:', test_data_2.shape)
print('Shape of label tensor:', test_ids.shape)

Shape of question1 data tensor: (404290, 30)
Shape of question2 data tensor: (404290, 30)
Shape of label tensor: (404290,)
Shape of question1 test data tensor: (2345796, 30)
Shape of question2 test data tensor: (2345796, 30)
Shape of label tensor: (2345796,)


In [None]:
np.save(open(Q1_TRAINING_DATA_FILE, 'wb'), data_1)
np.save(open(Q2_TRAINING_DATA_FILE, 'wb'), data_2)
np.save(open(Q1_TEST_DATA_FILE, 'wb'), test_data_1)
np.save(open(Q2_TEST_DATA_FILE, 'wb'), test_data_2)
np.save(open(TEST_ID_FILE, 'wb'), test_ids)
np.save(open(LABEL_TRAINING_DATA_FILE, 'wb'), labels)
np.save(open(WORD_EMBEDDING_MATRIX_FILE, 'wb'), word_embedding_matrix)
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump({'nb_words': nb_words}, f)

## Merge More Features Into One File

In [3]:
def extend_file(input_fname, data_frame_orig):
  fpath = BASE_DIR + input_fname
  pf = pd.read_csv(fpath)
  pf.fillna(value=0)
  print(pf.shape)
  new_data_frame = pd.concat([data_frame_orig, pf], axis=1)
  return new_data_frame

In [4]:
x_train_orig = pd.read_csv(TRAIN_FEAT_FILE)
x_test_orig = pd.read_csv(TEST_FEAT_FILE)
x_train_orig.shape, x_test_orig.shape

((404290, 17), (2345796, 17))

In [4]:
# merge things here
X_train = extend_file('extra_train_feat.csv', x_train_orig)
X_test = extend_file('extra_test_feat.csv', x_test_orig)

(404290, 73)
(2345796, 73)


In [7]:
# save to csv files
X_train.to_csv(COMPLETE_TRAIN_FEAT_FILE, index=False)
X_test.to_csv(COMPLETE_TEST_FEAT_FILE, index=False)

In [8]:
X_train.shape, X_test.shape

((404290, 90), (2345796, 90))

In [9]:
del X_train
del X_test

## Feature Extraction + Standard Scaler

In [10]:
from sklearn.preprocessing import StandardScaler

In [5]:
# new feaatures
train_feat = pd.read_csv(COMPLETE_TRAIN_FEAT_FILE)
test_feat = pd.read_csv(COMPLETE_TEST_FEAT_FILE)
train_feat = train_feat.fillna(value=0)
test_feat = test_feat.fillna(value=0)
train_feat.shape, test_feat.shape

((404290, 90), (2345796, 90))

In [13]:
train_feat

Unnamed: 0,word_match,tfidf_wm,tfidf_wm_stops,jaccard,wc_diff,wc_ratio,wc_diff_unique,wc_ratio_unique,wc_diff_unq_stop,wc_ratio_unique_stop,...,tfidf_mean2,tfidf_len1,tfidf_len2,graph_clique_feat,q1_pr,q2_pr,q1_freq,q2_freq,q1_q2_intersect,q1_q2_wm_ratio
0,0.727273,0.796024,0.772164,0.769231,2,0.857143,1,0.916667,1,0.833333,...,0.417370,6,5,2.0,1.608405e-07,3.047505e-07,1,2,0,0.000000
1,0.307692,0.359200,0.361758,0.250000,5,1.625000,4,1.500000,5,2.250000,...,0.340163,5,8,8.0,8.189555e-07,3.094298e-07,8,3,0,0.000000
2,0.363636,0.304783,0.355191,0.200000,4,0.714286,4,0.714286,1,0.833333,...,0.440659,6,5,2.0,3.047507e-07,1.608406e-07,2,1,0,0.000000
3,0.000000,0.000000,0.000000,0.000000,2,0.818182,1,0.900000,1,1.250000,...,0.433367,3,5,0.0,2.088105e-07,2.088105e-07,1,1,0,0.000000
4,0.000000,0.034253,0.000000,0.111111,6,0.538462,6,0.538462,5,0.500000,...,0.496439,9,4,3.0,3.404968e-07,1.277957e-07,3,1,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404285,0.857143,0.837624,0.828152,0.785714,1,0.928571,1,0.923077,0,1.000000,...,0.398473,6,6,3.0,1.933878e-07,1.952633e-07,3,3,0,0.000000
404286,0.666667,0.683850,0.657463,0.454545,1,1.125000,0,1.000000,0,1.000000,...,0.571469,3,3,15.0,4.349779e-07,4.397159e-07,15,15,13,0.863133
404287,0.500000,0.602845,0.712011,0.166667,1,0.750000,1,0.750000,0,1.000000,...,1.000000,1,1,0.0,2.088105e-07,2.088105e-07,1,1,0,0.000000
404288,0.000000,0.005178,0.000000,0.025641,8,1.470588,6,1.352941,4,1.444444,...,0.268516,9,13,0.0,2.088105e-07,2.088105e-07,1,1,0,0.000000


In [12]:
test_feat

Unnamed: 0,word_match,tfidf_wm,tfidf_wm_stops,jaccard,wc_diff,wc_ratio,wc_diff_unique,wc_ratio_unique,wc_diff_unq_stop,wc_ratio_unique_stop,...,tfidf_mean2,tfidf_len1,tfidf_len2,graph_clique_feat,q1_pr,q2_pr,q1_freq,q2_freq,q1_q2_intersect,q1_q2_wm_ratio
0,0.266667,0.245900,0.290094,0.090909,3,1.272727,2,1.181818,3,1.500000,...,0.314640,5,9,0.0,2.088105e-07,2.088105e-07,1,1,0,0.0
1,0.500000,0.439759,0.480962,0.235294,7,0.500000,7,0.500000,2,0.714286,...,0.430365,5,5,2.0,2.366496e-07,2.193292e-07,2,2,0,0.0
2,0.444444,0.418727,0.468893,0.285714,8,0.428571,6,0.500000,3,0.500000,...,0.571907,5,3,0.0,2.088105e-07,2.088105e-07,1,1,0,0.0
3,0.000000,0.000000,0.000000,0.000000,1,0.750000,1,0.750000,0,1.000000,...,0.699928,2,2,0.0,2.088105e-07,2.088105e-07,1,1,0,0.0
4,0.800000,0.851476,1.000000,0.428571,2,1.500000,2,1.500000,1,0.666667,...,0.699651,3,2,0.0,2.088105e-07,2.088105e-07,1,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2345791,0.000000,0.000000,0.000000,0.000000,2,0.818182,2,0.818182,3,0.571429,...,0.488174,8,4,0.0,2.088105e-07,2.088105e-07,1,1,0,0.0
2345792,0.222222,0.223549,0.238630,0.076923,4,0.555556,4,0.555556,1,0.800000,...,0.492754,5,4,0.0,2.088105e-07,2.088105e-07,1,1,0,0.0
2345793,0.000000,0.000000,0.000000,0.000000,1,0.888889,2,0.777778,2,0.666667,...,0.443660,5,5,0.0,2.088105e-07,2.088105e-07,1,1,0,0.0
2345794,0.869565,0.887973,0.936711,0.739130,0,1.000000,0,1.000000,1,1.090909,...,0.288907,10,11,0.0,2.088105e-07,2.088105e-07,1,1,0,0.0


In [14]:
train_feat = train_feat.replace([np.inf, -np.inf], 0)
test_feat = test_feat.replace([np.inf, -np.inf], 0)
train_feat.shape, test_feat.shape

((404290, 90), (2345796, 90))

In [15]:
ss = StandardScaler()

train_feat_ss = train_feat[train_feat.columns.values]
test_feat_ss = test_feat[test_feat.columns.values]

np.vstack((train_feat_ss, test_feat_ss)).shape

(2750086, 90)

In [16]:
ss.fit(np.vstack((train_feat_ss, test_feat_ss)))
train_feat_ss = ss.transform(train_feat_ss)
test_feat_ss = ss.transform(test_feat_ss)

In [11]:
print(type(train_feat_ss))
print(type(test_feat_ss))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [17]:
train_feat_ss

array([[ 1.73268924,  2.06181366,  1.85449699, ..., -0.04504227,
        -0.11478188, -0.24204707],
       [ 0.05078041,  0.26531904,  0.25809247, ..., -0.03159392,
        -0.11478188, -0.24204707],
       [ 0.27503492,  0.04151896,  0.23255036, ..., -0.05849061,
        -0.11478188, -0.24204707],
       ...,
       [ 0.82165529,  1.26733775,  1.62051008, ..., -0.05849061,
        -0.11478188, -0.24204707],
       [-1.1826194 , -1.19064572, -1.14907551, ..., -0.05849061,
        -0.11478188, -0.24204707],
       [ 2.82592999,  2.62373743,  2.74073429, ..., -0.05849061,
        -0.11478188, -0.24204707]])

In [18]:
test_feat_ss

array([[-0.1136729 , -0.20064503, -0.02066457, ..., -0.05849061,
        -0.11478188, -0.24204707],
       [ 0.82165529,  0.5966286 ,  0.72177347, ..., -0.04504227,
        -0.11478188, -0.24204707],
       [ 0.59895811,  0.51012821,  0.67483084, ..., -0.05849061,
        -0.11478188, -0.24204707],
       ...,
       [-1.1826194 , -1.21194048, -1.14907551, ..., -0.05849061,
        -0.11478188, -0.24204707],
       [ 2.30307572,  2.43996797,  2.49455362, ..., -0.05849061,
        -0.11478188, -0.24204707],
       [ 0.59895811,  0.87149159,  1.048907  , ..., -0.05849061,
        -0.11478188, -0.24204707]])

In [19]:
np.save(open(TRAIN_FEAT_NPY_FILE, 'wb'), train_feat_ss)
np.save(open(TEST_FEAT_NPY_FILE, 'wb'), test_feat_ss)