In [1]:
import os
import re
import csv
import codecs
import gensim
import itertools
import numpy as np
import pandas as pd
import operator
import sys

from nltk import ngrams
from collections import Counter
from string import punctuation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from iwillwin.trainer.supervised_trainer import KerasModelTrainer
from iwillwin.data_utils.data_helpers import DataTransformer, DataLoader
from iwillwin.config import dataset_config
from iwillwin.data_utils.feature_engineering import FeatureCreator
from simhash import Simhash

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, make_scorer
from sklearn.decomposition import TruncatedSVD

from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize

import seaborn as sns
%matplotlib inline

import jieba

Using TensorFlow backend.


# Phase 1 Feature Engineering

# Clean the texts and drop the duplicate pairs

In [2]:
data_loader = DataLoader()

spn_train_df = data_loader.load_dataset(dataset_config.DATASET_TRAIN_PATH, names=None)
test_df = data_loader.load_dataset(dataset_config.DATASET_TEST_PATH, names=None)

train_df = spn_train_df
train_df = train_df.drop_duplicates()

def preprocessing(text, clean_wiki_tokens=True, drop_image=True):
    if type(text) == float:
        return 'error'
    
    words = [w for w in jieba.cut(text)]
    text = " ".join(words)    
    text = re.sub(r"\<i\>", "", text)
    text = re.sub(r"|", "", text)
    text = re.sub(r";", "", text)
    text = re.sub(r"，", "'", text)
    text = re.sub(r"！ ", "'", text)
    text = re.sub(r"!", "", text)
    text = re.sub(r"¿", "", text)
    text = re.sub(r",", "", text)
    text = re.sub(r"–", "", text)
    text = re.sub(r"−", "", text)
    text = re.sub(r"\.", "", text)
    text = re.sub(r"!", "", text)
    text = re.sub(r"\/", "", text)
    text = re.sub(r"_", "", text)
    text = re.sub(r"\?", "", text)
    text = re.sub(r"？", "", text)
    text = re.sub(r"\^", "", text)
    text = re.sub(r"\+", "", text)
    text = re.sub(r"\-", "", text)
    text = re.sub(r"\=", "", text)
    text = re.sub(r"#", "", text)

    text = re.sub(r"'", "", text)
    return text

dfs = [train_df, test_df]

train_df['spn_1'] = train_df['title1_zh']
train_df['spn_2'] = train_df['title2_zh']

test_df['spn_1'] = test_df['title1_zh']
test_df['spn_2'] = test_df['title2_zh']

for df in [train_df, test_df,]:
    df['raw_spn_1'] = df['spn_1'].values
    df['raw_spn_2'] = df['spn_2'].values

for df in dfs:
    df['spn_1'] = df['spn_1'].apply(lambda v: preprocessing(v))
    df['spn_2'] = df['spn_2'].apply(lambda v: preprocessing(v))

train_df.to_csv(dataset_config.PROCESSED_WORDS_TRAIN_SET, index=False, encoding='utf-8')
test_df.to_csv(dataset_config.PROCESSED_WORDS_TEST_SET, index=False, encoding='utf-8')    

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\zake7\AppData\Local\Temp\jieba.cache
Loading model cost 0.534 seconds.
Prefix dict has been built succesfully.


In [3]:
NB_WORDS = 10000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LENGTH = 50
OUT_SIZE = 3

In [4]:
processed_on = 'RAW'

train_path = dataset_config.PROCESSED_WORDS_TRAIN_SET
test_path = dataset_config.PROCESSED_WORDS_TEST_SET

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
data_loader = DataLoader()
dfs = [train_df, test_df]

if processed_on == 'RAW':
    train_output_path = dataset_config.ENGINEERED_WORDS_TRAIN_SET
    test_output_path = dataset_config.ENGINEERED_WORDS_TEST_SET
    
def split(v):
    v = str(v)
    return v.split()

for df in [train_df, test_df]:
    df['splited_spn_1'] = df['spn_1'].apply(lambda v: v.split())
    df['splited_spn_2'] = df['spn_2'].apply(lambda v: v.split())    

In [5]:
train_df.head()

Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label,spn_1,spn_2,raw_spn_1,raw_spn_2,splited_spn_1,splited_spn_2
0,0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated,2017 养老保险 又 新增 两项 农村 老人 人人 可 申领 你 领到 了 吗,警方 辟谣 “ 鸟巢 大会 每人 领 5 万 ” 仍 有 老人 坚持 进京,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,"[2017, 养老保险, 又, 新增, 两项, 农村, 老人, 人人, 可, 申领, 你, ...","[警方, 辟谣, “, 鸟巢, 大会, 每人, 领, 5, 万, ”, 仍, 有, 老人, ..."
1,3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated,""" 你 不来 深圳 早晚 你 儿子 也 要 来 "" 不出 10 年 深圳 人均 GDP ...",深圳 GDP 首超 香港 深圳 统计局 辟谣 ： 只是 差距 在 缩小,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"["", 你, 不来, 深圳, 早晚, 你, 儿子, 也, 要, 来, "", 不出, 10, ...","[深圳, GDP, 首超, 香港, 深圳, 统计局, 辟谣, ：, 只是, 差距, 在, 缩小]"
2,1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated,""" 你 不来 深圳 早晚 你 儿子 也 要 来 "" 不出 10 年 深圳 人均 GDP ...",GDP 首超 香港 深圳 澄清 ： 还 差 一点点 … …,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"["", 你, 不来, 深圳, 早晚, 你, 儿子, 也, 要, 来, "", 不出, 10, ...","[GDP, 首超, 香港, 深圳, 澄清, ：, 还, 差, 一点点, …, …]"
3,2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP topped Hong Kong last year? She...,unrelated,""" 你 不来 深圳 早晚 你 儿子 也 要 来 "" 不出 10 年 深圳 人均 GDP ...",去年 深圳 GDP 首超 香港 深圳 统计局 辟谣 ： 还 差 611 亿,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"["", 你, 不来, 深圳, 早晚, 你, 儿子, 也, 要, 来, "", 不出, 10, ...","[去年, 深圳, GDP, 首超, 香港, 深圳, 统计局, 辟谣, ：, 还, 差, 61..."
4,9,6,7,"""用大蒜鉴别地沟油的方法,怎么鉴别地沟油",吃了30年食用油才知道，一片大蒜轻松鉴别地沟油,"""How to discriminate oil from gutter oil by me...",It took 30 years of cooking oil to know that o...,agreed,""" 用 大蒜 鉴别 地沟油 的 方法 怎么 鉴别 地沟油",吃 了 30 年 食用油 才 知道 一片 大蒜 轻松 鉴别 地沟油,"""用大蒜鉴别地沟油的方法,怎么鉴别地沟油",吃了30年食用油才知道，一片大蒜轻松鉴别地沟油,"["", 用, 大蒜, 鉴别, 地沟油, 的, 方法, 怎么, 鉴别, 地沟油]","[吃, 了, 30, 年, 食用油, 才, 知道, 一片, 大蒜, 轻松, 鉴别, 地沟油]"


In [6]:
test_df.head()

Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,spn_1,spn_2,raw_spn_1,raw_spn_2,splited_spn_1,splited_spn_2
0,321187,167562,59521,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,egypt 's presidential election failed to win m...,Lyon! Lyon officials have denied that Felipe F...,萨拉 赫 人气 爆棚 埃及 总统大选 未 参选 获 百万 选票 现任 总统 压力 山 大,辟谣 里昂 官方 否认 费 基尔 加盟 利物浦 难道 是 价格 没 谈拢,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,"[萨拉, 赫, 人气, 爆棚, 埃及, 总统大选, 未, 参选, 获, 百万, 选票, 现任...","[辟谣, 里昂, 官方, 否认, 费, 基尔, 加盟, 利物浦, 难道, 是, 价格, 没,..."
1,321190,167564,91315,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,A message from Saddam Hussein after he was cap...,The Top 10 Americans believe that the Lizard M...,萨达姆 被捕 后 告诫 美国 的 一句 话 发人深思,10 大 最 让 美国 人 相信 的 荒诞 谣言 如 蜥蜴人 掌控 着 美国,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,"[萨达姆, 被捕, 后, 告诫, 美国, 的, 一句, 话, 发人深思]","[10, 大, 最, 让, 美国, 人, 相信, 的, 荒诞, 谣言, 如, 蜥蜴人, 掌控..."
2,321189,167563,167564,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,Will the United States wage war on Iraq withou...,A message from Saddam Hussein after he was cap...,萨达姆 此项 计划 没有 此国 破坏 的话 美国 还会 对 伊拉克 发动战争 吗,萨达姆 被捕 后 告诫 美国 的 一句 话 发人深思,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,"[萨达姆, 此项, 计划, 没有, 此国, 破坏, 的话, 美国, 还会, 对, 伊拉克, ...","[萨达姆, 被捕, 后, 告诫, 美国, 的, 一句, 话, 发人深思]"
3,321193,167564,160994,萨达姆被捕后告诫美国的一句话，发人深思,被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！,A message from Saddam Hussein after he was cap...,The hanging Saddam is a surrogate? This man's ...,萨达姆 被捕 后 告诫 美国 的 一句 话 发人深思,被 绞刑 处死 的 萨达姆 是 替身 他 的 此 男人 举动 击破 替身 谣言 ！,萨达姆被捕后告诫美国的一句话，发人深思,被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！,"[萨达姆, 被捕, 后, 告诫, 美国, 的, 一句, 话, 发人深思]","[被, 绞刑, 处死, 的, 萨达姆, 是, 替身, 他, 的, 此, 男人, 举动, 击破..."
4,321191,167564,15084,萨达姆被捕后告诫美国的一句话，发人深思,中国川贝枇杷膏在美国受到热捧？纯属谣言！,A message from Saddam Hussein after he was cap...,Chinese loquat loquat plaster in America? Pure...,萨达姆 被捕 后 告诫 美国 的 一句 话 发人深思,中国 川贝 枇杷膏 在 美国 受到 热 捧 纯属 谣言 ！,萨达姆被捕后告诫美国的一句话，发人深思,中国川贝枇杷膏在美国受到热捧？纯属谣言！,"[萨达姆, 被捕, 后, 告诫, 美国, 的, 一句, 话, 发人深思]","[中国, 川贝, 枇杷膏, 在, 美国, 受到, 热, 捧, 纯属, 谣言, ！]"


In [7]:
%%time
feature_creator = FeatureCreator(train_df, test_df, data_loader, normalization=False)
train_df, test_df = feature_creator.create_features()

No of words in the dictionary = 84834
[FE] create the frequency features
[FE] creating the IR features
[FE] creating the weighted distance features
[FE] creating the length features
[FE] creating the weight features
[FE] creating the distance features
[FE] cosine_sim sample= 
 [0.06900655593423542, 0.31448545101657555]
[FE] manhattan_dis sample = 
 [27.0, 24.0]
[FE] eucledian_dis sample = 
 [5.196152422706632, 5.291502622129181]
[FE] jaccard_dis sample = 
 [0.03571428571428571, 0]
[FE] minkowsk_dis sample = 
 [5.196152422706632, 5.291502622129181]
[FE] creating the fuzzy features
[FE] creating the topic word features
[FE] TODO! Create the graph features
[FE] create the frequency features
[FE] creating the IR features
[FE] creating the weighted distance features
[FE] creating the length features
[FE] creating the weight features
[FE] creating the distance features
[FE] cosine_sim sample= 
 [0.0, 0.24253562503633294]
[FE] manhattan_dis sample = 
 [29.0, 20.0]
[FE] eucledian_dis sample = 

## Features Zoo

## SimHash

In [8]:
%%time
def create_hash_features(df):

    def get_word_ngrams(sequence, n=3):
        return [' '.join(ngram) for ngram in ngrams(sequence, n)]

    def get_character_ngrams(sequence, n=3):
        sequence = ' '.join(sequence)
        return [sequence[i:i+n] for i in range(len(sequence)-n+1)]

    def calculate_simhash_distance(sequence1, sequence2):
        return Simhash(sequence1).distance(Simhash(sequence2))

    def calculate_all_simhash(row):
        q1, q2 = row['splited_spn_1'], row['splited_spn_2']
        simhash_distance = calculate_simhash_distance(q1, q2)

        q1, q2 = get_word_ngrams(q1, 2), get_word_ngrams(q2, 2)
        simhash_distance_2gram = calculate_simhash_distance(q1, q2)

        q1, q2 = get_word_ngrams(q1, 3), get_word_ngrams(q2, 3)
        simhash_distance_3gram = calculate_simhash_distance(q1, q2)

        q1, q2 = get_character_ngrams(q1, 2), get_character_ngrams(q2, 2)
        simhash_distance_ch_2gram = calculate_simhash_distance(q1, q2)

        q1, q2 = get_character_ngrams(q1, 3), get_character_ngrams(q2, 3)
        simhash_distance_ch_3gram = calculate_simhash_distance(q1, q2)

        return '{}:{}:{}:{}:{}'.format(simhash_distance, simhash_distance_2gram, simhash_distance_3gram,
                                             simhash_distance_ch_2gram, simhash_distance_ch_3gram,)


    df['sim_hash'] = df.apply(lambda row: calculate_all_simhash(row), axis=1)
    print("Build sim_hash")
    df['simhash_distance'] = df['sim_hash'].apply(lambda x: float(x.split(':')[0]))
    df['simhash_distance_2gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[1]))
    df['simhash_distance_3gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[2]))
    df['simhash_distance_ch_2gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[3]))
    df['simhash_distance_ch_3gram'] = df['sim_hash'].apply(lambda x: float(x.split(':')[4]))
    
create_hash_features(train_df)
print("trainset has processed.")
create_hash_features(test_df)
print("testset has processed.")

  after removing the cwd from sys.path.


Build sim_hash
trainset has processed.
Build sim_hash
testset has processed.
Wall time: 1h 7min 2s


## JellyFish

In [9]:
%%time

import jellyfish
import numpy as np
def smith_waterman(a, b, alignment_score=1, gap_cost=1):
  # H holds the alignment score at each point, computed incrementally
    H = np.zeros((len(a) + 1, len(b) + 1))
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
        # The score for substituting the letter a[i-1] for b[j-1]. Generally low
        # for mismatch, high for match.
            match = H[i-1,j -1] + (alignment_score if a[i-1] == b[j-1] else 0)
            # The scores for for introducing extra letters in one of the strings (or
            # by symmetry, deleting them from the other).
            delete = H[1:i,j].max() - gap_cost if i > 1 else 0
            insert = H[i,1:j].max() - gap_cost if j > 1 else 0
            H[i, j] = max(match, delete, insert, 0)
    # The highest score is the best local alignment.
    # For our purposes, we don't actually care _what_ the alignment was, just how
    # aligned the two strings were.
    return H.max()

for df in [train_df, test_df]:
    df['jellyfish_jaro_winkler_distance'] = df[['spn_1', 'spn_2']].apply(lambda row: jellyfish.jaro_winkler(row['spn_1'], row['spn_2']), axis=1)
    df['smith_waterman_distance'] = df[['spn_1', 'spn_2']].apply(lambda row: smith_waterman(row['spn_1'], row['spn_2']), axis=1)

Wall time: 1h 6min 23s


# Check features correlation

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
train_df.iloc[-10000:].corr()

Unnamed: 0,id,tid1,tid2,bm25_q1_to_q2,bm25_q2_to_q1,weighted_cosine_sim,len_word_max,len_word_min,len_char_max,len_char_min,...,q1_qué,q2_qué,qué_both,simhash_distance,simhash_distance_2gram,simhash_distance_3gram,simhash_distance_ch_2gram,simhash_distance_ch_3gram,jellyfish_jaro_winkler_distance,smith_waterman_distance
id,1.0,0.108226,0.071973,-0.076214,-0.077182,-0.061058,0.009765,-0.040056,-0.00928,-0.067526,...,,,,0.030228,0.053861,0.029927,0.045483,0.026139,-0.032645,-0.051735
tid1,0.108226,1.0,0.294812,-0.114171,-0.121506,-0.116647,-0.0224,-0.048601,-0.019254,-0.048562,...,,,,0.082436,0.051228,0.028568,0.083775,0.079725,-0.0482,-0.066867
tid2,0.071973,0.294812,1.0,0.072492,0.063952,0.069738,0.011731,0.045773,-0.005947,0.038901,...,,,,-0.062692,-0.051823,-0.032828,-0.065964,-0.070893,0.106472,0.102722
bm25_q1_to_q2,-0.076214,-0.114171,0.072492,1.0,0.973797,0.919055,0.053826,0.251035,0.019754,0.21377,...,,,,-0.769901,-0.606508,-0.383226,-0.739795,-0.492305,0.695813,0.665216
bm25_q2_to_q1,-0.077182,-0.121506,0.063952,0.973797,1.0,0.921253,0.070051,0.230944,0.032744,0.194022,...,,,,-0.770268,-0.605321,-0.388996,-0.737343,-0.485394,0.622651,0.628454
weighted_cosine_sim,-0.061058,-0.116647,0.069738,0.919055,0.921253,1.0,-0.077305,0.064433,-0.112351,0.030513,...,,,,-0.748751,-0.570895,-0.375503,-0.690306,-0.377853,0.664222,0.524746
len_word_max,0.009765,-0.0224,0.011731,0.053826,0.070051,-0.077305,1.0,0.577986,0.812562,0.510876,...,,,,0.050664,0.019413,0.018946,-0.050342,-0.347331,-0.021187,0.44022
len_word_min,-0.040056,-0.048601,0.045773,0.251035,0.230944,0.064433,0.577986,1.0,0.488899,0.90856,...,,,,-0.099133,-0.101633,-0.044995,-0.215782,-0.601265,0.151787,0.742559
len_char_max,-0.00928,-0.019254,-0.005947,0.019754,0.032744,-0.112351,0.812562,0.488899,1.0,0.538249,...,,,,0.075791,0.045719,0.034415,-0.010078,-0.27049,-0.089846,0.384953
len_char_min,-0.067526,-0.048562,0.038901,0.21377,0.194022,0.030513,0.510876,0.90856,0.538249,1.0,...,,,,-0.066824,-0.071919,-0.018999,-0.182653,-0.542706,0.089535,0.710006


## Extract training columns

In [12]:
meta_columns = ['bm25_q1_to_q2', 'bm25_q2_to_q1', 'weighted_cosine_sim',
       'len_word_max', 'len_word_min', 'len_char_max', 'len_char_min',
       'word_length_diff', 'char_length_diff', 'len_diff_remove_stopwords',
       'word_match', 'tfidf_word_match', 'shared_count', 'bigram_corr', 'trigram_corr',
       'word_match_no_stopwords', 'unique_word_ratio', 'cosine_sim',
       'manhattan_dis', 'eucledian_dis', 'jaccard_dis', 'minkowsk_dis',
       'fuzzy_ratio', 'fuzzy_set_ratio', 'fuzzy_partial_ratio',
       'fuzzy_token_sort_ratio', 'fuzzy_qratio', 'fuzzy_WRatio',
       'longest_substr_ratio', 'cómo_both', 'simhash_distance', 'simhash_distance_2gram',
       'simhash_distance_3gram', 'simhash_distance_ch_2gram',
       'simhash_distance_ch_3gram', 'raw_wmd', 'word2vec_jaccard_distance',
       'freq_based_word2vec_cosine_distance',
       'freq_based_word2vec_jaccard_distance',
       'lda_balanced_euclidean_distance', 'lsi_cosine_distance',
       'lsi_jaccard_distance', 'jellyfish_jaro_winkler_distance',
       'smith_waterman_distance'
]

# Output the engineered features

In [13]:
test_output_path

'../data/processed_dataset/engineered_words_test.csv'

In [14]:
train_df.to_csv(train_output_path, index=False, encoding='utf-8')
test_df.to_csv(test_output_path, index=False, encoding='utf-8')