In [None]:
import os
import re
import csv
import codecs
import gensim
import itertools
import numpy as np
import pandas as pd
import operator
import sys

from nltk import ngrams
from collections import Counter
from string import punctuation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from iwillwin.trainer.supervised_trainer import KerasModelTrainer
from iwillwin.data_utils.data_helpers import DataTransformer, DataLoader
from iwillwin.config import dataset_config
from iwillwin.data_utils.feature_engineering import FeatureCreator, CharFeatureCreator
from simhash import Simhash

from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, make_scorer
from sklearn.decomposition import TruncatedSVD

from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from tqdm import tqdm
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize

import seaborn as sns
%matplotlib inline

Using TensorFlow backend.


# Phase 1 Feature Engineering

# Clean the texts and drop the duplicate pairs

In [None]:
# Read the raw train and test splits from the locations configured in dataset_config.
data_loader = DataLoader()
spn_train_df, test_df = (
    data_loader.load_dataset(path, names=None)
    for path in (dataset_config.DATASET_TRAIN_PATH, dataset_config.DATASET_TEST_PATH)
)

# Exact duplicate rows are dropped from the training split only; spn_train_df
# keeps pointing at the frame as it was loaded.
train_df = spn_train_df.drop_duplicates()

def preprocessing(text, clean_wiki_tokens=True, drop_image=True):
    """Clean one title and spread it into space-separated characters.

    The substitutions are applied strictly in the order listed below; the
    order matters (e.g. ``<i>`` has to be removed before characters are spread
    apart, and curly quotes are normalised before straight quotes are dropped).

    Args:
        text: The title to clean. ``None`` or any float (presumably pandas' NaN
            for a missing title) is treated as missing.
        clean_wiki_tokens: Unused; kept so existing callers keep working.
        drop_image: Unused; kept so existing callers keep working.

    Returns:
        The cleaned string, or the literal ``'error'`` for a missing title.
    """
    # isinstance also catches float subclasses such as numpy.float64, and None
    # is handled explicitly; both would otherwise blow up inside re.sub.
    if text is None or isinstance(text, float):
        return 'error'

    substitutions = (
        (r"\<i\>", " "),
        # NOTE: an unescaped "|" matches the empty string at every position, so
        # this step inserts a space before every character and at the end.
        # The space-separated spn_* columns rely on it, so it is kept as-is;
        # do not "fix" it to r"\|" without revisiting the downstream features.
        (r"|", " "),
        (r";", " "),
        (r"’", "'"),
        (r"‘", "'"),
        (r"!", " ! "),
        (r"¿", " ¿ "),
        (r",", " "),
        (r"–", " "),
        (r"−", " "),
        (r"\.", " "),
        # Second pass over "!" is preserved on purpose: it only widens the
        # padding, but removing it would change the stored text.
        (r"!", " ! "),
        (r"\/", " "),
        (r"_", " "),
        (r"\?", " ? "),
        (r"？", " ? "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"#", " # "),
        # Straight quotes (including the curly ones normalised above) are dropped last.
        (r"'", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

dfs = [train_df, test_df]

for df in dfs:
    # The working text columns start out as the Chinese titles.
    df['spn_1'] = df['title1_zh']
    df['spn_2'] = df['title2_zh']

    # Keep an untouched snapshot of the text before it is cleaned.
    df['raw_spn_1'] = df['spn_1'].values
    df['raw_spn_2'] = df['spn_2'].values

    # Overwrite the working columns with their cleaned form.
    df['spn_1'] = df['spn_1'].apply(preprocessing)
    df['spn_2'] = df['spn_2'].apply(preprocessing)

# Persist the cleaned frames; the feature-engineering cells reload them from disk.
train_df.to_csv(dataset_config.PROCESSED_CHARS_TRAIN_SET, index=False, encoding='utf-8')
test_df.to_csv(dataset_config.PROCESSED_CHARS_TEST_SET, index=False, encoding='utf-8')

In [None]:
# None of these constants is referenced in the visible cells of this notebook;
# the meanings below are inferred from the names only — TODO confirm against
# the code that consumes them.
NB_WORDS = 10000  # presumably the vocabulary size cap
EMBEDDING_DIM = 300  # presumably the embedding vector width
MAX_SEQUENCE_LENGTH = 50  # presumably the padded sequence length
OUT_SIZE = 3  # presumably the number of output classes

In [None]:
# Selects which output locations the engineered features are written to.
# Only 'RAW' is wired up below.
processed_on = 'RAW'

train_path = dataset_config.PROCESSED_CHARS_TRAIN_SET
test_path = dataset_config.PROCESSED_CHARS_TEST_SET

# Reload the cleaned frames from the CSVs written by the cleaning cell.
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
data_loader = DataLoader()
dfs = [train_df, test_df]

if processed_on == 'RAW':
    train_output_path = dataset_config.ENGINEERED_CHARS_TRAIN_SET
    test_output_path = dataset_config.ENGINEERED_CHARS_TEST_SET
else:
    # Fail here rather than with a NameError at the final save, which only
    # runs after the long feature-engineering cells have finished.
    raise ValueError(
        "Unsupported processed_on value {!r}; only 'RAW' has output paths configured.".format(processed_on)
    )
    
def split(v):
    """Coerce ``v`` to ``str`` and return its whitespace-separated tokens."""
    return str(v).split()

# Character tokens: every character of the cleaned text except the plain space.
for df in (train_df, test_df):
    for source, target in (('spn_1', 'splited_spn_1'), ('spn_2', 'splited_spn_2')):
        df[target] = df[source].apply(lambda text: [ch for ch in text if ch != ' '])

In [None]:
# Preview the first training rows, including the new splited_spn_* token columns.
train_df.head()

Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label,spn_1,spn_2,raw_spn_1,raw_spn_2,splited_spn_1,splited_spn_2
0,0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated,2 0 1 7 养 老 保 险 又 新 增 两 项 ， 农 村 老 人 人 人 可 申...,警 方 辟 谣 “ 鸟 巢 大 会 每 人 领 5 万 ” 仍 有 老 人 坚 持 进 京,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,"[2, 0, 1, 7, 养, 老, 保, 险, 又, 新, 增, 两, 项, ，, 农, ...","[警, 方, 辟, 谣, “, 鸟, 巢, 大, 会, 每, 人, 领, 5, 万, ”, ..."
1,3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated,""" 你 不 来 深 圳 ， 早 晚 你 儿 子 也 要 来 "" ， 不 出 1 0 年 深...",深 圳 G D P 首 超 香 港 ? 深 圳 统 计 局 辟 谣 ： 只 是 差 距...,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"["", 你, 不, 来, 深, 圳, ，, 早, 晚, 你, 儿, 子, 也, 要, 来, ...","[深, 圳, G, D, P, 首, 超, 香, 港, ?, 深, 圳, 统, 计, 局, ..."
2,1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated,""" 你 不 来 深 圳 ， 早 晚 你 儿 子 也 要 来 "" ， 不 出 1 0 年 深...",G D P 首 超 香 港 ? 深 圳 澄 清 ： 还 差 一 点 点 … …,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"["", 你, 不, 来, 深, 圳, ，, 早, 晚, 你, 儿, 子, 也, 要, 来, ...","[G, D, P, 首, 超, 香, 港, ?, 深, 圳, 澄, 清, ：, 还, 差, ..."
3,2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP topped Hong Kong last year? She...,unrelated,""" 你 不 来 深 圳 ， 早 晚 你 儿 子 也 要 来 "" ， 不 出 1 0 年 深...",去 年 深 圳 G D P 首 超 香 港 ? 深 圳 统 计 局 辟 谣 ： 还 差...,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"["", 你, 不, 来, 深, 圳, ，, 早, 晚, 你, 儿, 子, 也, 要, 来, ...","[去, 年, 深, 圳, G, D, P, 首, 超, 香, 港, ?, 深, 圳, 统, ..."
4,9,6,7,"""用大蒜鉴别地沟油的方法,怎么鉴别地沟油",吃了30年食用油才知道，一片大蒜轻松鉴别地沟油,"""How to discriminate oil from gutter oil by me...",It took 30 years of cooking oil to know that o...,agreed,""" 用 大 蒜 鉴 别 地 沟 油 的 方 法 怎 么 鉴 别 地 沟 油",吃 了 3 0 年 食 用 油 才 知 道 ， 一 片 大 蒜 轻 松 鉴 别 地 沟 油,"""用大蒜鉴别地沟油的方法,怎么鉴别地沟油",吃了30年食用油才知道，一片大蒜轻松鉴别地沟油,"["", 用, 大, 蒜, 鉴, 别, 地, 沟, 油, 的, 方, 法, 怎, 么, 鉴, ...","[吃, 了, 3, 0, 年, 食, 用, 油, 才, 知, 道, ，, 一, 片, 大, ..."


In [None]:
# Same preview for the test frame (it has no `label` column).
test_df.head()

Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,spn_1,spn_2,raw_spn_1,raw_spn_2,splited_spn_1,splited_spn_2
0,321187,167562,59521,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,egypt 's presidential election failed to win m...,Lyon! Lyon officials have denied that Felipe F...,萨 拉 赫 人 气 爆 棚 ! 埃 及 总 统 大 选 未 参 选 获 百 万 选...,辟 谣 ！ 里 昂 官 方 否 认 费 基 尔 加 盟 利 物 浦 ， 难 道 是 价 格...,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,"[萨, 拉, 赫, 人, 气, 爆, 棚, !, 埃, 及, 总, 统, 大, 选, 未, ...","[辟, 谣, ！, 里, 昂, 官, 方, 否, 认, 费, 基, 尔, 加, 盟, 利, ..."
1,321190,167564,91315,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,A message from Saddam Hussein after he was cap...,The Top 10 Americans believe that the Lizard M...,萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思,1 0 大 最 让 美 国 人 相 信 的 荒 诞 谣 言 ， 如 蜥 蜴 人 掌 控 着...,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,"[萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...","[1, 0, 大, 最, 让, 美, 国, 人, 相, 信, 的, 荒, 诞, 谣, 言, ..."
2,321189,167563,167564,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,Will the United States wage war on Iraq withou...,A message from Saddam Hussein after he was cap...,萨 达 姆 此 项 计 划 没 有 此 国 破 坏 的 话 ， 美 国 还 会 对 伊 拉...,萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,"[萨, 达, 姆, 此, 项, 计, 划, 没, 有, 此, 国, 破, 坏, 的, 话, ...","[萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ..."
3,321193,167564,160994,萨达姆被捕后告诫美国的一句话，发人深思,被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！,A message from Saddam Hussein after he was cap...,The hanging Saddam is a surrogate? This man's ...,萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思,被 绞 刑 处 死 的 萨 达 姆 是 替 身 ? 他 的 此 男 人 举 动 击 破...,萨达姆被捕后告诫美国的一句话，发人深思,被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！,"[萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...","[被, 绞, 刑, 处, 死, 的, 萨, 达, 姆, 是, 替, 身, ?, 他, 的, ..."
4,321191,167564,15084,萨达姆被捕后告诫美国的一句话，发人深思,中国川贝枇杷膏在美国受到热捧？纯属谣言！,A message from Saddam Hussein after he was cap...,Chinese loquat loquat plaster in America? Pure...,萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思,中 国 川 贝 枇 杷 膏 在 美 国 受 到 热 捧 ? 纯 属 谣 言 ！,萨达姆被捕后告诫美国的一句话，发人深思,中国川贝枇杷膏在美国受到热捧？纯属谣言！,"[萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...","[中, 国, 川, 贝, 枇, 杷, 膏, 在, 美, 国, 受, 到, 热, 捧, ?, ..."


In [None]:
%%time
# Hand both frames and the data loader to the project's CharFeatureCreator,
# with normalization switched off.
feature_creator = CharFeatureCreator(train_df, test_df, data_loader, normalization=False)
# create_features() returns a (train, test) pair that replaces the frames used
# by the cells below. Which columns it adds is not visible here — presumably
# the non-simhash/jellyfish names listed in meta_columns; TODO confirm.
train_df, test_df = feature_creator.create_features()

No of words in the dictionary = 5244
[FE] create the frequency features
[FE] creating the IR features
[FE] creating the weighted distance features
[FE] creating the length features
[FE] creating the weight features
[FE] creating the distance features
[FE] cosine_sim sample= 
 [0.31234752377721214, 0.37921028494152054]
[FE] manhattan_dis sample = 
 [44.0, 38.0]
[FE] eucledian_dis sample = 
 [6.782329983125268, 6.928203230275509]
[FE] jaccard_dis sample = 
 [0, 0]
[FE] minkowsk_dis sample = 
 [6.782329983125268, 6.928203230275509]
[FE] creating the fuzzy features
[FE] creating the topic word features
[FE] TODO! Create the graph features
[FE] create the frequency features
[FE] creating the IR features
[FE] creating the weighted distance features
[FE] creating the length features
[FE] creating the weight features
[FE] creating the distance features
[FE] cosine_sim sample= 
 [0.0, 0.32963425737213164]
[FE] manhattan_dis sample = 
 [57.0, 34.0]
[FE] eucledian_dis sample = 
 [8.30662386291807

## Feature Zoo

## SimHash

In [None]:
%%time
def create_hash_features(df):
    """Add SimHash-distance features for each title pair to ``df`` in place.

    For every row, the token lists in ``splited_spn_1`` and ``splited_spn_2``
    are compared through five views: the tokens themselves, token 2-grams,
    token 3-grams, and 2- / 3-character windows over the space-joined tokens.
    Each view yields one ``Simhash(...).distance(...)`` value.

    Columns written:
        sim_hash: the five distances joined with ``:`` (kept for compatibility).
        simhash_distance, simhash_distance_2gram, simhash_distance_3gram,
        simhash_distance_ch_2gram, simhash_distance_ch_3gram: the same five
        values as floats.

    Args:
        df: DataFrame holding ``splited_spn_1`` / ``splited_spn_2`` token lists.

    Returns:
        None. ``df`` is modified in place.
    """
    distance_columns = (
        'simhash_distance',
        'simhash_distance_2gram',
        'simhash_distance_3gram',
        'simhash_distance_ch_2gram',
        'simhash_distance_ch_3gram',
    )

    def get_word_ngrams(sequence, n=3):
        # Each run of n consecutive tokens becomes one space-joined feature.
        return [' '.join(ngram) for ngram in ngrams(sequence, n)]

    def get_character_ngrams(sequence, n=3):
        # Sliding n-character windows over the space-joined tokens.
        sequence = ' '.join(sequence)
        return [sequence[i:i+n] for i in range(len(sequence)-n+1)]

    def calculate_simhash_distance(sequence1, sequence2):
        return Simhash(sequence1).distance(Simhash(sequence2))

    def calculate_all_simhash(row):
        tokens1, tokens2 = row['splited_spn_1'], row['splited_spn_2']

        # Every view is derived from the ORIGINAL token lists. The previous
        # version rebound q1/q2 after each step, so the "3gram" feature was
        # computed over bigram strings and the character features over the
        # output of the steps before them.
        views = (
            (get_word_ngrams, 2),
            (get_word_ngrams, 3),
            (get_character_ngrams, 2),
            (get_character_ngrams, 3),
        )
        distances = [calculate_simhash_distance(tokens1, tokens2)]
        for extract, n in views:
            distances.append(calculate_simhash_distance(extract(tokens1, n), extract(tokens2, n)))

        return ':'.join(str(distance) for distance in distances)

    df['sim_hash'] = df.apply(calculate_all_simhash, axis=1)
    print("Build sim_hash")

    # Split the packed string once and fan the pieces out to the float columns.
    parts = df['sim_hash'].str.split(':')
    for position, column in enumerate(distance_columns):
        df[column] = parts.str[position].astype(float)
    
# Add the SimHash columns to each frame in place, reporting after each one.
for frame, done_message in ((train_df, "trainset has processed."), (test_df, "testset has processed.")):
    create_hash_features(frame)
    print(done_message)

  after removing the cwd from sys.path.


Build sim_hash
trainset has processed.
Build sim_hash
testset has processed.
Wall time: 1h 43min 56s


## JellyFish

In [None]:
%%time

import jellyfish
import numpy as np
def smith_waterman(a, b, alignment_score=1, gap_cost=1):
    """Score how well two sequences align locally (Smith-Waterman style).

    Scoring rules: a matching pair adds ``alignment_score``, a mismatch adds
    nothing, and a gap of any length costs a flat ``gap_cost``. Cell scores
    are clamped at 0.

    Args:
        a: First sequence (anything supporting ``len`` and integer indexing).
        b: Second sequence.
        alignment_score: Reward for a matching pair of elements.
        gap_cost: Flat penalty for opening a gap in either sequence.

    Returns:
        The highest cell score of the alignment matrix as a numpy float
        (0.0 when either sequence is empty). Only the score is returned,
        not the alignment itself.
    """
    n_rows, n_cols = len(a) + 1, len(b) + 1
    # H holds the alignment score at each point, computed incrementally.
    H = np.zeros((n_rows, n_cols))
    # col_best[j] == H[1:i, j].max() for the rows finished so far. Tracking it
    # (and row_best below) replaces a slice-max per cell, which made the
    # original cubic in the sequence lengths; the scores are unchanged.
    col_best = np.zeros(n_cols)
    for i in range(1, n_rows):
        row_best = 0.0  # == H[i, 1:j].max() for the cells already filled in this row
        for j in range(1, n_cols):
            # Score for pairing a[i-1] with b[j-1]: rewarded on a match,
            # carried over unchanged on a mismatch.
            match = H[i-1, j-1] + (alignment_score if a[i-1] == b[j-1] else 0)
            # Scores for introducing extra letters in one of the strings (or,
            # by symmetry, deleting them from the other).
            delete = col_best[j] - gap_cost if i > 1 else 0
            insert = row_best - gap_cost if j > 1 else 0
            score = max(match, delete, insert, 0)
            H[i, j] = score
            row_best = max(row_best, score)
        # Row i is complete: fold it into the per-column maxima for row i + 1.
        np.maximum(col_best, H[i], out=col_best)
    # The highest score is the best local alignment.
    return H.max()

# jellyfish renamed jaro_winkler() to jaro_winkler_similarity() and later
# dropped the old name; use whichever one the installed version provides.
_jaro_winkler = getattr(jellyfish, 'jaro_winkler_similarity', None) or jellyfish.jaro_winkler

for df in [train_df, test_df]:
    # NOTE: despite the "_distance" suffix, both columns hold scores where
    # higher means more alike. The names are kept because meta_columns lists them.
    df['jellyfish_jaro_winkler_distance'] = df[['spn_1', 'spn_2']].apply(lambda row: _jaro_winkler(row['spn_1'], row['spn_2']), axis=1)
    df['smith_waterman_distance'] = df[['spn_1', 'spn_2']].apply(lambda row: smith_waterman(row['spn_1'], row['spn_2']), axis=1)

Wall time: 2h 31min 13s


# Check feature correlations

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Pairwise correlations over the last 10,000 training rows. Non-numeric columns
# (titles, token lists, ...) are filtered out explicitly: pandas >= 2.0 raises
# on them, whereas older versions skipped them silently.
train_df.iloc[-10000:].select_dtypes(include=['number', 'bool']).corr()

## Extract training columns

In [None]:
# Names of the engineered feature columns. The list is not used by any visible
# cell of this notebook.
# Of these, only the simhash_* columns, 'jellyfish_jaro_winkler_distance' and
# 'smith_waterman_distance' are created in the visible cells; the rest are
# presumably produced by CharFeatureCreator.create_features() — TODO confirm
# that every name (e.g. 'cómo_both', 'raw_wmd', the lda_/lsi_ ones) exists.
meta_columns = ['bm25_q1_to_q2', 'bm25_q2_to_q1', 'weighted_cosine_sim',
       'len_word_max', 'len_word_min', 'len_char_max', 'len_char_min',
       'word_length_diff', 'char_length_diff', 'len_diff_remove_stopwords',
       'word_match', 'tfidf_word_match', 'shared_count', 'bigram_corr', 'trigram_corr',
       'word_match_no_stopwords', 'unique_word_ratio', 'cosine_sim',
       'manhattan_dis', 'eucledian_dis', 'jaccard_dis', 'minkowsk_dis',
       'fuzzy_ratio', 'fuzzy_set_ratio', 'fuzzy_partial_ratio',
       'fuzzy_token_sort_ratio', 'fuzzy_qratio', 'fuzzy_WRatio',
       'longest_substr_ratio', 'cómo_both', 'simhash_distance', 'simhash_distance_2gram',
       'simhash_distance_3gram', 'simhash_distance_ch_2gram',
       'simhash_distance_ch_3gram', 'raw_wmd', 'word2vec_jaccard_distance',
       'freq_based_word2vec_cosine_distance',
       'freq_based_word2vec_jaccard_distance',
       'lda_balanced_euclidean_distance', 'lsi_cosine_distance',
       'lsi_jaccard_distance', 'jellyfish_jaro_winkler_distance',
       'smith_waterman_distance'
]

# Output the engineered features

In [12]:
# Show where the engineered test set is about to be written.
test_output_path

'../data/processed_dataset/engineered_chars_test.csv'

In [13]:
# Persist both engineered frames as UTF-8 CSVs without the index column.
for frame, output_path in ((train_df, train_output_path), (test_df, test_output_path)):
    frame.to_csv(output_path, index=False, encoding='utf-8')