In [11]:
import time
import re
import math
import jieba
import json
import codecs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
jieba.load_userdict('./dict/jieba_dict')

In [15]:
# Load the two raw ATEC training dumps (tab-separated, no header row) and give
# both the same column names before stacking them.
data_1 = pd.read_csv('./data/atec_nlp_sim_train_add.csv', sep='\t', header=None, encoding='utf-8')
data_1.columns = ['id', 'question1', 'question2', 'is_duplicate']
data_2 = pd.read_csv('./data/atec_nlp_sim_train.csv', sep='\t', header=None, encoding='utf-8')
data_2.columns = ['id', 'question1', 'question2', 'is_duplicate']
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep their
# own 0-based labels and the stacked frame carries duplicated index labels.
all_data = pd.concat([data_1, data_2], ignore_index=True)

In [2]:
# Read the pre-merged corpus (space-separated, no header row). This rebinds
# all_data, replacing whatever an earlier cell stored under that name.
all_data = pd.read_csv(
    './data/all_data',
    sep=' ',
    header=None,
    encoding='utf-8',
)
all_data.columns = [
    'id',
    'question1',
    'question2',
    'is_duplicate',
]
all_data.head()

Unnamed: 0,id,question1,question2,is_duplicate
0,1,我要取消预约理财,怎么取消我的定期理财,0
1,2,我银行卡密码正确，但是错误的。,密码输入正确，提示错误,1
2,3,自动还款是扣余额宝的吗,余额宝有钱，还款当日自动扣钱吗,1
3,4,赔付的运费险什么时候到账,我退货了，什么时候退运费给我？,0
4,5,余额宝的***转不出来了吗,为什么我的余额宝转不进去？,0


In [3]:
def jieba_cut(sentence):
    '''Segment ``sentence`` with ``jieba.cut(..., cut_all=False)``.

    Args:
        sentence: text to segment.

    Returns:
        list: the tokens yielded by jieba, in order.
    '''
    return list(jieba.cut(sentence, cut_all=False))

def jieba_word_cut(sentence):
    '''Expand a token sequence into characters plus the multi-character tokens.

    A one-character token is emitted unchanged. Any other token is emitted
    character by character and then once more as the whole token.

    Args:
        sentence: iterable of string tokens (e.g. the list built by jieba_cut).

    Returns:
        str: all emitted pieces joined by single spaces.
    '''
    pieces = []
    for token in sentence:
        if len(token) != 1:
            # Spell the token out character by character first.
            pieces.extend(token)
        pieces.append(token)
    return ' '.join(pieces)

def first_char_cut(sentence):
    '''Keep only the first character of every token.

    Args:
        sentence: iterable of non-empty string tokens.

    Returns:
        str: the first characters joined by single spaces.
    '''
    # For a one-character token, token[0] is the token itself.
    return ' '.join([token[0] for token in sentence])

def word_cut(sentence):
    '''Split ``sentence`` into its elements (characters, for a string) and join them with spaces.'''
    return ' '.join(sentence)

def replace_words(sentence):
    '''Apply every substitution in the local ``replace_dict`` to ``sentence``.

    The keys are applied one after another with ``str.replace`` in the dict's
    iteration order, so a later substitution also acts on text produced by an
    earlier one. Keys mapped to ``''`` are removed from the text.

    Args:
        sentence: text to clean; must provide ``.replace(old, new)``.

    Returns:
        The text after all substitutions have been applied.
    '''
    # The literal merges a typo/variant -> canonical form table, numeral
    # normalisation, and a stop-character list (entries mapped to '').
    # NOTE(review): some keys appear twice in the literal (e.g. '一', '哩', '嗳');
    # Python keeps the last value for a repeated key, so those end up mapped
    # to '' rather than to their first replacement - confirm this is intended.
    replace_dict = {'零':'0','一':'1','二':'2','三':'3','四':'4','五':'5','六':'6','七':'7','八':'8','九':'9','十':'10','叻': '了', '童': '通', '职': '值', '電': '电', '刪': '删', '宫': '营', '轲': '可', '兩': '两', '泽': '择', '拱': '供', '貝': '呗', '夠': '够', '罰': '罚', '嚒': '么', '涉': '设', '愈': '逾', '唄': '呗', '為': '为', '現': '现', '珊': '删', '酱': '降', '讷': '呐', '杞': '从', '竖': '公', '戗': '钱', '凊': '清', '陪': '赔', '嫌': '限', '鹅': '额', '聪': '充', '個': '个', '亿': '已', '蚂议': '蚂蚁', '陶': '淘', '伏': '付', '堡': '宝', '肔': '服', '巳': '已', '花坝': '花呗', '洼': '清', '甜': '填', '厉': '里', '渣': '咋', '纬': '为', '無': '无', '勇': '用', '扭': '钮', '压金': '押金', '绐': '给', '捉': '提', '喻': '逾', '換': '还', '胞': '宝', '調': '调', '抄': '超', '麽': '么', '歆': '完', '囗': '口', '卜': '不', '扮': '办', '氣': '气', '費': '费', '評': '评', '夫': '付', '猛': '能', '銀': '银', '枪': '清', '痛': '通', '喀': '额', '囙': '回', '筠': '运', '昰': '是', '吾': '我', '帳': '账', '玉': '与', '浓': '弄', '雪': '学', '螞蟻': '蚂蚁', '規': '规', '丕': '不', '時': '时', '花唄': '花呗', '梓': '么', '济': '机', '弍': '式', '貸': '贷', '繳': '交', '届': '借', '甪': '用', '蒋': '降', '欺': '期', '舍': '设', '妃': '为', '咔': '卡', '啤': '碑', '傲': '以', '俞': '逾', '田': '天', '剛': '刚', '怼': '对', '脚': '交', '怅': '账', '餮': '餐', '栓': '删', '揍': '款', '寶': '宝', '蝙': '变', '肖': '消', '剪': '减', '崔': '催', '榜': '绑', '扎': '咋', '圆': '元', '饯': '钱', '嘞': '了', '腐': '付', '辞': '迟', '昱': '里', '师': '是', '侍': '待', '睌': '晚', '宣': '删', '花背': '花呗', '購': '购', '慨': '概', '魔': '摩', '臂': '呗', '肿': '怎', '花贝': '花呗', '碼': '码', '茨': '款', '拳': '券', '乍': '咋', '証': '证', '歧': '期', '嘟': '都', '结呗': '借呗', '锤': '吹', '轻': '清', '厚': '后', '玏': '功', '乙': '已', '挷': '绑', '拦': '栏', '辟': '批', '讠': '之', '闹': '弄', '負': '付', '犹': '怀', '筘': '扣', '嗳': '爱', '說': '说', '扔': '仍', '花裁': '花', '吋': '时', '収': '收', '磕': '可', '給': '给', '腿': '退', '梆': '绑', '冬': '冻', '幼': '动', '炸': '咋', '經': '经', '骂': '吗', '欹': '款', '莉': '里', '叶': '页', '鍀': '的', '岀': '出', '欲': '逾', '花被': '花呗', '节清': '结清', '錢': '钱', '曰': '日', '戒备': '借呗', '灣': '湾', '贺': '和', '紅': '红', '幺': '么', '孒': '了', '別': '别', '涮': '刷', '歉': '欠', '泥': '呢', '額': '额', '栅': 
'删', '佝': '何', '壮': '状', '叹': '呗', '眷': '劵', '洋': '样', '蚂蚊': '蚂蚁', '哩': '里', '蚂蚱': '蚂蚁', '還': '还', '樣': '样', '杳': '查', '茌': '花', '卷': '券', '證': '证', '麼': '么', '佘': '余', '買': '买', '帝': '低', '胀': '账', '雨': '与', '花能': '花呗能', '崴': '为', '貨': '货', '丟': '丢', '開': '开', '叭': '呗', '昵': '呢', '祝': '况', '毙': '闭', '屎': '是', '佰': '百', '宴': '延', '幵': '开', '仟': '千', '來': '来', '挨': '爱', '祢': '你', '糸': '细', '颃': '用', '乳': '用', '借唄': '借呗', '唔': '客', '則': '则', '阔': '可', '叨': '嘛', '花多上': '花多少', '镀': '度', '刭': '到', '冯': '吗', '蔡': '才', '丽': '里', '減': '减', '狂': '款', '錯': '错', '匆': '充', '問': '问', '窃': '切', '贯': '关', '勒': '了', '颌': '额', '敗': '败', '咬': '要', '鈤': '日', '莪': '我', '腨': '用', '吵': '超', '篮': '蓝', '培': '赔', '粑': '把', '躲': '多', '嗎': '吗', '戶': '户', '毎': '每', '呮': '呗', '姑': '过', '胆': '但', '脱': '拖', '胃': '为', '剧': '刷', '吸': '息', '布': '不', '夂': '久', '栋': '冻', '淸': '清', '萌': '能', '愉': '逾', '請': '请', '卅': '啥', '堤': '提', '吱': '支', '禾': '何', '菅': '营', '渝': '逾', '侯': '候', '權': '权', '鼻': '比', '杜': '度', '嗨': '还', '踩': '才', '矿': '款', '珐': '法', 'ma': '吗', '花百': '花呗', '绊': '绑', '甬': '通', '車': '车', '叧': '另', '述': '诉', '査': '查', '瞪': '登', '機': '机', '啲': '的', '設': '设', '綁': '绑', '遲': '迟', '赞': '暂', '粤': '月', '驗': '验', '説': '说', '花花': '花', '坝': '呗', '发呗': '花呗', '虫': '了', '臨': '临', '笫': '第', '廷': '延', '琪': '期', '扥': '等', '谝': '骗', '倩': '欠', '挑': '调', '⑩': '', '/': '', '‘': '', '哩': '', '～±': '', '／': '', '｛': '', ';': '', '：': '', '％': '', '＝': '', '按': '', '喏': '', '>': '', '俺们': '', '》': '', 'Ⅲ': '', '蜂': '', '*': '', '日': '', '=': '', '⑦': '', '～': '', '）': '', ']': '', '。': '', ',': '', '“': '', '}': '', '逼': '', '＞': '', 'ＬＩ': '', '&': '', '(': '', "'": '', '哎哟': '', '数': '', 'sub': '', '！': '', '~': '', '@': '', '∈': '', '咦': '', '?': '', '喔唷': '', '⑤': '', '①': '', 'μ': '', '、': '', 'γ': '', '嗳': '', '』': '', '一': '', '②': '', 'Ｉ': '', 'В': '', '＃': '', '兮': '', '我': '', '［': '', '＋': '', '把': '', 'Ｌ': '', '<': '', '俺': '', '──': '', '⑧': '', '[': '', '④': '', 'sup': '', '哎呀': '', '；': '', '哎': 
'', 'Ｂ': '', '÷': '', '呗': '', '阿': '', '喔': '', '蚂': '', 'ａ': '', '#': '', 'Ｆ': '', '〔': '', ':': '', '吧': '', '丫': '', '嗯': '', '的': '', '■': '', '”': '', 'Ｔ': '', '＇': '', '《': '', '啐': '', '也': '', '嗬': '', '㈧': '', 'φ': '', '"': '', '↑': '', 'Δ': '', 'Ψ': '', '℃': '', '⑥': '', '〕': '', '+': '', 'exp': '', '＆': '', '罢了': '', '·': '', '″': '', '—': '', '１': '', '〉': '', '...': '', '＊': '', 'Lex': '', '＿': '', '蚁': '', '啊': '', '『': '', '．': '', '【': '', '呃': '', '［－': '', '▲': '', '，': '', '’': '', '｝': '', '（': '', '－': '', '】': '', '×': '', '咧': '', '.': '', '了': '', ')': '', 'Ａ': '', 'Ｘ': '', '咚': '', '］': '', '？': '', '→': '', '③': '', '＜': '', '吧哒': '', '按照': '', '唉': '', '＞λ': '', '———': '', '-': '', 'Ｚ': '', '⑨': '', '～＋': '', '呐': ''}
    # Sequential, order-dependent substring replacement.
    for key, value in replace_dict.items():
        sentence = sentence.replace(key, value)
    return sentence

In [4]:
time_1 = time.time()
replace_dict = {'零':'0','一':'1','二':'2','三':'3','四':'4','五':'5','六':'6','七':'7','八':'8','九':'9','十':'10','叻': '了', '童': '通', '职': '值', '電': '电', '刪': '删', '宫': '营', '轲': '可', '兩': '两', '泽': '择', '拱': '供', '貝': '呗', '夠': '够', '罰': '罚', '嚒': '么', '涉': '设', '愈': '逾', '唄': '呗', '為': '为', '現': '现', '珊': '删', '酱': '降', '讷': '呐', '杞': '从', '竖': '公', '戗': '钱', '凊': '清', '陪': '赔', '嫌': '限', '鹅': '额', '聪': '充', '個': '个', '亿': '已', '蚂议': '蚂蚁', '陶': '淘', '伏': '付', '堡': '宝', '肔': '服', '巳': '已', '花坝': '花呗', '洼': '清', '甜': '填', '厉': '里', '渣': '咋', '纬': '为', '無': '无', '勇': '用', '扭': '钮', '压金': '押金', '绐': '给', '捉': '提', '喻': '逾', '換': '还', '胞': '宝', '調': '调', '抄': '超', '麽': '么', '歆': '完', '囗': '口', '卜': '不', '扮': '办', '氣': '气', '費': '费', '評': '评', '夫': '付', '猛': '能', '銀': '银', '枪': '清', '痛': '通', '喀': '额', '囙': '回', '筠': '运', '昰': '是', '吾': '我', '帳': '账', '玉': '与', '浓': '弄', '雪': '学', '螞蟻': '蚂蚁', '規': '规', '丕': '不', '時': '时', '花唄': '花呗', '梓': '么', '济': '机', '弍': '式', '貸': '贷', '繳': '交', '届': '借', '甪': '用', '蒋': '降', '欺': '期', '舍': '设', '妃': '为', '咔': '卡', '啤': '碑', '傲': '以', '俞': '逾', '田': '天', '剛': '刚', '怼': '对', '脚': '交', '怅': '账', '餮': '餐', '栓': '删', '揍': '款', '寶': '宝', '蝙': '变', '肖': '消', '剪': '减', '崔': '催', '榜': '绑', '扎': '咋', '圆': '元', '饯': '钱', '嘞': '了', '腐': '付', '辞': '迟', '昱': '里', '师': '是', '侍': '待', '睌': '晚', '宣': '删', '花背': '花呗', '購': '购', '慨': '概', '魔': '摩', '臂': '呗', '肿': '怎', '花贝': '花呗', '碼': '码', '茨': '款', '拳': '券', '乍': '咋', '証': '证', '歧': '期', '嘟': '都', '结呗': '借呗', '锤': '吹', '轻': '清', '厚': '后', '玏': '功', '乙': '已', '挷': '绑', '拦': '栏', '辟': '批', '讠': '之', '闹': '弄', '負': '付', '犹': '怀', '筘': '扣', '嗳': '爱', '說': '说', '扔': '仍', '花裁': '花', '吋': '时', '収': '收', '磕': '可', '給': '给', '腿': '退', '梆': '绑', '冬': '冻', '幼': '动', '炸': '咋', '經': '经', '骂': '吗', '欹': '款', '莉': '里', '叶': '页', '鍀': '的', '岀': '出', '欲': '逾', '花被': '花呗', '节清': '结清', '錢': '钱', '曰': '日', '戒备': '借呗', '灣': '湾', '贺': '和', '紅': '红', '幺': '么', '孒': '了', '別': '别', '涮': '刷', '歉': '欠', '泥': '呢', '額': '额', '栅': '删', 
'佝': '何', '壮': '状', '叹': '呗', '眷': '劵', '洋': '样', '蚂蚊': '蚂蚁', '哩': '里', '蚂蚱': '蚂蚁', '還': '还', '樣': '样', '杳': '查', '茌': '花', '卷': '券', '證': '证', '麼': '么', '佘': '余', '買': '买', '帝': '低', '胀': '账', '雨': '与', '花能': '花呗能', '崴': '为', '貨': '货', '丟': '丢', '開': '开', '叭': '呗', '昵': '呢', '祝': '况', '毙': '闭', '屎': '是', '佰': '百', '宴': '延', '幵': '开', '仟': '千', '來': '来', '挨': '爱', '祢': '你', '糸': '细', '颃': '用', '乳': '用', '借唄': '借呗', '唔': '客', '則': '则', '阔': '可', '叨': '嘛', '花多上': '花多少', '镀': '度', '刭': '到', '冯': '吗', '蔡': '才', '丽': '里', '減': '减', '狂': '款', '錯': '错', '匆': '充', '問': '问', '窃': '切', '贯': '关', '勒': '了', '颌': '额', '敗': '败', '咬': '要', '鈤': '日', '莪': '我', '腨': '用', '吵': '超', '篮': '蓝', '培': '赔', '粑': '把', '躲': '多', '嗎': '吗', '戶': '户', '毎': '每', '呮': '呗', '姑': '过', '胆': '但', '脱': '拖', '胃': '为', '剧': '刷', '吸': '息', '布': '不', '夂': '久', '栋': '冻', '淸': '清', '萌': '能', '愉': '逾', '請': '请', '卅': '啥', '堤': '提', '吱': '支', '禾': '何', '菅': '营', '渝': '逾', '侯': '候', '權': '权', '鼻': '比', '杜': '度', '嗨': '还', '踩': '才', '矿': '款', '珐': '法', 'ma': '吗', '花百': '花呗', '绊': '绑', '甬': '通', '車': '车', '叧': '另', '述': '诉', '査': '查', '瞪': '登', '機': '机', '啲': '的', '設': '设', '綁': '绑', '遲': '迟', '赞': '暂', '粤': '月', '驗': '验', '説': '说', '花花': '花', '坝': '呗', '发呗': '花呗', '虫': '了', '臨': '临', '笫': '第', '廷': '延', '琪': '期', '扥': '等', '谝': '骗', '倩': '欠', '挑': '调', '⑩': '', '/': '', '‘': '', '哩': '', '～±': '', '／': '', '｛': '', ';': '', '：': '', '％': '', '＝': '', '按': '', '喏': '', '>': '', '俺们': '', '》': '', 'Ⅲ': '', '蜂': '', '*': '', '日': '', '=': '', '⑦': '', '～': '', '）': '', ']': '', '。': '', ',': '', '“': '', '}': '', '逼': '', '＞': '', 'ＬＩ': '', '&': '', '(': '', "'": '', '哎哟': '', '数': '', 'sub': '', '！': '', '~': '', '@': '', '∈': '', '咦': '', '?': '', '喔唷': '', '⑤': '', '①': '', 'μ': '', '、': '', 'γ': '', '嗳': '', '』': '', '一': '', '②': '', 'Ｉ': '', 'В': '', '＃': '', '兮': '', '我': '', '［': '', '＋': '', '把': '', 'Ｌ': '', '<': '', '俺': '', '──': '', '⑧': '', '[': '', '④': '', 'sup': '', '哎呀': '', '；': '', '哎': '', 
'Ｂ': '', '÷': '', '呗': '', '阿': '', '喔': '', '蚂': '', 'ａ': '', '#': '', 'Ｆ': '', '〔': '', ':': '', '吧': '', '丫': '', '嗯': '', '的': '', '■': '', '”': '', 'Ｔ': '', '＇': '', '《': '', '啐': '', '也': '', '嗬': '', '㈧': '', 'φ': '', '"': '', '↑': '', 'Δ': '', 'Ψ': '', '℃': '', '⑥': '', '〕': '', '+': '', 'exp': '', '＆': '', '罢了': '', '·': '', '″': '', '—': '', '１': '', '〉': '', '...': '', '＊': '', 'Lex': '', '＿': '', '蚁': '', '啊': '', '『': '', '．': '', '【': '', '呃': '', '［－': '', '▲': '', '，': '', '’': '', '｝': '', '（': '', '－': '', '】': '', '×': '', '咧': '', '.': '', '了': '', ')': '', 'Ａ': '', 'Ｘ': '', '咚': '', '］': '', '？': '', '→': '', '③': '', '＜': '', '吧哒': '', '按照': '', '唉': '', '＞λ': '', '———': '', '-': '', 'Ｚ': '', '⑨': '', '～＋': '', '呐': ''}
# Clean-up works on substrings, so it goes through replace_words; Series.map or
# DataFrame.replace with a plain dict would only match whole cell values.
all_data['question1'] = all_data['question1'].apply(replace_words)
all_data['question2'] = all_data['question2'].apply(replace_words)
print('替换词: ', time.time() - time_1)

# Word segmentation of the cleaned questions.
time_15 = time.time()
all_data['jieba_q1'] = all_data['question1'].apply(jieba_cut)
all_data['jieba_q2'] = all_data['question2'].apply(jieba_cut)
print('jieba分词：', time.time() - time_15)

# Characters plus multi-character words, space-joined.
time_2 = time.time()
all_data['jieba_word_cut_q1'] = all_data['jieba_q1'].apply(jieba_word_cut)
all_data['jieba_word_cut_q2'] = all_data['jieba_q2'].apply(jieba_word_cut)
print('分字+词: ', time.time() - time_2)

# First character of every segmented word.
time_3 = time.time()
all_data['first_char_q1'] = all_data['jieba_q1'].apply(first_char_cut)
all_data['first_char_q2'] = all_data['jieba_q2'].apply(first_char_cut)
print('单词首字: ', time.time() - time_3)

# Plain character split of the cleaned questions.
time_4 = time.time()
all_data['word_cut_q1'] = all_data['question1'].apply(word_cut)
all_data['word_cut_q2'] = all_data['question2'].apply(word_cut)
print('分字: ', time.time() - time_4)

print('总时间：', time.time() - time_1)
all_data.head()

替换词:  78.54145288467407
jieba分词： 42.33256936073303
分字+词:  2.2613556385040283
单词首字:  1.2675347328186035
分字:  1.0780537128448486
总时间： 125.48151326179504


Unnamed: 0,id,question1,question2,is_duplicate,jieba_q1,jieba_q2,jieba_word_cut_q1,jieba_word_cut_q2,first_char_q1,first_char_q2,word_cut_q1,word_cut_q2
0,1,要取消预约理财,怎么取消定期理财,0,"[要, 取消, 预约, 理财]","[怎么, 取消, 定期, 理财]",要 取 消 取消 预 约 预约 理 财 理财,怎 么 怎么 取 消 取消 定 期 定期 理 财 理财,要 取 预 理,怎 取 定 理,要 取 消 预 约 理 财,怎 么 取 消 定 期 理 财
1,2,银行卡密码正确但是错误,密码输入正确提示错误,1,"[银行卡, 密码, 正确, 但是, 错误]","[密码, 输入, 正确, 提示, 错误]",银 行 卡 银行卡 密 码 密码 正 确 正确 但 是 但是 错 误 错误,密 码 密码 输 入 输入 正 确 正确 提 示 提示 错 误 错误,银 密 正 但 错,密 输 正 提 错,银 行 卡 密 码 正 确 但 是 错 误,密 码 输 入 正 确 提 示 错 误
2,3,自动还款是扣余额宝吗,余额宝有钱还款当自动扣钱吗,1,"[自动, 还款, 是, 扣, 余额宝, 吗]","[余额宝, 有钱, 还款, 当, 自动, 扣钱, 吗]",自 动 自动 还 款 还款 是 扣 余 额 宝 余额宝 吗,余 额 宝 余额宝 有 钱 有钱 还 款 还款 当 自 动 自动 扣 钱 扣钱 吗,自 还 是 扣 余 吗,余 有 还 当 自 扣 吗,自 动 还 款 是 扣 余 额 宝 吗,余 额 宝 有 钱 还 款 当 自 动 扣 钱 吗
3,4,赔付运费险什么时候到账,退货什么时候退运费给,0,"[赔付, 运费, 险, 什么, 时候, 到, 账]","[退货, 什么, 时候, 退, 运费, 给]",赔 付 赔付 运 费 运费 险 什 么 什么 时 候 时候 到 账,退 货 退货 什 么 什么 时 候 时候 退 运 费 运费 给,赔 运 险 什 时 到 账,退 什 时 退 运 给,赔 付 运 费 险 什 么 时 候 到 账,退 货 什 么 时 候 退 运 费 给
4,5,余额宝转不出来吗,为什么余额宝转不进去,0,"[余额宝, 转, 不, 出来, 吗]","[为什么, 余额宝, 转, 不, 进去]",余 额 宝 余额宝 转 不 出 来 出来 吗,为 什 么 为什么 余 额 宝 余额宝 转 不 进 去 进去,余 转 不 出 吗,为 余 转 不 进,余 额 宝 转 不 出 来 吗,为 什 么 余 额 宝 转 不 进 去


In [5]:
'''IF-IDF'''
# Corpus for the TF-IDF / count vectorizers: question1 and question2 of every
# row, interleaved in row order.
# NOTE(review): get_stopwords and the 'cut_question1' / 'cut_question2' columns
# are not created by any cell shown in this notebook - confirm where they come from.
stopwords = get_stopwords()
text_data = []
for first_text, second_text in zip(all_data['cut_question1'], all_data['cut_question2']):
    text_data.extend((first_text, second_text))

In [None]:
text_data[:5]

In [7]:
# term-count matrix
# Document-term count matrix over text_data (one row per sentence).
count_vectorizer = CountVectorizer(
    input='content',
    analyzer='word',
    stop_words=stopwords,
    lowercase=False,
)
count_matrix = count_vectorizer.fit_transform(text_data)
count_matrix.shape

(204954, 12098)

In [16]:
# term-tfidf matrix
# Document-term TF-IDF matrix over the same corpus, with sublinear term frequency.
tfidf_vectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    stop_words=stopwords,
    lowercase=False,
    use_idf=True,
    sublinear_tf=True,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

In [17]:
'''Latent Semantic Analysis'''
lsa = TruncatedSVD(n_components=200, n_iter=10, random_state=123)
count_lsa_vector = lsa.fit_transform(count_matrix)
idf_lsa_vector = lsa.fit_transform(tfidf_matrix)

In [5]:
# 特征: 计算（s1的字同时在s2中也出现和s2的字同时在s1中也出现）的比例
def shared_word_proportion(row):
    '''Share of token occurrences that belong to tokens present on both sides.

    Args:
        row: mapping with space-separated token strings under
            'jieba_word_cut_q1' and 'jieba_word_cut_q2'.

    Returns:
        float: (occurrences in q1 of tokens also in q2 + occurrences in q2 of
        tokens also in q1) / (all occurrences in q1 and q2); 0.0 when both
        sides are empty.
    '''
    left = Counter(row['jieba_word_cut_q1'].split())
    right = Counter(row['jieba_word_cut_q2'].split())
    total = sum(left.values()) + sum(right.values())
    if total < 1e-6:
        return 0.
    shared_left = sum(count for token, count in left.items() if token in right)
    shared_right = sum(count for token, count in right.items() if token in left)
    return 1.0 * (shared_left + shared_right) / total

In [6]:
# No raw=True here: shared_word_proportion reads columns by name, so each row
# must arrive as a labelled Series rather than a bare ndarray.
all_data['shared_word'] = all_data.apply(shared_word_proportion, axis=1)

In [7]:
# 计算所有词的tfidf值
def init_idf(data):
    '''Build a token -> log2 inverse-frequency table from the word-level columns.

    Every distinct sentence string in 'jieba_word_cut_q1' / 'jieba_word_cut_q2'
    is processed once; each token occurrence in it adds 1 to that token's count.
    The weight is log2(len(data) / (count + 1)).

    NOTE(review): the numerator is the number of rows in ``data`` while counts
    are accumulated over distinct sentences and over repeated occurrences, so
    frequent tokens can receive a negative weight - confirm this is intended.

    Args:
        data: DataFrame holding the two columns named above.

    Returns:
        dict: token -> weight.
    '''
    occurrences = {}
    seen_sentences = set()
    for _, record in data.iterrows():
        for column in ('jieba_word_cut_q1', 'jieba_word_cut_q2'):
            sentence = record[column]
            if sentence in seen_sentences:
                continue
            seen_sentences.add(sentence)
            for token in sentence.split():
                occurrences[token] = occurrences.get(token, 0) + 1
    total_rows = len(data)
    return {
        token: math.log(total_rows / (count + 1.)) / math.log(2.)
        for token, count in occurrences.items()
    }
# Global weight table read by tfidf_shared_word and tfidf_dif.
# NOTE(review): a later cell redefines init_idf on the word_cut_* columns and
# rebinds `idf`, so those features depend on which cell was run last.
idf = init_idf(all_data)

In [9]:
# 特征：共现词的tfidf比例（跟上面shared_word_proportion类似）
def tfidf_shared_word(row):
    '''IDF-weighted variant of shared_word_proportion.

    Each token contributes (its count in the sentence) * idf[token]; tokens
    missing from the global ``idf`` table weigh 0.

    Args:
        row: mapping with space-separated token strings under
            'jieba_word_cut_q1' and 'jieba_word_cut_q2'.

    Returns:
        float: weight of tokens present on both sides divided by the total
        weight of both sentences; 0.0 when the total weight is below 1e-6.
    '''
    left = Counter(row['jieba_word_cut_q1'].split())
    right = Counter(row['jieba_word_cut_q2'].split())
    left_weight = {token: count * idf.get(token, 0) for token, count in left.items()}
    right_weight = {token: count * idf.get(token, 0) for token, count in right.items()}
    total = sum(left_weight.values()) + sum(right_weight.values())
    if total < 1e-6:
        return 0.
    shared_left = sum(weight for token, weight in left_weight.items() if token in right)
    shared_right = sum(weight for token, weight in right_weight.items() if token in left)
    return 1.0 * (shared_left + shared_right) / total

In [10]:
# No raw=True: tfidf_shared_word indexes the row by column name and therefore
# needs a labelled Series.
all_data['tfidf_shared'] = all_data.apply(tfidf_shared_word, axis=1)

In [11]:
# 特征：两个句子的tfidf总和的差值
def tfidf_dif(row):
    '''Absolute difference between the two sentences' total IDF weight.

    A sentence's weight is the sum over its distinct tokens of
    count * idf[token], with 0 for tokens missing from the global ``idf``.

    Args:
        row: mapping with space-separated token strings under
            'jieba_word_cut_q1' and 'jieba_word_cut_q2'.

    Returns:
        The non-negative difference of the two weights.
    '''
    def sentence_weight(text):
        counts = Counter(text.split())
        return sum([count * idf.get(token, 0) for token, count in counts.items()])

    return abs(sentence_weight(row['jieba_word_cut_q1']) - sentence_weight(row['jieba_word_cut_q2']))

In [12]:
# No raw=True: tfidf_dif indexes the row by column name and therefore needs a
# labelled Series.
all_data['tfidf_dif'] = all_data.apply(tfidf_dif, axis=1)

In [13]:
# 特征：长度特征
# Token counts: word_len* counts segmented words (one first-char per word),
# char_len* counts characters.
for feature_name, source_column in (
    ('word_len1', 'first_char_q1'),
    ('word_len2', 'first_char_q2'),
    ('char_len1', 'word_cut_q1'),
    ('char_len2', 'word_cut_q2'),
):
    all_data[feature_name] = all_data[source_column].apply(lambda text: len(text.split()))

In [14]:
# 特征：两个句子长度差
def length_dif(row):
    '''Absolute difference in token count between 'word_cut_q1' and 'word_cut_q2'.'''
    return abs(len(row['word_cut_q1'].split()) - len(row['word_cut_q2'].split()))

In [15]:
# No raw=True: length_dif indexes the row by column name and therefore needs a
# labelled Series.
all_data['length_dif'] = all_data.apply(length_dif, axis=1)

In [16]:
# 特征：两个句子的长度差比例
def length_dif_rate(row):
    '''Ratio of the shorter to the longer token count of the two sentences.

    Args:
        row: mapping with space-separated strings under 'word_cut_q1' and
            'word_cut_q2'.

    Returns:
        float: min(len) / max(len), i.e. 1.0 for equal lengths; 0.0 when both
        sentences are empty.
    '''
    shorter, longer = sorted((len(row['word_cut_q1'].split()), len(row['word_cut_q2'].split())))
    if longer < 1e-6:
        return 0.0
    return 1.0 * shorter / longer

In [17]:
# No raw=True: length_dif_rate indexes the row by column name and therefore
# needs a labelled Series.
all_data['length_dif_rate'] = all_data.apply(length_dif_rate, axis=1)

In [18]:
# 特征：共现词个数
def common_chars(row):
    '''Number of distinct tokens that occur in both 'word_cut_q1' and 'word_cut_q2'.'''
    return len(set(row['word_cut_q1'].split()) & set(row['word_cut_q2'].split()))

In [19]:
# No raw=True: common_chars indexes the row by column name and therefore needs
# a labelled Series.
all_data['common_words'] = all_data.apply(common_chars, axis=1)

In [20]:
# 特征：莱文斯顿距离
def levenshtein_dist(row):
    '''Levenshtein (edit) distance between the token lists of the two sentences.

    Args:
        row: mapping with space-separated strings under 'word_cut_q1' and
            'word_cut_q2'.

    Returns:
        float: minimum number of token insertions, deletions and substitutions
        turning one sentence into the other. Always a float, including when one
        sentence is empty (the distance is then the other sentence's length).
    '''
    longer = row['word_cut_q1'].split()
    shorter = row['word_cut_q2'].split()
    if len(longer) < len(shorter):
        longer, shorter = shorter, longer

    if not shorter:
        # Same numeric value as before, but as a float like every other return.
        return 1.0 * len(longer)

    # Two-row dynamic programme; each row has len(shorter) + 1 cells.
    previous_row = list(range(len(shorter) + 1))
    for i, c1 in enumerate(longer):
        current_row = [i + 1]
        for j, c2 in enumerate(shorter):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return 1.0 * previous_row[-1]

In [21]:
# No raw=True: levenshtein_dist indexes the row by column name and therefore
# needs a labelled Series.
all_data['levenshtein'] = all_data.apply(levenshtein_dist, axis=1)

In [22]:
# 特征：否定词出现情况，一个有否定词，另一个没有，则是不相似的
def neg_word(row):
    '''Flag whether the two sentences agree on containing a negation token.

    Args:
        row: mapping with space-separated token strings under
            'jieba_word_cut_q1' and 'jieba_word_cut_q2'.

    Returns:
        int: 1 when both sentences contain at least one negation token or
        neither does, 0 when exactly one of them does.
    '''
    neg_words = {'不','非','无','未','不曾','没','没有','别','勿','请勿','不用','无须','并非','毫无','决不','休想','永不','不要','未尝','未曾','毋','莫','从不','从未','从未有过','尚未','一无','并未','尚无','从来不','从没','绝非','远非','切莫','永不','休想','绝不','毫不','不必','禁止','忌','拒绝','杜绝','否','弗','木有'}
    has_neg_q1 = not neg_words.isdisjoint(row['jieba_word_cut_q1'].split())
    has_neg_q2 = not neg_words.isdisjoint(row['jieba_word_cut_q2'].split())
    return 1 if has_neg_q1 == has_neg_q2 else 0

In [23]:
# No raw=True: neg_word indexes the row by column name and therefore needs a
# labelled Series.
all_data['neg_word'] = all_data.apply(neg_word, axis=1)

In [24]:
# 特征：数字出现情况，一个有数字，另一个没数字，则不相似
def digit_in_sent(row):
    '''Count digit runs in both questions and flag their joint presence.

    Args:
        row: mapping with the question strings under 'question1' and 'question2'.

    Returns:
        list: [digit runs in question1, digit runs in question2,
        1 if both contain a digit run else 0,
        1 if at least one contains a digit run else 0].
    '''
    s1_count = len(re.findall(r'\d+', row['question1']))
    s2_count = len(re.findall(r'\d+', row['question2']))
    both_have = int(s1_count > 0 and s2_count > 0)
    either_has = int(s1_count > 0 or s2_count > 0)
    return [s1_count, s2_count, both_have, either_has]

In [25]:
# No raw=True: digit_in_sent indexes the row by column name and therefore needs
# a labelled Series. Each cell of the new column holds the returned list.
all_data['digit_in_sent'] = all_data.apply(digit_in_sent, axis=1)

In [26]:
# 特征：只取出词语的第一个字，来计算相似度
def first_char_shared(row):
    '''Number of distinct first-characters present in both 'first_char_q1' and 'first_char_q2'.'''
    return len(set(row['first_char_q1'].split()) & set(row['first_char_q2'].split()))
def first_char_jaccard(row):
    '''Jaccard similarity of the two sentences' first-character sets.

    Args:
        row: mapping with space-separated strings under 'first_char_q1' and
            'first_char_q2'.

    Returns:
        float: |intersection| / |union|; 0.0 when both sets are empty (the
        ratio is undefined there and used to raise ZeroDivisionError).
    '''
    s1 = set(row['first_char_q1'].split())
    s2 = set(row['first_char_q2'].split())
    union = s1 | s2
    if not union:
        # Both sentences were empty, e.g. after stop-character removal.
        return 0.0
    return len(s1 & s2) / len(union)

In [27]:
# No raw=True: both functions index the row by column name and therefore need
# a labelled Series.
all_data['1st_shared'] = all_data.apply(first_char_shared, axis=1)
all_data['1st_jaccard'] = all_data.apply(first_char_jaccard, axis=1)

In [28]:
def generate_powerful_word(data):
    """
    Measure how strongly each token is associated with the pair label.

    For every token the result holds a 7-element list:
      [0] number of sentence pairs containing the token (either side)
      [1] [0] divided by the number of pairs in ``data``
      [2] fraction of those pairs that are "correct": the token is on one side
          only and the label is 0, or on both sides and the label is 1
      [3] fraction of those pairs where the token is on exactly one side
      [4] among one-sided pairs, fraction with label 0
      [5] fraction of those pairs where the token is on both sides
      [6] among two-sided pairs, fraction with label 1

    Args:
        data: DataFrame with 'is_duplicate', 'jieba_word_cut_q1' and
            'jieba_word_cut_q2' columns.

    Returns:
        list of (token, stats) tuples sorted by stats[0], largest first.
    """
    stats_by_word = {}
    for _, record in data.iterrows():
        label = int(record['is_duplicate'])
        left_tokens = record['jieba_word_cut_q1'].split()
        right_tokens = record['jieba_word_cut_q2'].split()
        pair_vocab = set(left_tokens + right_tokens)
        left = set(left_tokens)
        right = set(right_tokens)
        for word in pair_vocab:
            stats = stats_by_word.setdefault(word, [0.] * 7)
            # Raw counters first; they are turned into ratios after the loop.
            stats[0] += 1.
            stats[1] += 1.
            in_left = word in left
            in_right = word in right
            if in_left != in_right:
                # Token occurs on exactly one side of the pair.
                stats[3] += 1.
                if label == 0:
                    stats[2] += 1.
                    stats[4] += 1.
            if in_left and in_right:
                # Token occurs on both sides of the pair.
                stats[5] += 1.
                if label == 1:
                    stats[2] += 1.
                    stats[6] += 1.

    pair_total = len(data)
    for stats in stats_by_word.values():
        stats[1] /= pair_total
        stats[2] /= stats[0]
        # Conditional rates must be taken while [3] / [5] are still raw counts.
        if stats[3] > 1e-6:
            stats[4] /= stats[3]
        stats[3] /= stats[0]
        if stats[5] > 1e-6:
            stats[6] /= stats[5]
        stats[5] /= stats[0]
    return sorted(stats_by_word.items(), key=lambda item: item[1][0], reverse=True)

# power_words is a list of (token, stats) tuples sorted by pair count, not a dict.
power_words = generate_powerful_word(all_data)

In [29]:
len(power_words)

38535

In [30]:
def init_powerful_word_dside(pword, thresh_num, thresh_rate):
    '''Select tokens that are frequent and label-predictive when on both sides.

    Args:
        pword: iterable of (token, stats) pairs as built by generate_powerful_word.
        thresh_num: minimum value of stats[0] * stats[5].
        thresh_rate: minimum value of stats[6].

    Returns:
        list: qualifying tokens ordered by stats[6], largest first.
    '''
    frequent = [item for item in pword if item[1][0] * item[1][5] >= thresh_num]
    frequent.sort(key=lambda item: item[1][6], reverse=True)
    return [item[0] for item in frequent if item[1][6] >= thresh_rate]
# Two-sided power words: stats[0] * stats[5] of at least 20000 and a two-sided
# label-1 rate (stats[6]) of at least 0.4.
power_word_dside = init_powerful_word_dside(power_words, 20000, 0.4)

In [31]:
len(power_word_dside)

27

In [32]:
def power_dside(row):
    '''One indicator per two-sided power word: 1.0 if it occurs in both sentences.

    Args:
        row: mapping with space-separated token strings under
            'jieba_word_cut_q1' and 'jieba_word_cut_q2'.

    Returns:
        list of float, aligned with the global ``power_word_dside`` list.
    '''
    present_in_both = set(row['jieba_word_cut_q1'].split()) & set(row['jieba_word_cut_q2'].split())
    return [1.0 if word in present_in_both else 0.0 for word in power_word_dside]

In [33]:
# No raw=True: power_dside indexes the row by column name and therefore needs a
# labelled Series. Each cell of the new column holds the returned list.
all_data['power_dside'] = all_data.apply(power_dside, axis=1)

In [34]:
def init_powerful_word_oside(pword, thresh_num, thresh_rate):
    '''Select tokens that are frequent and label-predictive when on one side only.

    Args:
        pword: iterable of (token, stats) pairs as built by generate_powerful_word.
        thresh_num: minimum value of stats[0] * stats[3].
        thresh_rate: minimum value of stats[4].

    Returns:
        list: qualifying tokens in the order they appear in ``pword``.
    '''
    return [
        item[0]
        for item in pword
        if item[1][0] * item[1][3] >= thresh_num and item[1][4] >= thresh_rate
    ]
# One-sided power words: stats[0] * stats[3] of at least 30000 and a one-sided
# label-0 rate (stats[4]) of at least 0.5.
power_word_oside = init_powerful_word_oside(power_words, 30000, 0.5)

In [35]:
len(power_word_oside)

28

In [36]:
def power_oside(row):
    '''One indicator per one-sided power word: 1.0 if it occurs in exactly one sentence.

    Args:
        row: mapping with space-separated token strings under
            'jieba_word_cut_q1' and 'jieba_word_cut_q2'.

    Returns:
        list of float, aligned with the global ``power_word_oside`` list.
    '''
    q1_words = set(row['jieba_word_cut_q1'].split())
    q2_words = set(row['jieba_word_cut_q2'].split())
    return [
        1.0 if (word in q1_words) != (word in q2_words) else 0.0
        for word in power_word_oside
    ]

In [37]:
# No raw=True: power_oside indexes the row by column name and therefore needs a
# labelled Series. Each cell of the new column holds the returned list.
all_data['power_oside'] = all_data.apply(power_oside, axis=1)

In [42]:
def pword_dside_rate(row):
    '''Combined two-sided evidence that the pair is a duplicate.

    For every token shared by both sentences whose two-sided pair count
    (stats[0] * stats[5]) is at least 300, the probabilities (1 - stats[6])
    are multiplied together; the feature is one minus that product.

    Args:
        row: mapping with space-separated token strings under
            'jieba_word_cut_q1' and 'jieba_word_cut_q2'.

    Returns:
        list: single-element list [rate]; [0.0] when no shared token qualifies.
    '''
    num_least = 300
    # power_words is a list of (token, stats) pairs, so membership tests and
    # power_words[token] cannot be used on it directly (every token used to be
    # skipped). Build a dict view once and reuse it until power_words is rebound.
    cache = pword_dside_rate.__dict__
    if cache.get('_source') is not power_words:
        cache['_table'] = dict(power_words)
        cache['_source'] = power_words
    table = cache['_table']

    q1_words = set(row['jieba_word_cut_q1'].split())
    q2_words = set(row['jieba_word_cut_q2'].split())
    miss = 1.0
    for word in q1_words & q2_words:
        stats = table.get(word)
        if stats is None or stats[0] * stats[5] < num_least:
            continue
        miss *= (1.0 - stats[6])
    return [1 - miss]

In [43]:
# all_data['pword_dside_rate'] = all_data.apply(pword_dside_rate, axis=1, raw=True)

In [46]:
def pword_oside_rate(row):
    '''Combined one-sided evidence that the pair is not a duplicate.

    For every token found in exactly one of the two sentences whose one-sided
    pair count (stats[0] * stats[3]) is at least 300, the probabilities
    (1 - stats[4]) are multiplied together; the feature is one minus that
    product.

    Args:
        row: mapping with space-separated token strings under
            'jieba_word_cut_q1' and 'jieba_word_cut_q2'.

    Returns:
        list: single-element list [rate]; [0.0] when no token qualifies.
    '''
    num_least = 300
    # power_words is a list of (token, stats) pairs, so membership tests and
    # power_words[token] cannot be used on it directly (every token used to be
    # skipped). Build a dict view once and reuse it until power_words is rebound.
    cache = pword_oside_rate.__dict__
    if cache.get('_source') is not power_words:
        cache['_table'] = dict(power_words)
        cache['_source'] = power_words
    table = cache['_table']

    q1_words = set(row['jieba_word_cut_q1'].split())
    q2_words = set(row['jieba_word_cut_q2'].split())
    miss = 1.0
    # Symmetric difference: tokens present on one side only.
    for word in q1_words ^ q2_words:
        stats = table.get(word)
        if stats is None or stats[0] * stats[3] < num_least:
            continue
        miss *= (1.0 - stats[4])
    return [1 - miss]

In [None]:
# all_data['pword_oside_rate'] = all_data.apply(pword_oside_rate, axis=1, raw=True)

In [38]:
# 特征：TFIDF
def init_tfidf():
    '''Fit a TfidfVectorizer on all 'jieba_word_cut_q1' and 'jieba_word_cut_q2' texts of all_data.

    Returns:
        The fitted TfidfVectorizer (word analyzer, no lower-casing, IDF on,
        sublinear term frequency).
    '''
    corpus = all_data['jieba_word_cut_q1'].tolist() + all_data['jieba_word_cut_q2'].tolist()
    vectorizer = TfidfVectorizer(
        input='content',
        analyzer='word',
        lowercase=False,
        use_idf=True,
        sublinear_tf=True,
    )
    # Only the fitted state is needed; the transformed matrix is not kept.
    vectorizer.fit(pd.Series(corpus).astype(str))
    return vectorizer
tfidf = init_tfidf()

In [129]:
def tfidf_fs(row):
    '''Summary statistics of each sentence's TF-IDF vector.

    Args:
        row: mapping with the token strings under 'jieba_word_cut_q1' and
            'jieba_word_cut_q2'.

    Returns:
        list: [sum q1, sum q2, mean q1, mean q2, count q1, count q2], computed
        over the stored (non-zero) entries of each sentence's TF-IDF vector as
        produced by the global ``tfidf`` vectorizer.
    '''
    # Transform each sentence once and reuse the values for all three statistics.
    q1_values = tfidf.transform([str(row['jieba_word_cut_q1'])]).data
    q2_values = tfidf.transform([str(row['jieba_word_cut_q2'])]).data
    return [
        np.sum(q1_values),
        np.sum(q2_values),
        np.mean(q1_values),
        np.mean(q2_values),
        len(q1_values),
        len(q2_values),
    ]

In [None]:
# all_data['tfidf_fs'] = all_data.apply(tfidf_fs, axis=1, raw=True)

In [38]:
# 特征：句子在语料中的重复次数
def duplicate_num():
    '''Count in how many rows of all_data each question string appears.

    A row whose question1 equals its question2 counts that string once.

    Returns:
        dict: question string -> number of rows containing it.
    '''
    occurrences = {}
    for q1, q2 in zip(all_data['question1'], all_data['question2']):
        sentences = (q1,) if q1 == q2 else (q1, q2)
        for sentence in sentences:
            occurrences[sentence] = occurrences.get(sentence, 0) + 1
    return occurrences
dup_num = duplicate_num()

In [39]:
def duplicate_sent(row):
    '''Corpus frequencies of the two questions, looked up in the global ``dup_num``.

    Args:
        row: mapping with the question strings under 'question1' and 'question2'.

    Returns:
        list: [count of question1, count of question2, larger count, smaller count].
    '''
    counts = [dup_num[row['question1']], dup_num[row['question2']]]
    return counts + [max(counts), min(counts)]

In [40]:
# No raw=True: duplicate_sent indexes the row by column name and therefore
# needs a labelled Series. Each cell of the new column holds the returned list.
all_data['duplicate_sent'] = all_data.apply(duplicate_sent, axis=1)

In [41]:
# 距离计算函数！
def jaccard_coef(A, B):
    '''Jaccard coefficient |A & B| / |A | B| of two collections.

    Args:
        A, B: sets, or any iterables of hashable items (converted to sets).

    Returns:
        float: the coefficient in [0, 1]; 0.0 when both inputs are empty (the
        ratio is undefined there and used to raise ZeroDivisionError).
    '''
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    union_size = len(A | B)
    if union_size == 0:
        return 0.0
    return float(len(A & B) / union_size)

def dice_dist(A, B):
    '''Dice coefficient 2 * |A & B| / (|A| + |B|) of two collections.

    Args:
        A, B: sets, or any iterables of hashable items (converted to sets).

    Returns:
        float: the coefficient in [0, 1]; 0.0 when both inputs are empty (the
        ratio is undefined there and used to raise ZeroDivisionError).
    '''
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    size_total = len(A) + len(B)
    if size_total == 0:
        return 0.0
    return (2.0 * float(len(A & B))) / size_total


In [42]:
# 计算 N-gram.
def unigrams(words):
    '''Return ``words`` unchanged after checking that it is exactly a list.'''
    assert type(words) is list
    return words

def bigrams(words, join_string, skip=0):
    '''Build (skip-)bigrams from a token list.

    Args:
        words: list of string tokens.
        join_string: separator placed between the two tokens of a bigram.
        skip: maximum number of tokens that may be skipped between the two.

    Returns:
        list: joined bigrams in position order; for fewer than two tokens the
        unigram list is returned instead.
    '''
    assert type(words) == list
    size = len(words)
    if size <= 1:
        # Too short for a bigram: fall back to the unigrams.
        return unigrams(words)
    return [
        join_string.join([words[start], words[start + gap]])
        for start in range(size - 1)
        for gap in range(1, skip + 2)
        if start + gap < size
    ]

def trigrams(words, join_string, skip=0):
    '''Build (skip-)trigrams from a token list.

    Args:
        words: list of string tokens.
        join_string: separator placed between the tokens of a trigram.
        skip: maximum number of tokens that may be skipped between neighbours.

    Returns:
        list: joined trigrams in position order; for fewer than three tokens
        the bigram result is returned instead.
    '''
    assert type(words) == list
    size = len(words)
    if size <= 2:
        # Too short for a trigram: fall back to the bigrams.
        return bigrams(words, join_string, skip)
    grams = []
    for start in range(size - 2):
        for gap1 in range(1, skip + 2):
            middle = start + gap1
            for gap2 in range(1, skip + 2):
                last = middle + gap2
                # last < size already implies middle < size because gap2 >= 1.
                if last < size:
                    grams.append(join_string.join([words[start], words[middle], words[last]]))
    return grams

def fourgrams(words, join_string):
    '''Build contiguous four-grams from a token list.

    Args:
        words: list of string tokens.
        join_string: separator placed between the tokens of a four-gram.

    Returns:
        list: joined four-grams in position order; for fewer than four tokens
        the trigram result is returned instead.
    '''
    assert type(words) == list
    L = len(words)
    if L > 3:
        lst = []
        # range, not xrange: xrange does not exist on Python 3.
        for i in range(L - 3):
            lst.append(join_string.join([words[i], words[i + 1], words[i + 2], words[i + 3]]))
    else:
        # set it as trigram
        lst = trigrams(words, join_string)
    return lst

def ngrams(words, ngram, join_string=" "):
    '''Dispatch to the n-gram builder selected by ``ngram``.

    Args:
        words: list of string tokens.
        ngram: 1, 2, 3 or 4 for a single order; 12 for unigrams + bigrams;
            123 for unigrams + bigrams + trigrams.
        join_string: separator used inside each n-gram.

    Returns:
        list of n-grams, or None for any other ``ngram`` value. In the combined
        modes, only items that split into exactly 2 (resp. 3) parts on
        ``join_string`` are kept from the bigram (resp. trigram) result, which
        drops the lower-order fallbacks those builders return for short inputs.
    '''
    def exact_order(grams, order):
        return [gram for gram in grams if len(gram.split(join_string)) == order]

    if ngram == 1:
        return unigrams(words)
    if ngram == 2:
        return bigrams(words, join_string)
    if ngram == 3:
        return trigrams(words, join_string)
    if ngram == 4:
        return fourgrams(words, join_string)
    if ngram in (12, 123):
        combined = unigrams(words) + exact_order(bigrams(words, join_string), 2)
        if ngram == 123:
            combined = combined + exact_order(trigrams(words, join_string), 3)
        return combined
    return None

In [43]:
# Feature: n-gram Jaccard coefficient
def ngram_jaccard(row):
    """Return the jaccard_coef score of the pair's 1-, 2- and 3-grams.

    ``row`` is indexed by the keys 'jieba_word_cut_q1' and
    'jieba_word_cut_q2', each a whitespace-separated token string. The
    result is a 3-element list ordered by n-gram size.
    """
    tokens_q1 = row['jieba_word_cut_q1'].split()
    tokens_q2 = row['jieba_word_cut_q2'].split()
    return [
        jaccard_coef(ngrams(tokens_q1, order), ngrams(tokens_q2, order))
        for order in range(1, 4)
    ]

In [44]:
# ngram_jaccard looks columns up by name, so each row must arrive as a labelled
# Series; raw=True would hand it a bare ndarray instead.
all_data['ngram_jaccard'] = all_data.apply(ngram_jaccard, axis=1)

In [45]:
# Feature: n-gram Dice coefficient
def ngram_dice(row):
    """Return the dice_dist score of the pair's 1-, 2- and 3-grams.

    ``row`` is indexed by the keys 'jieba_word_cut_q1' and
    'jieba_word_cut_q2', each a whitespace-separated token string. The
    result is a 3-element list ordered by n-gram size.
    """
    tokens_q1 = row['jieba_word_cut_q1'].split()
    tokens_q2 = row['jieba_word_cut_q2'].split()
    return [
        dice_dist(ngrams(tokens_q1, order), ngrams(tokens_q2, order))
        for order in range(1, 4)
    ]

In [46]:
# ngram_dice looks columns up by name, so each row must arrive as a labelled
# Series; raw=True would hand it a bare ndarray instead.
all_data['ngram_dice'] = all_data.apply(ngram_dice, axis=1)

In [5]:
# Compute the IDF weight of every token
def init_idf(data):
    """Return a {token: idf} dict computed over the distinct questions in ``data``.

    Every distinct string found in the 'word_cut_q1' / 'word_cut_q2' columns
    counts as one document. A token's document frequency is the number of
    distinct questions containing it (repeats inside one question count
    once), and its weight is ``log2(num_docs / (df + 1))`` where ``num_docs``
    is the number of distinct questions.

    Using the same document set for numerator and denominator keeps the
    weights consistent: dividing the number of *pairs* by frequencies
    gathered over both columns can push common tokens to strongly negative
    weights.
    """
    doc_freq = {}
    seen_questions = set()
    # Walk the two columns in lockstep (cheaper than iterrows, same order).
    for q1, q2 in zip(data['word_cut_q1'], data['word_cut_q2']):
        for question in (q1, q2):
            if question in seen_questions:
                continue
            seen_questions.add(question)
            # dict.fromkeys de-duplicates tokens while keeping first-seen order.
            for word in dict.fromkeys(question.split()):
                doc_freq[word] = doc_freq.get(word, 0) + 1
    num_docs = len(seen_questions)
    return {word: math.log2(num_docs / (freq + 1.)) for word, freq in doc_freq.items()}
# Global token -> IDF lookup, read by the IDF-weighted embedding features below.
idf = init_idf(all_data)

In [6]:
def preTrainWordEmbedding():
    """Load the pretrained embeddings and return a {token: vector} dict.

    Reads './word_embedding/sgns.baidubaike.bigram-char' as UTF-8 text. Only
    lines that split into exactly 301 whitespace-separated fields (a token
    plus 300 components) are kept; any other line is skipped. Vectors are
    stored as float32 arrays.
    """
    embedding_dict = {}
    # The context manager closes the file even if parsing a line raises.
    with codecs.open('./word_embedding/sgns.baidubaike.bigram-char', 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if len(fields) == 301:
                embedding_dict[fields[0]] = np.array(fields[1:], dtype="float32")
    return embedding_dict

# Load every pretrained vector into memory once.
embedding_dict = preTrainWordEmbedding()
# Display how many tokens were loaded.
len(embedding_dict)

635788

In [7]:
# Build the vocabulary: map every token seen in either question to an integer
# id, assigned in order of first appearance.
word_dict = {}
for idx,row in all_data.iterrows():
    text = row['word_cut_q1'].split() + row['word_cut_q2'].split()
    for word in text:
        if word not in word_dict:
            word_dict[word] = len(word_dict)
# Display the vocabulary size.
len(word_dict)

2875

In [8]:
# Restrict the embedding table to the corpus vocabulary.
# Tokens missing from the pretrained table get a random 300-d vector. A
# dedicated seeded generator keeps those vectors, and every feature derived
# from them, identical across kernel restarts.
oov_rng = np.random.RandomState(2018)
load_count = 0  # number of vocabulary tokens found in the pretrained table
w2v_dict = {}
for word in word_dict:
    if word in embedding_dict:
        w2v_dict[word] = embedding_dict[word]
        load_count += 1
    else:
        w2v_dict[word] = oov_rng.randn(300)

In [51]:
# Display how many vocabulary tokens were found in the pretrained embeddings.
load_count

2776

In [15]:
# Feature: cosine similarity between the two sentence vectors, where a sentence
# vector is the sum of the word vectors of its tokens
def w2v_cos(row):
    """Return the cosine similarity of the summed word vectors of q1 and q2.

    ``row`` is indexed by 'word_cut_q1' and 'word_cut_q2' (whitespace-separated
    token strings). Tokens absent from ``w2v_dict`` are ignored. Returns 0.0
    when the product of the two norms is not greater than 1e-6 (e.g. an empty
    question).
    """
    def sum_vectors(words):
        total = np.zeros(300)
        for word in words:
            if word in w2v_dict:
                total = total + w2v_dict[word]
        return total

    q1_vec = sum_vectors(row['word_cut_q1'].split())
    q2_vec = sum_vectors(row['word_cut_q2'].split())

    factor = np.linalg.norm(q1_vec) * np.linalg.norm(q2_vec)
    if factor > 1e-6:
        # Plain 1-D dot product: avoids np.matrix and the deprecated
        # conversion of a 1x1 matrix to a Python float.
        return float(np.dot(q1_vec, q2_vec) / factor)
    return 0.

In [9]:
def sent_vec(row):
    """Return the q1 and q2 sentence vectors concatenated into one list.

    Each sentence vector is the sum of the 300-d ``w2v_dict`` vectors of the
    question's tokens (tokens missing from ``w2v_dict`` are ignored). The
    result has 600 entries: q1's vector followed by q2's vector.
    """
    def sum_vectors(words):
        total = np.zeros(300)
        for word in words:
            if word in w2v_dict:
                total += w2v_dict[word]
        return total

    q1_vec = sum_vectors(row['word_cut_q1'].split())
    q2_vec = sum_vectors(row['word_cut_q2'].split())
    # Second half must be q2's vector (q1 was previously emitted twice).
    return list(q1_vec) + list(q2_vec)

In [None]:
# WARNING: this feature exhausts memory (sent_vec returns 600 floats per row), so the cell is left disabled.
# all_data['sent_vec'] = all_data.apply(sent_vec, axis=1, raw=True)

In [16]:
# Feature: cosine similarity between the two sentence vectors, where a sentence
# vector is the IDF-weighted sum of the word vectors of its tokens
def w2v_idf_cos(row):
    """Return the cosine similarity of the IDF-weighted sentence vectors.

    ``row`` is indexed by 'word_cut_q1' and 'word_cut_q2' (whitespace-separated
    token strings). Every token occurrence contributes
    ``idf[token] * w2v_dict[token]``; tokens absent from ``w2v_dict`` are
    ignored and tokens absent from ``idf`` get weight 0. Returns 0.0 when the
    product of the two norms is not greater than 1e-6.
    """
    def weighted_sum(words):
        total = np.zeros(300)
        # Adding once per occurrence is the same as count * idf * vector.
        for word in words:
            if word in w2v_dict:
                total += idf.get(word, 0.) * w2v_dict[word]
        return total

    q1_vec = weighted_sum(row['word_cut_q1'].split())
    q2_vec = weighted_sum(row['word_cut_q2'].split())

    factor = np.linalg.norm(q1_vec) * np.linalg.norm(q2_vec)
    if factor > 1e-6:
        # Plain 1-D dot product: avoids np.matrix and the deprecated
        # conversion of a 1x1 matrix to a Python float.
        return float(np.dot(q1_vec, q2_vec) / factor)
    return 0.

In [17]:
def _unit_rows(vectors, dim):
    """Stack vectors as L2-normalised rows; None or zero vectors become zero rows."""
    rows = np.zeros((len(vectors), dim))
    for i, vec in enumerate(vectors):
        if vec is None:
            continue
        norm = np.linalg.norm(vec)
        if norm > 0:
            rows[i] = np.asarray(vec, dtype=float) / norm
    return rows


def _idf_weighted_mean(best_sims, weights):
    """IDF-weighted average of per-token best similarities; 0.0 if the weights sum to zero."""
    total = weights.sum()
    if total == 0:
        return 0.0
    return float(np.dot(best_sims, weights) / total)


def text_w2v_sim(row):
    """Symmetric IDF-weighted soft word-overlap between q1 and q2.

    For every token of one question, take its best cosine similarity to any
    token of the other question (floored at 0), average those values weighted
    by the token's IDF, and return the mean of the q1->q2 and q2->q1 scores.

    All pairwise similarities come from one matrix product of normalised
    vectors instead of one library call per token pair. Tokens missing from
    ``w2v_dict`` act as zero vectors (similarity 0). An empty question, or a
    side whose IDF weights sum to zero, contributes 0.0 instead of raising.
    """
    q1_words = row['word_cut_q1'].split()
    q2_words = row['word_cut_q2'].split()
    if not q1_words or not q2_words:
        return 0.0

    q1_vecs = [w2v_dict.get(word) for word in q1_words]
    q2_vecs = [w2v_dict.get(word) for word in q2_words]
    # Take the embedding width from any known token.
    dim = next((len(vec) for vec in q1_vecs + q2_vecs if vec is not None), None)
    if dim is None:
        return 0.0

    # sims[i, j] = cosine(q1 token i, q2 token j)
    sims = np.dot(_unit_rows(q1_vecs, dim), _unit_rows(q2_vecs, dim).T)
    # Floor at 0: negative similarities never count as a match.
    q1_best = np.maximum(sims.max(axis=1), 0.0)
    q2_best = np.maximum(sims.max(axis=0), 0.0)

    q1_idf = np.array([idf.get(word, 0) for word in q1_words], dtype=float)
    q2_idf = np.array([idf.get(word, 0) for word in q2_words], dtype=float)

    sim = (_idf_weighted_mean(q1_best, q1_idf) + _idf_weighted_mean(q2_best, q2_idf)) / 2.0
    return sim

In [11]:
# w2v_cos looks columns up by name, so each row must arrive as a labelled
# Series; raw=True would hand it a bare ndarray instead.
all_data['w2v_cos'] = all_data.apply(w2v_cos, axis=1)

In [18]:
# w2v_idf_cos looks columns up by name, so each row must arrive as a labelled
# Series; raw=True would hand it a bare ndarray instead.
all_data['w2v_idf_cos'] = all_data.apply(w2v_idf_cos, axis=1)

In [None]:
# start = time.time()
# all_data['text_w2v_sim'] = all_data.apply(text_w2v_sim, axis=1, raw=True)
# print(time.time() - start)

In [28]:
# Reload the raw pairs so this cell does not depend on columns added to all_data.
train_orig = pd.read_csv('./data/all_data', sep=' ', header=None, encoding='utf-8')
train_orig.columns = ['id', 'question1', 'question2', 'is_duplicate']

# Stack both question columns under one name to enumerate distinct questions.
df1 = train_orig[['question1']].copy()
df2 = train_orig[['question2']].rename(columns={'question2': 'question1'})

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_questions = pd.concat([df1, df2])
train_questions = train_questions.drop_duplicates(subset=['question1']).reset_index(drop=True)

# question text -> integer id (its position among the distinct questions)
questions_dict = pd.Series(train_questions.index.values, index=train_questions.question1.values).to_dict()
train_cp = train_orig.copy()

# test_cp['is_duplicate'] = -1
comb = train_cp

comb['q1_hash'] = comb['question1'].map(questions_dict)
comb['q2_hash'] = comb['question2'].map(questions_dict)

# How often each question id occurs in the q1 column and in the q2 column.
q1_vc = comb.q1_hash.value_counts().to_dict()
q2_vc = comb.q2_hash.value_counts().to_dict()

def try_apply_dict(x,dict_to_apply):
    """Return ``dict_to_apply[x]``, or 0 when the lookup raises KeyError."""
    try:
        found = dict_to_apply[x]
    except KeyError:
        found = 0
    return found
# Map each question id to its total number of appearances across both columns
# (occurrences as question1 plus occurrences as question2).
comb['q1_freq'] = comb['q1_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))
comb['q2_freq'] = comb['q2_hash'].map(lambda x: try_apply_dict(x,q1_vc) + try_apply_dict(x,q2_vc))

# Split by label sign: non-negative labels -> train, negative labels -> test.
# NOTE(review): the line that would assign -1 labels is commented out in this
# cell, so test_comb presumably ends up empty here — confirm.
train_comb = comb[comb['is_duplicate'] >= 0][['id','q1_hash','q2_hash','q1_freq','q2_freq','is_duplicate']]
test_comb = comb[comb['is_duplicate'] < 0][['id','q1_hash','q2_hash','q1_freq','q2_freq']]

In [29]:
# Keep only the engineered columns. Re-binding the result (instead of
# inplace=True) and ignoring already-removed columns lets this cell be re-run
# without a KeyError.
train_comb = train_comb.drop(columns=['id', 'is_duplicate'], errors='ignore')
train_comb.head()

Unnamed: 0,q1_hash,q2_hash,q1_freq,q2_freq
0,0,470276,1,1
1,1,470277,1,1
2,2,470278,1,1
3,3,470279,2,1
4,4,470280,1,1


In [30]:
# Persist the question-frequency features (without the index column) for reuse.
train_comb.to_csv('./data/freq_feature.csv', index=False)

In [19]:
# Only the last expression of a cell is displayed, so the head() result below
# is computed but not shown.
all_data.head()
# List the columns currently present on all_data.
all_data.columns.values.tolist()

['id',
 'question1',
 'question2',
 'is_duplicate',
 'jieba_q1',
 'jieba_q2',
 'jieba_word_cut_q1',
 'jieba_word_cut_q2',
 'first_char_q1',
 'first_char_q2',
 'word_cut_q1',
 'word_cut_q2',
 'w2v_cos',
 'w2v_idf_cos']

In [24]:
# Target vector: the duplicate label.
lgb_label = all_data['is_duplicate']
# Feature frame: everything except identifiers, raw text, tokenised text and
# the label itself.
lgb_data = all_data.drop(columns=['id', 'question1', 'question2', 'is_duplicate', 'jieba_q1', 'jieba_q2', 'jieba_word_cut_q1', 'jieba_word_cut_q2', 'first_char_q1', 
                                  'first_char_q2', 'word_cut_q1', 'word_cut_q2'])
# lgb_data = lgb_data.drop(columns=['f12_neg_word', 'f13_digit_in_sent','f3_word_len1','f4_word_len2','f5_char_len1','f6_char_len2','f7_length_dif'])
# Only the last expression is displayed; the head() result is not shown.
lgb_data.head()
# List the remaining feature columns.
lgb_data.columns.values.tolist()

['w2v_cos', 'w2v_idf_cos']

In [49]:
def expand_list_column(frame, column, prefix):
    """Spread a column of per-row numeric lists into one numeric column per position.

    The new columns are named ``prefix + '1'``, ``prefix + '2'``, ... and keep
    ``frame``'s index so they can be concatenated back column-wise.
    """
    expanded = pd.DataFrame(frame[column].tolist(), index=frame.index)
    expanded = expanded.rename(columns=lambda pos: prefix + str(pos + 1))
    return expanded.apply(pd.to_numeric)


# (list-valued source column, prefix of the expanded columns)
LIST_FEATURES = [
    ('digit_in_sent', 'digit_sent_'),
    ('power_dside', 'power_d_'),
    ('power_oside', 'power_o_'),
    ('duplicate_sent', 'dup_sent_'),
    ('ngram_jaccard', 'ngram_jac_'),
    ('ngram_dice', 'ngram_di_'),
]

# Expand directly from the lists; no string join / split round trip needed.
digit_sent, power_d, power_o, dup_sent, ngram_jac, ngram_di = [
    expand_list_column(lgb_data, column, prefix) for column, prefix in LIST_FEATURES
]

# Drop the list-valued originals and append the expanded columns in the same
# order as LIST_FEATURES.
lgb_data = pd.concat(
    [lgb_data.drop(columns=[column for column, _ in LIST_FEATURES]),
     digit_sent, power_d, power_o, dup_sent, ngram_jac, ngram_di],
    axis=1,
)


In [None]:
# List the feature columns after the list-valued features were expanded.
lgb_data.columns.values.tolist()

In [96]:
# Columns Normalization
# Min-max scale the count / length style features to [0, 1]. MinMaxScaler
# scales every column independently, so one fit over all of them gives the
# same per-column result as fitting one column at a time.
SCALED_COLUMNS = ['word_len1', 'word_len2', 'char_len1', 'char_len2',
                  'length_dif', 'common_words', 'levenshtein', '1st_shared']
scaler = preprocessing.MinMaxScaler()
# Assign the ndarray directly: values are written positionally, so the result
# does not depend on lgb_data having a default 0..n-1 index.
lgb_data[SCALED_COLUMNS] = scaler.fit_transform(lgb_data[SCALED_COLUMNS].values.astype(float))

In [53]:
# Preview the first rows of the assembled feature frame.
lgb_data.head()

Unnamed: 0,jieba_q1,jieba_q2,shared_word,tfidf_shared,tfidf_dif,word_len1,word_len2,char_len1,char_len2,length_dif,...,dup_sent_1,dup_sent_2,dup_sent_3,dup_sent_4,ngram_jac_1,ngram_jac_2,ngram_jac_3,ngram_di_1,ngram_di_2,ngram_di_3
0,"[要, 取消, 预约, 理财]","[怎么, 取消, 定期, 理财]",0.545455,0.577543,11.628187,4,4,7,8,1,...,1,9,9,1,0.375,0.25,0.125,0.545455,0.4,0.222222
1,"[银行卡, 密码, 正确, 但是, 错误]","[密码, 输入, 正确, 提示, 错误]",0.580645,0.669128,7.723174,5,5,11,10,1,...,1,1,1,1,0.409091,0.26087,0.125,0.580645,0.413793,0.222222
2,"[自动, 还款, 是, 扣, 余额宝, 吗]","[余额宝, 有钱, 还款, 当, 自动, 扣钱, 吗]",0.774194,0.669738,24.60532,6,7,10,13,3,...,1,1,1,1,0.666667,0.318182,0.173913,0.8,0.482759,0.296296
3,"[赔付, 运费, 险, 什么, 时候, 到, 账]","[退货, 什么, 时候, 退, 运费, 给]",0.62069,0.539562,0.362458,7,6,11,10,1,...,4,1,4,1,0.473684,0.35,0.25,0.642857,0.518519,0.4
4,"[余额宝, 转, 不, 出来, 吗]","[为什么, 余额宝, 转, 不, 进去]",0.521739,0.307491,8.165206,5,5,8,10,2,...,1,8,8,1,0.352941,0.3125,0.266667,0.521739,0.47619,0.421053


In [62]:
# Stratified 85/15 train / hold-out split with a fixed seed.
# Splitting straight from lgb_data / lgb_label (rather than from x and y, which
# this statement overwrites) means re-running the cell reproduces the same
# split instead of splitting the previous training part again.
x, x_test, y, y_test = train_test_split(
    lgb_data.values,
    lgb_label.values,
    test_size=0.15,
    random_state=123,
    stratify=lgb_label.values,
    shuffle=True,
)

In [None]:
''' LightGBM '''

In [65]:
# F1 evaluation metric
def threshold(i):
    """Binarise a score: 1.0 when it is strictly above 0.40, otherwise 0.0."""
    return 1.0 if i > 0.40 else 0.0
def f1_metric(y_pred, train_data):
    """Custom evaluation function: F1 of the thresholded predictions.

    Labels are read from ``train_data.get_label()`` and every prediction is
    binarised with ``threshold``. Returns the tuple
    ``('f1_score', value, True)``; the trailing True marks the metric as
    higher-is-better.
    """
    labels = train_data.get_label()
    #y_pred = np.round(y_pred)
    hard_preds = [threshold(score) for score in y_pred]
    return 'f1_score', f1_score(labels, hard_preds), True

In [None]:
feature_names = lgb_data.columns.values.tolist()
# Only genuinely discrete, integer-coded columns belong here. Declaring every
# column categorical makes LightGBM cast continuous features (similarities,
# coefficients) to integers. NOTE(review): no categorical columns are assumed;
# add names here if some features really are category codes.
categorical_features = []
lgb_train = lgb.Dataset(x, label=y, feature_name=feature_names,
                        categorical_feature=categorical_features or 'auto',
                        free_raw_data=False)
# Tie the validation set to the training set so both share one feature binning.
lgb_test = lgb.Dataset(x_test, label=y_test, reference=lgb_train)

In [123]:
# LightGBM training configuration.
# NOTE(review): 'application' and 'objective' are both set to 'binary'; they
# look like aliases of the same setting — confirm and keep only one.
parameters = {'application': 'binary', 
              'objective': 'binary',
              'is_unbalance': 'true',
              'boosting': 'gbdt',
              'num_leaves': 31,
              'feature_fraction': 0.5,
              'bagging_fraction': 0.5,
              'bagging_freq': 20,
              'learning_rate': 0.05,
              'verbose': 0
             }
# Built-in metric reported alongside the custom F1 evaluation function.
parameters['metric'] = ['binary_logloss']
# parameters['metric'] = ['None']

In [None]:
# Train with early stopping on the hold-out set. The stopping rule is passed as
# a callback: the early_stopping_rounds keyword is no longer accepted by
# lgb.train in LightGBM 4, while the callback form also works on older releases.
model = lgb.train(params=parameters,
                  train_set=lgb_train,
                  valid_sets=lgb_test,
                  num_boost_round=5000,
                  feval=f1_metric,
                  callbacks=[lgb.early_stopping(stopping_rounds=100)])

In [None]:
''' XGBoost '''

In [104]:
# F1 evaluation metric
# NOTE(review): this rebinds `threshold`, replacing the 0.40-cutoff version
# defined for LightGBM earlier in the notebook.
def threshold(i):
    """Binarise a score: 1.0 when it is strictly above 0.20, otherwise 0.0."""
    return 1.0 if i > 0.20 else 0.0
# NOTE(review): this rebinds `f1_metric`, replacing the 3-tuple version defined
# for LightGBM earlier in the notebook.
def f1_metric(y_pred, train_data):
    """Custom evaluation function: F1 of the thresholded predictions.

    Labels are read from ``train_data.get_label()`` and every prediction is
    binarised with ``threshold``. Returns the pair ``('F1', value)``.
    """
    labels = train_data.get_label()
    #y_pred = np.round(y_pred)
    hard_preds = [threshold(score) for score in y_pred]
    return 'F1', f1_score(labels, hard_preds)

In [105]:
# Wrap the training and hold-out arrays (with labels) in XGBoost DMatrix objects.
xgb_train = xgb.DMatrix(data=x, label=y)
xgb_test = xgb.DMatrix(data=x_test, label=y_test)

In [106]:
# XGBoost training configuration.
# NOTE(review): this rebinds `parameters`, replacing the LightGBM configuration
# dict defined earlier in the notebook.
parameters = {
            'booster':'gbtree',
            'objective':'binary:logistic',
            'eta':0.2,
            'max_depth':10,
            'subsample':1.0,
            'min_child_weight':2,
            'colsample_bytree':0.8,
            'scale_pos_weight':0.5,
            'eval_metric':'logloss',
            'gamma':0.2,            
            'lambda':0
}

In [None]:
# Evaluate on both splits every round; early stopping watches the last entry
# ('val').
watchlist = [(xgb_train,'train'),(xgb_test,'val')]
xgb_model = xgb.train(params=parameters,
                      dtrain=xgb_train,
                      num_boost_round=5000,
                      evals=watchlist,
                      early_stopping_rounds=100,
                      feval=f1_metric,
                      # F1 is a score, not a loss: without maximize=True early
                      # stopping would look for the *lowest* custom-metric value.
                      maximize=True,
                      )