In [1]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import json

In [2]:
# 数据载入
def dataLoad(path='D:\Program\PycharmProjects\quoraDupli\data\quora_duplicate_questions.tsv'):
    df = pd.read_csv(path, delimiter='\t')
    df = df.dropna(how='any')  # 删除两行缺失数据
    # 转换编码
    df['question1'] = df['question1'].apply(lambda x: unicode(str(x), 'utf-8'))
    df['question2'] = df['question2'].apply(lambda x: unicode(str(x), 'utf-8'))

    print '数据量: %d行' % (df.shape[0])
    rowNum0 = df['is_duplicate'].value_counts()[0]
    rowNum1 = df['is_duplicate'].value_counts()[1]
    print '重复: 不重复 = %d : %d = 1 : %f' \
          % (rowNum1, rowNum0, (rowNum0 * 1.0 / rowNum1))
    uniqueId = set(list(df['qid2'].unique()) + list(df['qid1'].unique()))
    print '问题总数: %d' % (len(uniqueId))
    return df

In [3]:
# 计算TF-IDF分数
def getTfIdfScore(df):
    from sklearn.feature_extraction.text import TfidfVectorizer

    questions = list(df['question1']) + list(df['question2'])

    tfIdf = TfidfVectorizer(lowercase=False)
    tfIdf.fit_transform(questions)

    word2tfIdf = dict(zip(tfIdf.get_feature_names(), tfIdf.idf_))
    return word2tfIdf

In [4]:
import os
from tqdm import tqdm
import numpy as np
import en_core_web_md as md

In [5]:
df = dataLoad()
tfIdfScore = getTfIdfScore(df)

数据量: 404288行
重复: 不重复 = 149263 : 255025 = 1 : 1.708561
问题总数: 537931


In [6]:
nlp = md.load()

In [7]:
questionList = [list(df['question1']), list(df['question2'])]

In [8]:
vec = []
for question in tqdm(questionList[0]):
    words = nlp(question)
    mean_vec = np.zeros([len(words), 300])
    for word in words:
        wordVector = word.vector
        try:
            idf = tfIdfScore[str(word)]
        except:
            idf = 0
        mean_vec += wordVector * idf
    mean_vec = mean_vec.mean(axis=0)
    vec.append(mean_vec.tolist())

100%|█████████████████████████████████████████████████████████████████████████| 404288/404288 [11:47<00:00, 571.17it/s]                                                                                                                       


In [12]:
with open('D:\Program\PycharmProjects\quoraDupli\data\question1.json', 'w') as fp:
    fp.write(json.dumps(vec))

MemoryError: 

In [13]:
len(vec)

404288

In [16]:
vec = []
for question in tqdm(questionList[1]):
    words = nlp(question)
    mean_vec = np.zeros([len(words), 300])
    for word in words:
        wordVector = word.vector
        try:
            idf = tfIdfScore[str(word)]
        except:
            idf = 0
        mean_vec += wordVector * idf
    mean_vec = mean_vec.mean(axis=0)
    vec.append(mean_vec)

100%|█████████████████████████████████████████████████████████████████████████| 404288/404288 [12:51<00:00, 523.98it/s]


In [None]:
with open('D:\Program\PycharmProjects\quoraDupli\data\question2.json', 'w') as fp:
    fp.write(json.dumps(vec))

In [None]:
pd.to_pickle(df, 'data/2_word2vec_tfidf.pkl')

In [14]:
df['q1Vector'] = vec

In [18]:
pd.to_pickle(df, 'D:\Program\PycharmProjects\quoraDupli\data\question1&2.pickle')

In [17]:
df['q2Vector'] = vec

In [27]:
df['q1Vector'][1717]

array([ -10.89369355,   11.25902551,   -5.1187965 ,   -6.06989233,
         11.99266816,    1.70115947,   -5.61840542,   -3.79112197,
         -2.71978045,  103.98578262,  -24.09299075,    2.34198799,
        -10.05585681,   -3.63996656,   -0.6522512 ,   -0.1667682 ,
         -7.7384637 ,   76.51058149,   -6.41214633,   -4.2586635 ,
          9.4239338 ,   -2.67132625,   -1.15786346,   -4.58551546,
         -4.23862864,   -5.27130169,   -4.16966924,    2.80506968,
          7.22021147,    0.36746059,   -4.10047547,   -2.87639775,
         -1.1988828 ,   -0.52249566,   -1.57655483,    3.14391417,
          2.32159036,   -1.53440873,   -7.30013071,   -4.69445034,
          0.47051984,    7.01570294,    1.72144576,   -9.5606603 ,
         -0.66424323,    6.90612473,   -6.76487689,   -9.0081695 ,
         -1.27538522,    6.22450536,   -7.66572031,    0.88944499,
          2.6314193 ,  -12.96832015,   10.19972441,   -3.85901373,
          5.33758172,    4.93926939,   -1.321724  ,   -4.46512

In [28]:
df['q2Vector'][1717]

array([ -7.35004675e+00,   4.77369707e+00,  -3.19105649e+00,
        -4.43868736e+00,   6.65813670e+00,   2.24078010e+00,
        -4.72003703e+00,  -3.30655451e+00,  -4.61895303e+00,
         4.37299714e+01,  -1.01270496e+01,  -7.04756662e-01,
        -5.96746479e+00,  -3.21366613e+00,  -3.58270088e+00,
        -1.96673114e+00,  -3.36736354e+00,   4.11322300e+01,
        -3.31787935e+00,  -4.84912136e+00,   3.60126641e+00,
        -1.50437972e+00,  -3.76362888e+00,  -2.47237039e+00,
        -1.09926730e+00,  -4.27385652e+00,  -3.82525279e+00,
         2.98098058e-01,  -8.59318212e-01,  -2.92120951e+00,
        -3.06126615e+00,  -9.25722871e-01,   1.93964611e+00,
        -1.79039050e+00,  -3.43446693e+00,   2.41662283e+00,
         5.31657726e-01,   5.45711935e-01,  -3.74512207e+00,
        -3.59156236e+00,   2.91275397e+00,   1.23301873e+00,
         1.28207661e+00,  -7.56854874e+00,   6.27340481e-01,
         2.48359012e+00,  -7.82130349e+00,  -4.14453018e+00,
        -2.33352192e+00,

In [29]:
print df['question1'][1717]
print df['question2'][1717]

How can I get a green card to live and work in the USA?
How can I get a green card?


In [30]:
vec = []
words = nlp(df['question1'][1717])
mean_vec = np.zeros([len(words), 300])
for word in words:
    wordVector = word.vector
    try:
        idf = tfIdfScore[str(word)]
    except:
        idf = 0
    mean_vec += wordVector * idf
mean_vec = mean_vec.mean(axis=0)
vec.append(mean_vec.tolist())

In [31]:
vec

[[-10.89369355441886,
  11.259025506675243,
  -5.118796497583389,
  -6.069892330095172,
  11.99266815930605,
  1.7011594660580158,
  -5.618405416607857,
  -3.7911219745874405,
  -2.7197804525494576,
  103.98578262329102,
  -24.09299075230956,
  2.3419879898428917,
  -10.055856809020042,
  -3.639966562157497,
  -0.652251198887825,
  -0.16676820488646626,
  -7.7384636998176575,
  76.51058149337769,
  -6.412146329879761,
  -4.25866350159049,
  9.42393379798159,
  -2.671326246112585,
  -1.1578634604811668,
  -4.585515461862087,
  -4.238628637045622,
  -5.271301686763763,
  -4.169669241644442,
  2.805069677531719,
  7.220211474224925,
  0.3674605917185545,
  -4.1004754703026265,
  -2.876397754997015,
  -1.1988828033208847,
  -0.5224956646561623,
  -1.5765548311173916,
  3.1439141668379307,
  2.3215903639793396,
  -1.5344087332487106,
  -7.300130708143115,
  -4.694450341165066,
  0.4705198425799608,
  7.015702936798334,
  1.7214457616209984,
  -9.560660302639008,
  -0.6642432287335396,
  6.9

In [43]:
list(df['q2Vector'][1717]) == vec2[0]

True

In [33]:
vec2 = []
words = nlp(df['question2'][1717])
mean_vec = np.zeros([len(words), 300])
for word in words:
    wordVector = word.vector
    try:
        idf = tfIdfScore[str(word)]
    except:
        idf = 0
    mean_vec += wordVector * idf
mean_vec = mean_vec.mean(axis=0)
vec2.append(mean_vec.tolist())

In [36]:
df['q2Vector'][1717] == vec2

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
      