# 交互式特征抽取

在文本向量化表示的基础上，抽取两个句子之间的交互式特征。

# 使用scipy库中的距离计算库进行计算

# 特征抽取 
命名原则：单个句子的特征以 q1 或 q2 开头。

两个句子的共同（交互）特征，命名统一以 q12 开头。

使用距离计算库 scipy.spatial.distance 计算两个向量之间的距离。

对之前tfidf cos方法的补充

In [1]:
import pickle
import os
import re
import numpy
import numpy as np
import pandas as pd
import copy
import time
import scipy.spatial.distance as dist
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def read_pickle(fname):
    """Deserialize and return the object stored in the pickle file `fname`."""
    with open(fname, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded
def dump_pickle(obj,fname):
    """Serialize `obj` with pickle into the file `fname`, overwriting it."""
    with open(fname, mode='wb') as handle:
        pickle.Pickler(handle).dump(obj)

In [3]:
# Load the merged DataFrame from its pickle snapshot; a later cell copies its
# 'qid1', 'qid2' and 'label' columns.
merge_df = pd.read_pickle('../data/pure_data/merge_df_2018111172053.pkl')

In [4]:
# Question table; its 'qid' column is paired row-by-row with the rows of each
# reduced-dimension matrix when the qid -> vector lookup tables are built.
all_df = pd.read_csv('../data/raw_data/question_id.csv')

In [5]:
# NOTE(review): presumably the non-feature columns to exclude when selecting
# feature columns; not referenced in the visible cells -- confirm usage.
feature_stop_columns = ['qid1','qid2','label','q1_w_list', 'q1_c_list', 'q2_w_list', 'q2_c_list']

In [6]:
def what_time_now():
    """Return the current local time as a zero-padded ``YYYYMMDDHHMM`` string."""
    # The first five struct_time fields are year, month, day, hour, minute.
    year, month, day, hour, minute = time.localtime()[:5]
    return '{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}'.format(year, month, day, hour, minute)

In [7]:
def interaction_output_distance(func,matrix1,matrix2):
    '''
    Apply `func` to every pair of same-position rows of `matrix1` and `matrix2`.

    The two matrices are expected to have the same shape
    (number of samples * feature dimension). Rows are taken by iterating
    `matrix1`; the partner row is `matrix2[index]` for the same index.

    Returns a list with one `func` result per row of `matrix1`, i.e. the
    q1/q2 interaction value of each sample.
    '''
    return [func(row, matrix2[pos]) for pos, row in enumerate(matrix1)]

In [8]:
def scipy_dist_interaction_df_update(func,df,matrix1,matrix2,label='w_tfidf_1gram'):
    """Add one min-max normalized row-wise distance column to `df`.

    Args:
        func: Distance callable taking two vectors; its ``__name__`` becomes
            part of the new column name.
        df: DataFrame with one row per (matrix1 row, matrix2 row) pair. The raw
            distance column is written into this object before a filled copy
            is made.
        matrix1: Row vectors for the first question of each pair.
        matrix2: Row vectors for the second question of each pair.
        label: Tag embedded in the column name ``q12_<label>_<func name>``.

    Returns:
        A new DataFrame (the result of ``fillna``) whose new column is scaled
        to [0, 1], or set to 1.0 everywhere when all distances are equal.
    """
    new_label = 'q12_{0}_{1}'.format(label, func.__name__)
    df[new_label] = interaction_output_distance(func,matrix1,matrix2)
    # Smoothing: replace NaN (e.g. from 0/0 inside a distance function) with a
    # small constant.
    # NOTE(review): this fills NaN in every column of the frame, not only the
    # new one -- kept for backward compatibility; confirm that is intended.
    df = df.fillna(value=1e-6)
    # Min-max normalization, vectorized instead of iterating the Series in
    # Python; an empty frame yields an empty column instead of raising.
    column = df[new_label]
    min_val = column.min()
    max_val = column.max()
    if max_val != min_val:
        df[new_label] = (column - min_val) / (max_val - min_val)
    else:
        # Constant column: avoid dividing by zero.
        df[new_label] = 1.0
    return df

In [9]:
def distance_pipline(df,matrix1,matrix2,label='w_tfidf_1gram'):
    """Append one normalized distance column per scipy distance function.

    Each distance is computed row-by-row between `matrix1` and `matrix2` via
    `scipy_dist_interaction_df_update`; columns are added in the order of the
    name list below.

    Args:
        df: DataFrame receiving the new columns.
        matrix1: Row vectors for the first question of each pair.
        matrix2: Row vectors for the second question of each pair.
        label: Tag embedded in every generated column name.

    Returns:
        The DataFrame returned by the last update call.
    """
    import warnings

    distance_names = ['braycurtis','canberra','chebyshev',
                      'cityblock','correlation','dice','euclidean','cosine','jaccard','hamming',
                      'kulsinski','sqeuclidean']
    for name in distance_names:
        # Resolve by name: some metrics (e.g. kulsinski) have been removed from
        # newer SciPy releases, and referencing them directly would raise
        # AttributeError before any feature is computed.
        dist_fun = getattr(dist, name, None)
        if dist_fun is None:
            warnings.warn(
                'scipy.spatial.distance.{0} is unavailable in the installed SciPy; '
                'the q12_{1}_{0} feature is skipped'.format(name, label))
            continue
        df = scipy_dist_interaction_df_update(dist_fun,df,matrix1,matrix2,label)
    return df

In [10]:
# Template frame holding only the pair ids and the label; copied so that adding
# feature columns later never touches merge_df.
blank_merge_df = merge_df[['qid1','qid2','label']].copy()

In [11]:
# Root of the per-method input directories whose matrix files are listed and loaded.
base_origin_dir = '../data/feature/reduction_dim/'
# Root of the per-method output directories for the pickled interaction frames.
base_target_dir = '../data/feature/interaction/'

In [12]:
# Create the output root; makedirs also creates missing parent directories and
# does nothing when the directory already exists.
os.makedirs(base_target_dir, exist_ok=True)

# 批量生成特征

In [13]:
def make_interaction(matrix_path,df):
    """Compute distance-based interaction features for one pickled matrix.

    Args:
        matrix_path: '/'-separated path of a pickled matrix whose i-th row
            belongs to the i-th row of `all_df`.
        df: DataFrame with 'qid1' and 'qid2' columns identifying each pair.

    Returns:
        The DataFrame produced by `distance_pipline`, with the distance
        columns appended.
    """
    matrix_fname = matrix_path.split('/')[-1]
    # NOTE(review): the label is only the text before the first '_' of the
    # file name (e.g. 'c' for 'c_tfidf_1gram_pca_50d.pkl'), so every matrix of
    # the same granularity yields identical column names. Kept as-is so
    # existing column names do not change -- confirm this is intended.
    matrix_granu = matrix_fname.split('_')[0]
    matrix_label = matrix_granu.split('.')[0]
    matrix_self = read_pickle(matrix_path)

    # Map qid -> matrix row in a single pass over the column instead of one
    # DataFrame.loc lookup per row; rows are paired positionally.
    search_table = {qid: matrix_self[pos]
                    for pos, qid in enumerate(all_df['qid'].tolist())}

    matrix1 = np.array([search_table[qid] for qid in df.qid1.tolist()])
    matrix2 = np.array([search_table[qid] for qid in df.qid2.tolist()])
    return distance_pipline(df,matrix1,matrix2,label=matrix_label)

# TSNE结果 单独进行拼接

In [14]:
def single_dimension_interaction(df,q1_matrix,q2_matrix,label):
    """Add element-wise arithmetic interaction columns between two matrices.

    For each operation (add, minus, minus_abs, multi, div) the same-position
    entries of `q1_matrix` and `q2_matrix` are combined and stored in the
    column ``q12_<label>__<operation>`` (the double underscore comes from the
    '_' prefixed to the operation name). The columns are written into `df`
    itself; the returned frame is a copy with NaN replaced by 1e-6.

    NOTE(review): division by zero is not guarded here, and fillna only
    replaces NaN -- confirm how infinite values should be handled.
    """
    elementwise_ops = (
        ('add', lambda left, right: np.add(left, right)),
        ('minus', lambda left, right: left - right),
        ('minus_abs', lambda left, right: np.abs(left - right)),
        ('multi', lambda left, right: left * right),
        ('div', lambda left, right: left / right),
    )
    for op_name, op in elementwise_ops:
        column = 'q12_{0}_{1}'.format(label, '_' + op_name)
        df[column] = [op(left, q2_matrix[pos]) for pos, left in enumerate(q1_matrix)]
    return df.fillna(value=1e-6)

In [15]:
def make_tsne_interaction(matrix_path,df):
    """Compute interaction features for one pickled t-SNE matrix.

    Matrices whose file stem ends in '_1d' get the element-wise arithmetic
    features of `single_dimension_interaction`; all others get the distance
    features of `distance_pipline`.

    Args:
        matrix_path: '/'-separated path of a pickled matrix whose i-th row
            belongs to the i-th row of `all_df`.
        df: DataFrame with 'qid1' and 'qid2' columns identifying each pair.

    Returns:
        The DataFrame with the interaction columns appended.
    """
    matrix_fname = matrix_path.split('/')[-1]
    matrix_stem = matrix_fname.split('.')[0]
    # NOTE(review): the label is only the text before the first '_' of the
    # file name; kept as-is so existing column names do not change -- confirm
    # this is intended.
    matrix_granu = matrix_fname.split('_')[0]
    matrix_label = matrix_granu.split('.')[0]
    matrix_self = read_pickle(matrix_path)
    # Take the dimension tag from the full stem (last '_' token, e.g. '1d').
    # Deriving it from the label could never match '1d' for names such as
    # 'c_..._1d.pkl', which left the single-dimension branch unreachable.
    matrix_dim = matrix_stem.split('_')[-1]

    # Map qid -> matrix row in a single pass over the column instead of one
    # DataFrame.loc lookup per row; rows are paired positionally.
    search_table = {qid: matrix_self[pos]
                    for pos, qid in enumerate(all_df['qid'].tolist())}

    matrix1 = np.array([search_table[qid] for qid in df.qid1.tolist()])
    matrix2 = np.array([search_table[qid] for qid in df.qid2.tolist()])

    if matrix_dim == '1d':
        df = single_dimension_interaction(df,matrix1,matrix2,matrix_label)
    else:
        df = distance_pipline(df,matrix1,matrix2,matrix_label)
    return df

# 计算同时保存特征

In [16]:
# Root of the per-method directories that receive the CSV copies of the
# interaction frames.
csv_feature_base_path = '../data/csv_feature/interaction/'

In [17]:
# Build and persist interaction features for every matrix file of every
# dimensionality-reduction method.
counter = 0  # number of matrices processed so far in this run (skipped files do not advance it)
method_list = ['pca','nmf','tsne_fast']
for method in method_list:
    target_dir = '{}{}/'.format(base_target_dir,method)
    origin_dir = '{}{}/'.format(base_origin_dir,method)
    csv_feature_dir = '{}{}/'.format(csv_feature_base_path,method)

    matrix_dir_list = os.listdir(origin_dir)
    # makedirs creates missing parent directories as well (the CSV root is not
    # created anywhere else) and is a no-op when the directory already exists.
    os.makedirs(target_dir, exist_ok=True)
    os.makedirs(csv_feature_dir, exist_ok=True)

    for matrix_fname in matrix_dir_list:
        full_matrix_path = '{}{}'.format(origin_dir,matrix_fname)
        inter_matrix_fname = '{}_interaction_df.pkl'.format(matrix_fname.split('.')[0])
        output_path = '{}{}'.format(target_dir,inter_matrix_fname)
        csv_feature_path = '{}{}'.format(csv_feature_dir,inter_matrix_fname.replace('.pkl','.csv'))
        print('Index:{} Fname:{}'.format(counter, inter_matrix_fname))
        print(output_path)
        # Resume support: skip matrices whose pickle and CSV outputs both exist.
        if os.path.isfile(output_path) and os.path.isfile(csv_feature_path):
            continue
        temp_blank_merge_df = blank_merge_df.copy()
        if method != 'tsne_fast':
            temp_blank_merge_df = make_interaction(full_matrix_path,temp_blank_merge_df)
        else:
            temp_blank_merge_df = make_tsne_interaction(full_matrix_path,temp_blank_merge_df)

        temp_blank_merge_df.to_pickle(output_path)
        temp_blank_merge_df.to_csv(csv_feature_path,index=False)
        counter += 1

Index:0 Fname:c_tfidf_1gram_pca_50d_interaction_df.pkl
../data/feature/interaction/pca/c_tfidf_1gram_pca_50d_interaction_df.pkl
Index:1 Fname:w_onehot_1gram_pca_150d_interaction_df.pkl
../data/feature/interaction/pca/w_onehot_1gram_pca_150d_interaction_df.pkl
Index:2 Fname:w_onehot_3gram_pca_150d_interaction_df.pkl
../data/feature/interaction/pca/w_onehot_3gram_pca_150d_interaction_df.pkl
Index:3 Fname:w_onehot_1gram_pca_50d_interaction_df.pkl
../data/feature/interaction/pca/w_onehot_1gram_pca_50d_interaction_df.pkl
Index:4 Fname:c_onehot_1gram_pca_150d_interaction_df.pkl
../data/feature/interaction/pca/c_onehot_1gram_pca_150d_interaction_df.pkl
Index:5 Fname:c_onehot_2gram_pca_150d_interaction_df.pkl
../data/feature/interaction/pca/c_onehot_2gram_pca_150d_interaction_df.pkl
Index:6 Fname:w_tfidf_1gram_pca_150d_interaction_df.pkl
../data/feature/interaction/pca/w_tfidf_1gram_pca_150d_interaction_df.pkl
Index:7 Fname:c_tfidf_3gram_pca_50d_interaction_df.pkl
../data/feature/interaction/p

  return l1_diff.sum() / l1_sum.sum()
  dist = 1.0 - uv / np.sqrt(uu * vv)
  return float((ntf + nft) / np.array(2.0 * ntt + ntf + nft))
  dist = np.double(unequal_nonzero.sum()) / np.double(nonzero.sum())


Index:39 Fname:w_onehot_1gram_nmf_150d_interaction_df.pkl
../data/feature/interaction/nmf/w_onehot_1gram_nmf_150d_interaction_df.pkl
Index:40 Fname:w_onehot_2gram_nmf_50d_interaction_df.pkl
../data/feature/interaction/nmf/w_onehot_2gram_nmf_50d_interaction_df.pkl
Index:41 Fname:w_tfidf_1gram_nmf_50d_interaction_df.pkl
../data/feature/interaction/nmf/w_tfidf_1gram_nmf_50d_interaction_df.pkl
Index:42 Fname:w_onehot_2gram_nmf_150d_interaction_df.pkl
../data/feature/interaction/nmf/w_onehot_2gram_nmf_150d_interaction_df.pkl
Index:43 Fname:c_onehot_2gram_nmf_50d_interaction_df.pkl
../data/feature/interaction/nmf/c_onehot_2gram_nmf_50d_interaction_df.pkl
Index:44 Fname:c_tfidf_3gram_nmf_300d_interaction_df.pkl
../data/feature/interaction/nmf/c_tfidf_3gram_nmf_300d_interaction_df.pkl
Index:45 Fname:c_tfidf_3gram_nmf_150d_interaction_df.pkl
../data/feature/interaction/nmf/c_tfidf_3gram_nmf_150d_interaction_df.pkl
Index:46 Fname:c_tfidf_3gram_nmf_50d_interaction_df.pkl
../data/feature/interacti

  dist = 1.0 - uv / np.sqrt(uu * vv)


Index:55 Fname:c_tfidf_1gram_nmf_150d_interaction_df.pkl
../data/feature/interaction/nmf/c_tfidf_1gram_nmf_150d_interaction_df.pkl
Index:56 Fname:c_tfidf_2gram_nmf_150d_interaction_df.pkl
../data/feature/interaction/nmf/c_tfidf_2gram_nmf_150d_interaction_df.pkl
Index:57 Fname:c_onehot_2gram_nmf_150d_interaction_df.pkl
../data/feature/interaction/nmf/c_onehot_2gram_nmf_150d_interaction_df.pkl
Index:58 Fname:c_onehot_3gram_nmf_150d_interaction_df.pkl
../data/feature/interaction/nmf/c_onehot_3gram_nmf_150d_interaction_df.pkl
Index:59 Fname:w_onehot_3gram_nmf_150d_interaction_df.pkl
../data/feature/interaction/nmf/w_onehot_3gram_nmf_150d_interaction_df.pkl
Index:60 Fname:c_onehot_3gram_nmf_50d_interaction_df.pkl
../data/feature/interaction/nmf/c_onehot_3gram_nmf_50d_interaction_df.pkl
Index:61 Fname:w_tfidf_1gram_nmf_300d_interaction_df.pkl
../data/feature/interaction/nmf/w_tfidf_1gram_nmf_300d_interaction_df.pkl
Index:62 Fname:w_tfidf_1gram_nmf_150d_interaction_df.pkl
../data/feature/inte