In [None]:
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from sklearn.metrics import roc_auc_score
from keras import backend as K
K.tensorflow_backend._get_available_gpus()
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import *
from keras.layers import *
from keras.layers.embeddings import *
from keras.utils.vis_utils import plot_model
from keras.utils.np_utils import to_categorical

# Load every input table and bring the id columns to a common format.
def _normalise_id(raw_id):
    """Drop single quotes and surrounding whitespace from an id string."""
    return raw_id.replace('\'', '').strip()

requirement = pd.read_csv('Requirements.csv', usecols=[0, 1, 2], header=None)
requirement.columns = ['Rid', 'R_title', 'R_content']
requirement['Rid'] = requirement['Rid'].apply(_normalise_id)

train_ach = pd.read_csv('Train_Achievements.csv', header=None, usecols=[0, 1, 2])
train_ach.columns = ['Aid', 'A_title', 'A_content']
train_ach['Aid'] = train_ach['Aid'].apply(_normalise_id)

test_ach = pd.read_csv('Test_Achievements.csv', header=None, usecols=[0, 1, 2])
test_ach.columns = ['Aid', 'A_title', 'A_content']
test_ach['Aid'] = test_ach['Aid'].apply(_normalise_id)

train_label = pd.read_csv('Train_Interrelation.csv', usecols=[0, 1, 2, 3])

test_pred = pd.read_csv('TestPrediction.csv')

# Attach the requirement and achievement text to every (Rid, Aid) pair.
train = (
    train_label
    .merge(requirement, how='left', on='Rid')
    .merge(train_ach, how='left', on='Aid')
)

test = (
    test_pred
    .merge(requirement, how='left', on='Rid')
    .merge(test_ach, how='left', on='Aid')
)

# Text cleaning
def clean_line(text):
    """Strip ASCII alphanumerics, punctuation, markup and filler from a text field.

    Parameters
    ----------
    text : str or missing value
        Raw title/content. Anything that is not a ``str`` (for example the
        NaN that pandas stores for a missing cell, or ``None``) is treated as
        empty text instead of raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The cleaned text with leading/trailing whitespace removed; ``''`` for
        non-string input.
    """
    if not isinstance(text, str):
        # Missing values cannot be passed to re.sub; they carry no text anyway.
        return ''
    # Remove ASCII letters, digits and the listed punctuation/symbol characters.
    text = re.sub("[A-Za-z0-9\!\=\？\%\[\]\,\（\）\>\<:&lt;\/#\. -----\_]", "", text)
    text = text.replace('图片', '')
    text = text.replace('\xa0', '') # remove nbsp
    # Bracketed spans (【…】, 《…》, #…#) plus remaining ASCII and CJK punctuation.
    r1 =  "\\【.*?】+|\\《.*?》+|\\#.*?#+|[.!/_,$&%^*()<>+""'?@|:~{}#]+|[——！\\\，。=？、：“”‘’￥……（）《》【】]"
    cleanr = re.compile('<.*?>')
    # Strip HTML tags.
    # NOTE(review): '<' and '>' were already removed by the first substitution
    # above, so this pattern has nothing left to match at this point.
    text = re.sub(cleanr, ' ', text)
    text = re.sub(r1,'',text)
    text = text.strip()
    return text
# Stop-word list: first column of stop.txt.
stop_words = pd.read_table('stop.txt', header=None)[0].tolist()

# Clean all four text columns of both frames.
cols = ['A_title', 'A_content', 'R_title', 'R_content']
for col in cols:
    for frame in (train, test):
        frame[col] = frame[col].apply(clean_line)
    
# 中文分词
import jieba
import string
def cut_text(sentence):
    """Segment ``sentence`` with jieba and drop every token found in ``stop_words``.

    Returns the remaining tokens as a list, in their original order.
    """
    return [token for token in jieba.cut(sentence) if token not in stop_words]
# Register the two user dictionaries with jieba.
for user_dict_path in ('./itwords.txt', './ecowords.txt'):
    jieba.load_userdict(user_dict_path)

In [None]:
def word_match_share(row):
    """Share of distinct tokens that the two titles of ``row`` have in common.

    Both ``row['A_title']`` and ``row['R_title']`` are tokenised with
    ``cut_text``. The result is twice the number of shared distinct tokens
    divided by the sum of both distinct-token counts; 0 is returned when
    either title yields no tokens.
    """
    achievement_words = set(cut_text(row['A_title']))
    requirement_words = set(cut_text(row['R_title']))
    if not achievement_words or not requirement_words:
        # Nothing left after stop-word removal on at least one side.
        return 0
    overlap = len(achievement_words & requirement_words)
    return (overlap + overlap) / (len(achievement_words) + len(requirement_words))

def jaccard(row):
    """Jaccard similarity between the token sets of the two titles in ``row``.

    Returns ``|intersection| / |union|``; when both token sets are empty the
    denominator falls back to 1, giving 0.

    NOTE: a later cell runs ``from scipy.spatial.distance import ... jaccard``,
    which rebinds the module-level name ``jaccard`` once that cell executes.
    """
    achievement_tokens = set(cut_text(row['A_title']))
    requirement_tokens = set(cut_text(row['R_title']))
    shared = achievement_tokens & requirement_tokens
    combined = achievement_tokens | requirement_tokens
    denominator = len(combined) if combined else 1
    return len(shared) / denominator

def common_words(row):
    """Number of distinct tokens that occur in both titles of ``row``."""
    achievement_tokens = set(cut_text(row['A_title']))
    requirement_tokens = set(cut_text(row['R_title']))
    return len(achievement_tokens & requirement_tokens)

def total_unique_words(row):
    """Number of distinct tokens that occur in at least one title of ``row``."""
    achievement_tokens = set(cut_text(row['A_title']))
    requirement_tokens = set(cut_text(row['R_title']))
    return len(achievement_tokens | requirement_tokens)

def wc_diff(row):
    """Absolute difference between the token counts of the two titles in ``row``."""
    achievement_count = len(cut_text(row['A_title']))
    requirement_count = len(cut_text(row['R_title']))
    return abs(achievement_count - requirement_count)

def wc_ratio(row):
    """Ratio of the smaller to the larger token count of the two titles in ``row``.

    ``row['A_title']`` and ``row['R_title']`` are tokenised with ``cut_text``.

    Returns
    -------
    float
        ``min(l1, l2) / max(l1, l2)``, i.e. a value in [0, 1];
        ``np.nan`` when the R_title has no tokens.

    Notes
    -----
    The previous branch condition was ``if l1 / l2:``, a truthiness test that
    holds for every non-empty A_title, so ``l2 / l1`` was returned regardless
    of which title was longer. The comparison below is what makes the two
    branches meaningful.
    """
    query = cut_text(row['A_title'])
    title = cut_text(row['R_title'])
    l1 = len(query) * 1.0
    l2 = len(title)
    if l2 == 0:
        return np.nan
    if l1 > l2:
        return l2 / l1
    # l1 <= l2 and l2 > 0, so this division is always defined (0.0 when l1 == 0).
    return l1 / l2

def wc_diff_unique(row):
    """Absolute difference between the distinct-token counts of the two titles."""
    achievement_unique = len(set(cut_text(row['A_title'])))
    requirement_unique = len(set(cut_text(row['R_title'])))
    return abs(achievement_unique - requirement_unique)

def wc_ratio_unique(row):
    """Ratio of the smaller to the larger distinct-token count of the two titles.

    ``row['A_title']`` and ``row['R_title']`` are tokenised with ``cut_text``
    and de-duplicated before counting.

    Returns
    -------
    float
        ``min(l1, l2) / max(l1, l2)``, i.e. a value in [0, 1];
        ``np.nan`` when the R_title has no tokens.

    Notes
    -----
    The previous branch condition was ``if l1 / l2:``, a truthiness test that
    holds for every non-empty A_title, so ``l2 / l1`` was returned regardless
    of which side was larger. The comparison below is what makes the two
    branches meaningful.
    """
    query = cut_text(row['A_title'])
    title = cut_text(row['R_title'])
    l1 = len(set(query)) * 1.0
    l2 = len(set(title))
    if l2 == 0:
        return np.nan
    if l1 > l2:
        return l2 / l1
    # l1 <= l2 and l2 > 0, so this division is always defined (0.0 when l1 == 0).
    return l1 / l2

def same_start_word(row):
    """1 if both titles of ``row`` start with the same token, else 0.

    Returns ``np.nan`` when either title has no tokens after ``cut_text``.
    """
    achievement_tokens = cut_text(row['A_title'])
    requirement_tokens = cut_text(row['R_title'])
    if len(achievement_tokens) == 0 or len(requirement_tokens) == 0:
        return np.nan
    return int(achievement_tokens[0] == requirement_tokens[0])

def get_weight(count, eps=10000, min_count=2):
    """Weight for a word that occurs ``count`` times in the corpus.

    Parameters
    ----------
    count : int
        Number of occurrences of the word.
    eps : number, default 10000
        Added to ``count`` in the denominator.
    min_count : int, default 2
        Words seen fewer than ``min_count`` times get weight 0.

    Returns
    -------
    0 when ``count < min_count``, otherwise ``1 / (count + eps)``.
    """
    # The two statements that used to sit here read from an undefined ``row``
    # and raised NameError on every call; their results were never used.
    if count < min_count:
        return 0
    return 1 / (count + eps)

def tfidf_word_match_share(row, weights=None):
    """Weighted share of distinct tokens that the two titles of ``row`` have in common.

    Parameters
    ----------
    row : mapping with ``'A_title'`` and ``'R_title'`` entries
    weights : mapping of token -> weight
        Looked up with ``weights.get(token, 0)``. The ``None`` default has no
        ``get`` method, so a real mapping must be supplied whenever both
        titles contain tokens.

    Returns
    -------
    0 when either title has no tokens; otherwise the summed weight of the
    shared tokens (counted once per side) divided by the summed weight of all
    distinct tokens of both sides.
    """
    # dict.fromkeys keeps first-occurrence order, so weights are summed in a stable order.
    achievement_words = dict.fromkeys(cut_text(row['A_title']))
    requirement_words = dict.fromkeys(cut_text(row['R_title']))
    if not achievement_words or not requirement_words:
        return 0

    def weight_of(word):
        return weights.get(word, 0)

    shared_weights = [weight_of(w) for w in achievement_words if w in requirement_words]
    shared_weights += [weight_of(w) for w in requirement_words if w in achievement_words]

    total_weights = [weight_of(w) for w in achievement_words]
    total_weights += [weight_of(w) for w in requirement_words]

    return np.sum(shared_weights) / np.sum(total_weights)

# Corpus statistics needed for the tfidf-style weights
from collections import Counter
# One token list per title, taken from both title columns of train and test.
train_qs = pd.Series([
    tokens
    for title_column in (train['A_title'], train['R_title'], test['A_title'], test['R_title'])
    for tokens in title_column.apply(cut_text).tolist()
])
# Flatten to a single token stream and count occurrences.
words = [word for tokens in train_qs for word in tokens]
counts = Counter(words)

def get_weight(count, eps=10000, min_count=2):
    """Weight for a word seen ``count`` times.

    Returns 0 for words seen fewer than ``min_count`` times, otherwise
    ``1 / (count + eps)``.
    """
    return 0 if count < min_count else 1 / (count + eps)
# Token -> weight lookup: get_weight applied to every token count in `counts`.
weights = {word: get_weight(count) for word, count in counts.items()}

def get_statistic_features(train):
    """Add the title-overlap statistics as new columns of ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the ``'A_title'`` and ``'R_title'`` columns. The frame is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with 13 added columns.

    Notes
    -----
    Every row function indexes the row by column label (``row['A_title']``),
    so it needs a Series. ``raw=True`` makes ``DataFrame.apply`` pass a plain
    NumPy array instead, on which label lookups fail; the calls therefore use
    the default ``raw=False``.
    """
    # (column name, row function) in the order the columns are created.
    row_features = [
        ('word_match', word_match_share),
        ('jaccard', jaccard),
        ('common_words', common_words),
        ('total_unique_words', total_unique_words),
        ('wc_diff', wc_diff),
        ('wc_ratio', wc_ratio),
        ('wc_diff_unique', wc_diff_unique),
        ('wc_ratio_unique', wc_ratio_unique),
        ('same_start_word', same_start_word),
        ('tfidf_wm', lambda row: tfidf_word_match_share(row, weights)),
    ]
    for column, row_function in row_features:
        train[column] = train.apply(row_function, axis=1)

    # Token counts of each title.
    train['query_length'] = train['A_title'].apply(lambda x: len(cut_text(x)))
    train['title_length'] = train['R_title'].apply(lambda x: len(cut_text(x)))
    # 1 when the A_title string occurs verbatim inside the R_title string.
    train['query_isin_title'] = train.apply(lambda row: 1 if row['A_title'] in row['R_title'] else 0, axis=1)
    return train
# Compute the overlap statistics for both frames (train first, then test).
train, test = get_statistic_features(train), get_statistic_features(test)

In [None]:
import gensim
import time
class EpochSaver(gensim.models.callbacks.CallbackAny2Vec):
    """Callback that prints the per-epoch loss and saves the model when it improves."""

    def __init__(self, save_path):
        """Remember where to save and reset the epoch/loss/timing bookkeeping."""
        self.save_path = save_path
        self.epoch = 0
        self.pre_loss = 0
        self.best_loss = 999999999.9
        self.since = time.time()

    def on_epoch_end(self, model):
        """Report loss and duration of the finished epoch; save on a new best loss."""
        self.epoch += 1
        # The reported loss is cumulative since the first epoch (per the
        # original author's note), so subtract the previous reading.
        cum_loss = model.get_latest_training_loss()
        epoch_loss = cum_loss - self.pre_loss
        minutes, seconds = divmod(time.time() - self.since, 60)
        print("Epoch %d, loss: %.2f, time: %dmin %ds" %
                    (self.epoch, epoch_loss, minutes, seconds))

        if epoch_loss < self.best_loss:
            self.best_loss = epoch_loss
            print("Better model. Best loss: %.2f" % self.best_loss)
            model.save(self.save_path)
            print("Model %s save done!" % self.save_path)

        # Prepare the bookkeeping for the next epoch.
        self.pre_loss = cum_loss
        self.since = time.time()
# Load a saved Word2Vec model from disk and bind it to the module-level name `model`.
model = gensim.models.Word2Vec.load('./final_word2vec_model')

In [None]:
from fuzzywuzzy import fuzz
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis

def wmd(s1, s2, model):
    """Return ``model.wv.wmdistance`` of the ``cut_text`` token lists of ``s1`` and ``s2``."""
    tokens_a, tokens_b = cut_text(s1), cut_text(s2)
    return model.wv.wmdistance(tokens_a, tokens_b)

def sent2vec(s, model):
    """Embed sentence ``s`` as the L2-normalised sum of its token vectors.

    Parameters
    ----------
    s : str
        Sentence; tokenised with ``cut_text``.
    model : Word2Vec-style model
        Token vectors are read from ``model.wv`` (the same attribute ``wmd``
        uses). Indexing the model object directly is not supported by
        gensim 4, and the former bare ``except:`` hid that error, turning
        every sentence vector into NaN.

    Returns
    -------
    numpy.ndarray or float
        The normalised sum vector. ``np.nan`` when no token of ``s`` has a
        vector; the caller's ``np.nan_to_num`` turns that row into zeros.
    """
    vectors = []
    for w in cut_text(s):
        try:
            vectors.append(model.wv[w])
        except KeyError:
            # Token is not in the model's vocabulary: skip it.
            continue
    if not vectors:
        return np.nan
    v = np.array(vectors).sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

def _embed_titles(titles, dim):
    """Return a ``(len(titles), dim)`` array with ``sent2vec(title, model)`` in each row."""
    vectors = np.zeros((len(titles), dim))
    for i, q in tqdm(enumerate(titles)):
        vectors[i, :] = sent2vec(q, model)
    return vectors


def get_other_features(data):
    """Add length, fuzzy-match, WMD and embedding-based features to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``'A_title'`` and ``'R_title'`` columns. The frame is
        modified in place. The module-level ``model`` supplies the vectors.

    Returns
    -------
    pandas.DataFrame
        The same frame with 21 added columns: 3 length features, 7 fuzzywuzzy
        scores, ``wmd``, 6 vector distances and 4 skew/kurtosis values.
    """
    print('---basic features begin---')
    data['len_A_title'] = data["A_title"].apply(lambda x: len(str(x)))
    data['len_R_title'] = data["R_title"].apply(lambda x: len(str(x)))
    data['diff_len'] = data['len_A_title'] - data['len_R_title']

    # (column name, fuzzywuzzy scorer) in the order the columns are created.
    fuzz_features = [
        ('fuzz_qratio', fuzz.QRatio),
        ('fuzz_WRatio', fuzz.WRatio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    ]
    for column, scorer in fuzz_features:
        # Bind scorer as a default so each lambda keeps its own scorer.
        data[column] = data.apply(
            lambda x, scorer=scorer: scorer(str(x['A_title']), str(x['R_title'])), axis=1)
    print('---basic features finished---')

    print('---wmd features begin---')
    data['wmd'] = data.apply(lambda x: wmd(x['A_title'], x['R_title'], model), axis=1)
    print('---wmd features finished---')

    print('---sent2vec begin---')
    # Take the vector width from the model instead of hard-coding 256.
    dim = model.vector_size
    question1_vectors = _embed_titles(data["A_title"].values, dim)
    question2_vectors = _embed_titles(data["R_title"].values, dim)
    print('---sent2vec finished---')

    # Replace NaN rows once; every feature below reads these cleaned arrays.
    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)

    print('---distance features begin---')
    distance_features = [
        ('cosine_distance', cosine),
        ('cityblock_distance', cityblock),
        ('canberra_distance', canberra),
        ('euclidean_distance', euclidean),
        ('minkowski_distance', lambda x, y: minkowski(x, y, 3)),
        ('braycurtis_distance', braycurtis),
    ]
    for column, metric in distance_features:
        data[column] = [metric(x, y) for (x, y) in zip(q1, q2)]
    print('---distance features finished---')

    print('---skew_kur features begin---')
    data['skew_q1vec'] = [skew(x) for x in q1]
    data['skew_q2vec'] = [skew(x) for x in q2]
    data['kur_q1vec'] = [kurtosis(x) for x in q1]
    data['kur_q2vec'] = [kurtosis(x) for x in q2]
    print('---skew_kur features finished---')
    return data

In [None]:
# Compute the fuzzy/WMD/embedding features for both frames (train first, then test).
train, test = get_other_features(train), get_other_features(test)

In [None]:
# Columns dropped from both frames before the feature tables are written to CSV.
drop_fea = ['Aid', 'Rid', 'R_title', 'A_title', 'A_content', 'R_content', 'Level']

In [None]:
# Write the feature tables without the columns in drop_fea and without the index.
for frame, out_path in ((train, 'train_features.csv'), (test, 'test_features.csv')):
    frame.drop(drop_fea, axis=1).to_csv(out_path, index=False)

In [None]:
# Show the column index of the test frame as the cell output.
test.columns