In [None]:
import gc
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from sklearn.metrics import roc_auc_score
from keras import backend as K
K.tensorflow_backend._get_available_gpus()
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import *
from keras.layers import *
from keras.layers.embeddings import *
from keras.utils.vis_utils import plot_model
from keras.utils.np_utils import to_categorical

In [None]:
# Load every raw table and bring the ID columns into a common format.
def _read_text_table(csv_path, column_names):
    """Read the first three columns of a header-less CSV and clean its ID column.

    ``column_names`` supplies the three column labels; the first one is the
    ID column, from which single quotes and surrounding whitespace are removed.
    """
    table = pd.read_csv(csv_path, usecols=[0, 1, 2], header=None)
    table.columns = column_names
    id_column = column_names[0]
    table[id_column] = table[id_column].apply(lambda raw_id: raw_id.replace('\'', '').strip())
    return table


requirement = _read_text_table('Requirements.csv', ['Rid', 'R_title', 'R_content'])
train_ach = _read_text_table('Train_Achievements.csv', ['Aid', 'A_title', 'A_content'])
test_ach = _read_text_table('Test_Achievements.csv', ['Aid', 'A_title', 'A_content'])

# These two files are read with pandas' default header handling (first row = column names).
train_label = pd.read_csv('Train_Interrelation.csv', usecols=[0, 1, 2, 3])
test_pred = pd.read_csv('TestPrediction.csv')

In [None]:
# Feature tables stored as CSV; they are joined onto train/test by 'Guid' further down.
train_features, test_features = (
    pd.read_csv(csv_name) for csv_name in ('train_features.csv', 'test_features.csv')
)

In [None]:
# Attach requirement text (via Rid) and achievement text (via Aid) to each pair.
train = (
    train_label
    .merge(requirement, how='left', on='Rid')
    .merge(train_ach, how='left', on='Aid')
)

test = (
    test_pred
    .merge(requirement, how='left', on='Rid')
    .merge(test_ach, how='left', on='Aid')
)

In [None]:
# Add the feature columns to each pair, keyed by Guid (left join keeps every pair).
train = train.merge(train_features, on='Guid', how='left')
test = test.merge(test_features, on='Guid', how='left')

In [None]:
# 清洗文本
def clean_line(text):
    """Strip ASCII alphanumerics, punctuation and bracketed asides from a text.

    Parameters
    ----------
    text : str or missing value
        Raw title/content cell. Anything that is not a ``str`` (``None``,
        ``NaN`` from an empty CSV cell or an unmatched left merge, numbers)
        is treated as empty text instead of raising ``TypeError`` in ``re.sub``.

    Returns
    -------
    str
        The cleaned text with leading/trailing whitespace removed; may be ''.
    """
    if not isinstance(text, str):
        return ''

    # Pass 1: delete ASCII letters and digits, every ASCII character from space
    # through '-' (the " -\-" range, which includes the double quote), and a few
    # further ASCII / full-width symbols.  The set is the same one the original
    # pattern matched; it is written as a raw string with an escaped hyphen so
    # that it no longer relies on invalid string escapes or on the "--" sequence
    # that `re` warns about as a possible future set difference.
    text = re.sub(r"[A-Za-z0-9\!\=\？\%\[\]\,\（\）\>\<:&lt;\/#\. -\-\_]", "", text)
    text = text.replace('图片', '')
    text = text.replace('\xa0', '')  # non-breaking space

    # The original ran re.sub('<.*?>', ' ', text) at this point to strip HTML
    # tags.  Pass 1 has already deleted every '<', so that pattern could never
    # match; the no-op call is omitted.

    # Pass 2: drop 【...】 / 《...》 / #...# spans, then any remaining ASCII or
    # full-width punctuation listed in the two character sets.
    leftover = r"\【.*?】+|\《.*?》+|\#.*?#+|[.!/_,$&%^*()<>+'?@|:~{}#]+|[——！\\，。=？、：“”‘’￥……（）《》【】]"
    text = re.sub(leftover, '', text)
    return text.strip()
# Stop words: the first column of stop.txt as a plain Python list.
stop_words = pd.read_table('stop.txt', header=None)[0].tolist()

# Clean the four free-text columns of both frames in place.
cols = ['A_title', 'A_content', 'R_title', 'R_content']
for col in cols:
    for frame in (train, test):
        frame[col] = frame[col].map(clean_line)

In [None]:
# Keep only training rows where neither len_A_title nor len_R_title is zero.
has_both_titles = train['len_A_title'].ne(0) & train['len_R_title'].ne(0)
train = train.loc[has_both_titles]

In [None]:
# Test rows with a zero len_A_title or len_R_title bypass the model and receive
# a constant Level of 1; all other rows go to the model as `test_need`.
has_empty_title = (test.len_A_title == 0) | (test.len_R_title == 0)

# Both frames are written to later (Level is assigned), so take explicit copies
# rather than assigning into a selection of `test` (SettingWithCopyWarning).
garbage = test.loc[has_empty_title, ['Guid', 'Level']].copy()
garbage['Level'] = 1

test_need = test.loc[~has_empty_title].copy()

In [None]:
# 中文分词
import jieba
import string
# `stop_words` is a list, so `token not in stop_words` scans it for every token.
# Snapshot it into a frozenset once for constant-time lookups.  Re-run this cell
# if `stop_words` is reloaded so the snapshot stays in sync.
_stop_word_set = frozenset(stop_words)


def cut_text(sentence):
    """Segment ``sentence`` with jieba and drop stop words.

    Parameters
    ----------
    sentence : str
        Text to segment.

    Returns
    -------
    list
        The tokens produced by ``jieba.cut`` in order, minus those found in
        the stop-word set.
    """
    return [token for token in jieba.cut(sentence) if token not in _stop_word_set]


# Extra user dictionaries for jieba.
jieba.load_userdict('./itwords.txt')
jieba.load_userdict('./ecowords.txt')

In [None]:
# Tokenise every text column: training rows from `train`, test rows from `test_need`.
train_R_title = list(map(cut_text, train['R_title'].values))
train_R_content = list(map(cut_text, train['R_content'].values))
train_A_title = list(map(cut_text, train['A_title'].values))
train_A_content = list(map(cut_text, train['A_content'].values))

test_R_title = list(map(cut_text, test_need['R_title'].values))
test_R_content = list(map(cut_text, test_need['R_content'].values))
test_A_title = list(map(cut_text, test_need['A_title'].values))
test_A_content = list(map(cut_text, test_need['A_content'].values))

In [None]:
# Corpus for word2vec training: every tokenised content and title, train and test.
all_doc = [
    doc
    for corpus in (
        train_R_content, train_R_title, train_A_content, train_A_title,
        test_R_content, test_R_title, test_A_content, test_A_title,
    )
    for doc in corpus
]


In [None]:
import gensim
import time
class EpochSaver(gensim.models.callbacks.CallbackAny2Vec):
    """Training callback that prints the loss of each epoch and saves the best model.

    After every epoch the callback prints the epoch loss and elapsed time, and
    saves the model to ``save_path`` whenever the epoch loss is lower than any
    seen so far.
    """

    def __init__(self, save_path):
        """Remember where to save and reset the running statistics.

        Parameters
        ----------
        save_path : str
            Path handed to ``model.save`` when an epoch improves on the best loss.
        """
        self.save_path = save_path
        self.epoch = 0
        self.pre_loss = 0
        # +inf guarantees that the first epoch is always saved; a finite
        # sentinel would silently skip saving if every epoch loss exceeded it.
        self.best_loss = float('inf')
        self.since = time.time()

    def on_epoch_end(self, model):
        """Report the loss of the epoch that just finished and save on improvement."""
        self.epoch += 1
        # The reported loss accumulates from the first epoch (per the original
        # author's note), so the per-epoch value is the difference from the
        # previous reading.
        cum_loss = model.get_latest_training_loss()
        epoch_loss = cum_loss - self.pre_loss
        time_taken = time.time() - self.since
        print("Epoch %d, loss: %.2f, time: %dmin %ds" %
                    (self.epoch, epoch_loss, time_taken//60, time_taken%60))
        if self.best_loss > epoch_loss:
            self.best_loss = epoch_loss
            print("Better model. Best loss: %.2f" % self.best_loss)
            model.save(self.save_path)
            print("Model %s save done!" % self.save_path)

        self.pre_loss = cum_loss
        # Restart the timer so the next report covers only the next epoch.
        self.since = time.time()
# Reuse a previously saved model when one exists.  The file is produced by the
# training cells that follow, so on a fresh checkout it is absent; loading it
# unconditionally would abort a Restart & Run All before training ever starts.
import os

if os.path.exists('final_word2vec_model'):
    model_word2vec = gensim.models.Word2Vec.load('final_word2vec_model')
else:
    print('final_word2vec_model not found; run the word2vec training cells below to create it.')

In [None]:

# New word2vec model; the vocabulary is built from the full corpus before training.
w2v_params = dict(min_count=1, window=5, size=256, workers=4, batch_words=1000)
model_word2vec = gensim.models.Word2Vec(**w2v_params)

since = time.time()
model_word2vec.build_vocab(all_doc, progress_per=2000)
time_elapsed = time.time() - since
minutes, seconds = divmod(time_elapsed, 60)
print('Time to build vocab: {:.0f}min {:.0f}s'.format(minutes, seconds))

In [None]:
# Train for 5 epochs; EpochSaver prints the loss per epoch and saves on improvement.
since = time.time()
epoch_saver = EpochSaver('./final_word2vec_model')
model_word2vec.train(
    all_doc,
    total_examples=model_word2vec.corpus_count,
    epochs=5,
    compute_loss=True,
    report_delay=60*10,  # log progress every 10 minutes
    callbacks=[epoch_saver],
)
time_elapsed = time.time() - since
minutes, seconds = divmod(time_elapsed, 60)
print('Time to train: {:.0f}min {:.0f}s'.format(minutes, seconds))

In [None]:
# 取title + content 128len
# train_A_doc = [t + c for t, c in zip(train_A_title, train_A_content)]
# test_A_doc = [t + c for t, c in zip(test_A_title, test_A_content)]
# train_R_doc = [t + c for t, c in zip(train_R_title, train_R_content)]
# test_R_doc = [t + c for t, c in zip(test_R_title, test_R_content)]

In [None]:
# Fit the Keras tokenizer on all title token lists (train and test, A and R).
tokenizer = Tokenizer()
title_corpus = train_A_title + test_A_title + train_R_title + test_R_title
tokenizer.fit_on_texts(title_corpus)
# tokenizer.fit_on_texts(train_A_doc + test_A_doc + train_R_doc + test_R_doc)

In [None]:
# Build the embedding matrix from the word2vec model: row i holds the vector of
# the word whose tokenizer index is i.
vocab_size = len(tokenizer.word_index)
error_count = 0  # words in the tokenizer vocabulary without a word2vec vector

# Query the KeyedVectors object for both membership and lookup; membership on
# the Word2Vec model itself is deprecated.  Taking the width from the vectors
# keeps the matrix valid if the model is retrained with another dimension.
word_vectors = model_word2vec.wv

# One extra row because Keras' Tokenizer reserves index 0 (never assigned to a
# word), so that row stays all-zero.
embedding_matrix = np.zeros((vocab_size + 1, word_vectors.vector_size))
for word, i in tqdm(tokenizer.word_index.items()):
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
    else:
        error_count += 1

In [None]:
max_len = 30


def _pad_titles(token_lists):
    """Map token lists to tokenizer index sequences and pad them to ``max_len``."""
    return pad_sequences(tokenizer.texts_to_sequences(token_lists), maxlen=max_len)


# "query" arrays come from the achievement (A) titles, "title" arrays from the
# requirement (R) titles.
train_query = _pad_titles(train_A_title)
train_title = _pad_titles(train_R_title)

test_query = _pad_titles(test_A_title)
test_title = _pad_titles(test_R_title)

# sequence = tokenizer.texts_to_sequences(train_A_doc)
# train_query = pad_sequences(sequence, maxlen=max_len)
# sequence = tokenizer.texts_to_sequences(train_R_doc)
# train_title = pad_sequences(sequence, maxlen=max_len)

# sequence = tokenizer.texts_to_sequences(test_A_doc)
# test_query = pad_sequences(sequence, maxlen=max_len)
# sequence = tokenizer.texts_to_sequences(test_R_doc)
# test_title = pad_sequences(sequence, maxlen=max_len)

In [None]:
# Names of the feature columns fed to the model's third input, in input order.
features = (
    # overlap / word-count columns
    [
        'word_match',
        'jaccard',
        'common_words',
        'total_unique_words',
        'wc_diff',
        'wc_ratio',
        'wc_diff_unique',
        'wc_ratio_unique',
        'same_start_word',
        'tfidf_wm',
    ]
    # length columns
    + [
        'query_length',
        'title_length',
        'query_isin_title',
        'len_A_title',
        'len_R_title',
        'diff_len',
    ]
    # fuzz_* columns
    + [
        'fuzz_qratio',
        'fuzz_WRatio',
        'fuzz_partial_ratio',
        'fuzz_partial_token_set_ratio',
        'fuzz_partial_token_sort_ratio',
        'fuzz_token_set_ratio',
        'fuzz_token_sort_ratio',
    ]
    # wmd and *_distance columns
    + [
        'wmd',
        'cosine_distance',
        'cityblock_distance',
        'canberra_distance',
        'euclidean_distance',
        'minkowski_distance',
        'braycurtis_distance',
    ]
    # skew_* / kur_* columns
    + [
        'skew_q1vec',
        'skew_q2vec',
        'kur_q1vec',
        'kur_q2vec',
    ]
)

In [None]:
# 定义模型
import keras
def model_conv1D_(emb_matrix, max_len, n_features=34, n_classes=4):
    """Build and compile the two-branch 1-D CNN classifier for text pairs.

    Both token sequences pass through the same frozen embedding layer and the
    same six convolution layers (shared weights). Each convolution output is
    average-pooled over time and the pooled vectors are concatenated per
    branch. The branches are compared by absolute difference and element-wise
    product, joined with a dense projection of the extra feature vector, and
    classified by a small MLP with a softmax output.

    Parameters
    ----------
    emb_matrix : array of shape (vocabulary rows, embedding dim)
        Initial (non-trainable) embedding weights.
    max_len : int
        Length of each padded input sequence.
    n_features : int, optional
        Width of the third (feature-vector) input. Defaults to 34, the value
        that was previously hard-coded.
    n_classes : int, optional
        Number of softmax outputs. Defaults to 4, the value that was
        previously hard-coded.

    Returns
    -------
    keras Model
        Compiled with categorical cross-entropy, Adam (lr=0.005) and the
        'acc' metric. Inputs, in order: sequence 1, sequence 2, features.
    """
    # (filters, kernel_size) of each shared convolution layer.
    conv_specs = [(128, 1), (128, 2), (128, 3), (128, 4), (32, 5), (32, 6)]

    # The embedding layer containing the word vectors
    emb_layer = Embedding(
        input_dim=emb_matrix.shape[0],
        output_dim=emb_matrix.shape[1],
        weights=[emb_matrix],
        input_length=max_len,
        trainable=False
    )

    # 1D convolutions that can iterate over the word vectors
    conv_layers = [
        Conv1D(filters=n_filters, kernel_size=width, padding='same', activation='relu')
        for n_filters, width in conv_specs
    ]

    # Define inputs
    seq1 = Input(shape=(max_len,))
    seq2 = Input(shape=(max_len,))

    # Run inputs through embedding
    emb1 = emb_layer(seq1)
    emb2 = emb_layer(seq2)

    # Run both branches through every CONV + global-average-pooling pair.
    pooled_a, pooled_b = [], []
    for conv in conv_layers:
        pooled_a.append(GlobalAveragePooling1D()(conv(emb1)))
        pooled_b.append(GlobalAveragePooling1D()(conv(emb2)))

    mergea = concatenate(pooled_a)
    mergeb = concatenate(pooled_b)

    # Width of each merged branch vector: the sum of all filter counts
    # (4 * 128 + 2 * 32 for the configuration above).
    merged_dim = sum(n_filters for n_filters, _ in conv_specs)

    # We take the explicit absolute difference between the two sentences.
    # Furthermore we multiply the entries to get a different measure of equalness.
    diff = Lambda(lambda x: K.abs(x[0] - x[1]), output_shape=(merged_dim,))([mergea, mergeb])
    mul = Lambda(lambda x: x[0] * x[1], output_shape=(merged_dim,))([mergea, mergeb])

    # Extra feature vector: normalised, then projected to 128 units.
    distance_input = Input(shape=(n_features,))
    distance_dense = BatchNormalization()(distance_input)
    distance_dense = Dense(128, activation='relu')(distance_dense)

    # Merge the feature projection with the difference and product vectors.
    merge = concatenate([diff, mul, distance_dense])

    # The MLP that determines the outcome
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(n_classes, activation='softmax')(x)

    optimizer = keras.optimizers.Adam(lr=0.005)
    model = Model(inputs=[seq1, seq2, distance_input], outputs=pred)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])

    return model

# Instantiate the network and print its layer summary.
model = model_conv1D_(emb_matrix=embedding_matrix, max_len=max_len)
model.summary()

In [None]:
# Class index = Level - 1 (the prediction cell adds the 1 back).
label = np.array(train.Level.tolist()) - 1

# Positional hold-out split: the first n_fit rows for fitting, the rest for validation.
n_fit = 6000
train_query_train, train_query_val = train_query[:n_fit], train_query[n_fit:]
train_title_train, train_title_val = train_title[:n_fit], train_title[n_fit:]
label_train, label_val = label[:n_fit], label[n_fit:]
train_features_train, train_features_val = train[:n_fit][features], train[n_fit:][features]

# Pin the one-hot width to the model's 4 softmax outputs.  Left to infer it,
# to_categorical uses max(label)+1, which yields too few columns whenever the
# highest class happens to be absent from this slice.
label_train = to_categorical(label_train, num_classes=4)

In [None]:
# model.fit([train_query_train, train_title_train, train_features_train],         
#           label_train,
#           batch_size = 64,
#           epochs=5,
#           shuffle=True,
#           )
# Fit on every training row.  Note that this includes the rows sliced off as
# the *_val hold-out in the split cell.
fit_inputs = [train_query, train_title, train[features]]
fit_targets = to_categorical(label)
model.fit(fit_inputs, fit_targets, batch_size=128, epochs=15, shuffle=True)
# 25 good

In [None]:
from sklearn.metrics import f1_score
# Macro-F1 on the hold-out slice.
# NOTE(review): the model.fit call in this notebook trains on the full arrays,
# which contain these hold-out rows, so this score is measured on seen data and
# is presumably optimistic - confirm before relying on it.
val_inputs = [train_query_val, train_title_val, train_features_val]
pred_val = model.predict(val_inputs)
pred_val_labels = np.argmax(pred_val, axis=1)
print(f1_score(label_val, pred_val_labels, average='macro'))

In [None]:
# Predicted class index + 1 gives the Level.
test_inputs = [test_query, test_title, test_need[features]]
preds = np.argmax(model.predict(test_inputs), axis=1) + 1

# `test_need` was produced by filtering `test`; assign() builds a new frame
# instead of writing into that selection, which avoids SettingWithCopyWarning.
test_need = test_need.assign(Level=preds)

In [None]:
# Stack the constant-Level rows and the model-predicted rows into one Guid/Level table.
model_pred = test_need.loc[:, ['Guid', 'Level']]
all_pred = pd.concat([garbage, model_pred], axis=0)

In [None]:
# Swap the Level column of `test` for the combined predictions, matched on Guid.
test = test.drop('Level', axis=1).merge(all_pred, on='Guid', how='left')

In [None]:
# Distribution of the Level values about to be submitted.
test['Level'].value_counts()

In [None]:
import os

# Write the submission as Guid,Level rows with no header and no index.
submission_path = './subs/baseline8_add_features.csv'
# Create the output directory first; to_csv does not create missing folders.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
test[['Guid', 'Level']].to_csv(submission_path, header=False, index=False)

In [None]:
# Value counts of the second column of ./subs/baseline2.csv, for comparison.
pd.read_csv('./subs/baseline2.csv', header=None).loc[:, 1].value_counts()