In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils.vis_utils import plot_model
from sklearn.model_selection  import KFold
from sklearn.model_selection import train_test_split
from utils import _save, _load, SaveData
from model_build.dnn_model import *
np.set_printoptions(suppress=True)

Using TensorFlow backend.


In [2]:
# Load the word sequences and the embedding matrix.
# NOTE(review): `_load` is a project helper; the .pkl suffix suggests it
# unpickles the file — only point it at trusted data.
save_data = _load('./dataset/glove_embedding_data.pkl')

# Training question pairs and their labels.
train_pad_1, train_pad_2 = save_data.train_pad_1, save_data.train_pad_2
labels = save_data.labels

# Test question pairs and the ids written to the submission file.
test_pad_1, test_pad_2 = save_data.test_pad_1, save_data.test_pad_2
test_ids = save_data.test_ids

# Embedding weights and vocabulary size handed to the model builder.
embedding_matrix, num_words = save_data.embedding_matrix, save_data.num_words

In [4]:
# Model dimensions are read off the second axis of the loaded arrays.
sequence_length = np.shape(train_pad_1)[1]
embedding_dim = np.shape(embedding_matrix)[1]

In [5]:
# Load the training feature tables, one CSV per feature group.
# The files are read in the order listed and bound to the names on the left.
(
    fs_basic,
    fs_fuzz,
    fs_w2v_gnews,
    fs_tfidf,
    fs_w2v_glove,
    fs_graph,
    fs_freq,
) = map(pd.read_csv, (
    'feature_store/train_feature_basic.csv',
    'feature_store/train_feature_fuzz.csv',
    'feature_store/train_feature_w2v_gnews.csv',
    'feature_store/train_feature_tfidf.csv',
    'feature_store/train_feature_w2v_glove.csv',
    'feature_store/train_feature_graph.csv',
    'feature_store/train_feature_freq.csv',
))

In [6]:
# Replace missing values with 0 in the three feature tables cleaned here.
# The result is rebound to the same name instead of using inplace=True, so the
# cell never mutates an object another cell may still be holding and stays
# safe to re-run.
fs_w2v_gnews = fs_w2v_gnews.fillna(0)
fs_w2v_glove = fs_w2v_glove.fillna(0)
fs_tfidf = fs_tfidf.fillna(0)

In [21]:
# Stack all training feature tables side by side into one matrix.
# The order here fixes the column order of X.
train_feature_tables = [
    fs_basic,
    fs_fuzz,
    fs_w2v_gnews,
    fs_tfidf,
    fs_w2v_glove,
    fs_freq,
    fs_graph,
]
X = np.hstack(train_feature_tables)

# Zero out every +inf / -inf entry.
inf_mask = np.isinf(X)
X[inf_mask] = 0

In [22]:
# Hold out 20% of the rows for validation.
#
# train_test_split accepts several arrays and applies the same shuffled row
# indices to each of them, so both padded sequences, the feature matrix and the
# labels stay row-aligned without first being stacked into one wide matrix and
# sliced apart again. That avoids a full copy of the data and lets every array
# keep its own dtype (the stacked matrix forced a single common dtype on the
# sequence columns). With the same random_state and row count the selected
# rows are the same as before.
#
# The intermediate names all_X / X_train / X_valid are no longer created.
(
    X_train_1, X_valid_1,
    X_train_2, X_valid_2,
    X_train_fs, X_valid_fs,
    y_train, y_valid,
) = train_test_split(
    train_pad_1,
    train_pad_2,
    X,
    np.array(labels),
    test_size=0.2,
    random_state=0,
)

### 模型实验

In [5]:
# Build the network from the project's model factory.
# feature_num is derived from the feature matrix that is later passed to
# fit(), instead of being hard-coded to 52, so adding or removing a feature
# CSV cannot silently desynchronise the model from its input.
# NOTE(review): assumes feature_num is the width of the third (feature) input —
# confirm against build_model_lstm_cnn_fs_v2.
model = build_model_lstm_cnn_fs_v2(
    feature_num=X_train_fs.shape[1],
    num_words=num_words,
    embedding_dim=embedding_dim,
    embedding_matrix=embedding_matrix,
    max_sequence_length=sequence_length,
    rate_drop_dense=0.4,
    rate_drop_lstm=0.2
)

In [6]:
# Print the layer table (output shapes, parameter counts, connections) as a sanity check.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 30)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 30, 300)      1329300     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 30, 64)       19264       embedding_1[0][0]                
          

In [61]:
# File that receives the best weights found during training.
best_model_path = 'model_store/lstm_cnn_fs_v2.h5'

# Stop training after 5 epochs without an improvement in val_loss.
early_stopping = EarlyStopping(patience=5, monitor='val_loss')

# Write a checkpoint only when the model improves, storing weights only.
model_checkpoint = ModelCheckpoint(
    filepath=best_model_path,
    save_weights_only=True,
    save_best_only=True,
)

In [62]:
# The model takes three inputs: both padded sequences plus the feature matrix.
train_inputs = [X_train_1, X_train_2, X_train_fs]
valid_inputs = [X_valid_1, X_valid_2, X_valid_fs]

# Train for at most 50 epochs; the callbacks handle early stopping and
# checkpointing. verbose=2 logs one line per epoch.
hist = model.fit(
    train_inputs,
    y_train,
    validation_data=(valid_inputs, y_valid),
    batch_size=512,
    epochs=50,
    shuffle=True,
    verbose=2,
    callbacks=[early_stopping, model_checkpoint],
)

Train on 323432 samples, validate on 80858 samples
Epoch 1/50
 - 64s - loss: 0.3028 - acc: 0.8564 - val_loss: 0.2598 - val_acc: 0.8780
Epoch 2/50
 - 63s - loss: 0.2603 - acc: 0.8806 - val_loss: 0.2373 - val_acc: 0.8920
Epoch 3/50
 - 63s - loss: 0.2457 - acc: 0.8883 - val_loss: 0.2323 - val_acc: 0.8948
Epoch 4/50
 - 65s - loss: 0.2351 - acc: 0.8943 - val_loss: 0.2305 - val_acc: 0.8946
Epoch 5/50
 - 65s - loss: 0.2263 - acc: 0.8982 - val_loss: 0.2221 - val_acc: 0.8999
Epoch 6/50
 - 65s - loss: 0.2182 - acc: 0.9022 - val_loss: 0.2195 - val_acc: 0.9013
Epoch 7/50
 - 64s - loss: 0.2128 - acc: 0.9058 - val_loss: 0.2183 - val_acc: 0.9025
Epoch 8/50
 - 66s - loss: 0.2067 - acc: 0.9084 - val_loss: 0.2178 - val_acc: 0.9027
Epoch 9/50
 - 65s - loss: 0.2015 - acc: 0.9105 - val_loss: 0.2189 - val_acc: 0.9026
Epoch 10/50
 - 64s - loss: 0.1975 - acc: 0.9125 - val_loss: 0.2160 - val_acc: 0.9043
Epoch 11/50
 - 63s - loss: 0.1935 - acc: 0.9144 - val_loss: 0.2176 - val_acc: 0.9030
Epoch 12/50
 - 64s - lo

In [40]:
# Load the test feature tables, one CSV per feature group.
(
    fs_basic_test,
    fs_fuzz_test,
    fs_w2v_gnews_test,
    fs_tfidf_test,
    fs_w2v_glove_test,
    fs_graph_test,
    fs_freq_test,
) = map(pd.read_csv, (
    'feature_store/test_feature_basic.csv',
    'feature_store/test_feature_fuzz.csv',
    'feature_store/test_feature_w2v_gnews.csv',
    'feature_store/test_feature_tfidf.csv',
    'feature_store/test_feature_w2v_glove.csv',
    'feature_store/test_feature_graph.csv',
    'feature_store/test_feature_freq.csv',
))

# Fill missing values with 0. The result is rebound rather than modified with
# inplace=True so the cell stays idempotent and does not mutate shared objects.
fs_w2v_gnews_test = fs_w2v_gnews_test.fillna(0)
fs_w2v_glove_test = fs_w2v_glove_test.fillna(0)
fs_tfidf_test = fs_tfidf_test.fillna(0)

# Concatenate the feature tables column-wise; the order must match the
# training feature matrix.
X_test = np.hstack((fs_basic_test, fs_fuzz_test, fs_w2v_gnews_test, fs_tfidf_test, fs_w2v_glove_test, fs_freq_test, fs_graph_test))
# Handle abnormal values: zero out +inf / -inf entries.
X_test[np.isinf(X_test)] = 0

In [None]:
# Restore the checkpointed weights and score the test pairs.
model.load_weights(best_model_path)
preds = model.predict([test_pad_1, test_pad_2, X_test], batch_size=2048, verbose=2)
submission = pd.DataFrame({'test_id':test_ids,'is_duplicate':preds.ravel()})

# Re-weight the predicted probabilities: p -> a*p / (a*p + b*(1-p)).
# NOTE(review): the two constants look like positive-class rates of the target
# distribution (0.1743) and the training set (0.3692), which would make this a
# class-prior correction — confirm where they come from.
a = 0.174264424749 / 0.369197853026
b = (1 - 0.174264424749) / (1 - 0.369197853026)

# Vectorised over the whole column instead of a Python-level lambda per row.
# The cast to float64 keeps the arithmetic in double precision regardless of
# the dtype the model returns.
raw_prob = submission['is_duplicate'].astype('float64')
submission['is_duplicate'] = a * raw_prob / (a * raw_prob + b * (1 - raw_prob))
submission.to_csv('submission/lstm_cnn_fs_v2.csv', index=False)