In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import scipy
import pickle
import time
from utils import _save, _load, SaveData

In [58]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils.vis_utils import plot_model
from model_build.dnn_model import *

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgbm

In [4]:
# Load the word sequences and the embedding matrix from the saved bundle
save_data = _load('./dataset/glove_embedding_data.pkl')

# Training sequence pairs and their labels
train_pad_1, train_pad_2 = save_data.train_pad_1, save_data.train_pad_2
labels = save_data.labels

# Test sequence pairs and their ids
test_pad_1, test_pad_2 = save_data.test_pad_1, save_data.test_pad_2
test_ids = save_data.test_ids

# Embedding matrix and the matching word count
embedding_matrix, num_words = save_data.embedding_matrix, save_data.num_words

In [6]:
# Load the pre-computed training feature tables, one CSV per feature family
train_feature_files = (
    'feature_store/train_feature_basic.csv',
    'feature_store/train_feature_fuzz.csv',
    'feature_store/train_feature_w2v_gnews.csv',
    'feature_store/train_feature_tfidf.csv',
    'feature_store/train_feature_w2v_glove.csv',
    'feature_store/train_feature_graph.csv',
    'feature_store/train_feature_freq.csv',
)
(fs_basic, fs_fuzz, fs_w2v_gnews, fs_tfidf,
 fs_w2v_glove, fs_graph, fs_freq) = [pd.read_csv(csv_path) for csv_path in train_feature_files]

In [7]:
# Fill missing values with 0 in the word-vector and TF-IDF feature tables
for feature_frame in (fs_w2v_gnews, fs_w2v_glove, fs_tfidf):
    feature_frame.fillna(0, inplace=True)

In [8]:
# Concatenate every feature table side by side into one training matrix
train_feature_blocks = [fs_basic, fs_fuzz, fs_w2v_gnews, fs_tfidf, fs_w2v_glove, fs_freq, fs_graph]
X = np.hstack(train_feature_blocks)
# Handle outliers: overwrite infinite entries with 0
inf_mask = np.isinf(X)
X[inf_mask] = 0

In [9]:
# Load the pre-computed test feature tables, one CSV per feature family
test_feature_files = (
    'feature_store/test_feature_basic.csv',
    'feature_store/test_feature_fuzz.csv',
    'feature_store/test_feature_w2v_gnews.csv',
    'feature_store/test_feature_tfidf.csv',
    'feature_store/test_feature_w2v_glove.csv',
    'feature_store/test_feature_graph.csv',
    'feature_store/test_feature_freq.csv',
)
(fs_basic_test, fs_fuzz_test, fs_w2v_gnews_test, fs_tfidf_test,
 fs_w2v_glove_test, fs_graph_test, fs_freq_test) = [pd.read_csv(csv_path) for csv_path in test_feature_files]

In [10]:
# Fill missing values with 0 in the word-vector and TF-IDF test feature tables
for feature_frame in (fs_w2v_gnews_test, fs_w2v_glove_test, fs_tfidf_test):
    feature_frame.fillna(0, inplace=True)

In [11]:
# Concatenate every test feature table side by side, in the same order as the training matrix
test_feature_blocks = [fs_basic_test, fs_fuzz_test, fs_w2v_gnews_test, fs_tfidf_test,
                       fs_w2v_glove_test, fs_freq_test, fs_graph_test]
X_test = np.hstack(test_feature_blocks)
# Handle outliers: overwrite infinite entries with 0
inf_mask_test = np.isinf(X_test)
X_test[inf_mask_test] = 0

---

In [27]:
# Train LightGBM for one fold and save the model and its predictions
def train_lgbm(X_train, y_train, X_valid, y_valid, fold_round, time_dict):
    """Fit a LightGBM binary classifier on one fold and persist its artefacts.

    Trains on (X_train, y_train) with early stopping evaluated on
    (X_valid, y_valid), then writes three files under
    ``./model_store/lgbm_model/`` (each suffixed with ``fold_round``):
    the validation predictions, the predictions for the notebook-level
    ``X_test`` / ``test_ids``, and the trained booster.

    Args:
        X_train, y_train: training features and labels for this fold.
        X_valid, y_valid: held-out features and labels for this fold.
        fold_round: fold identifier used in the output file names.
        time_dict: dict whose ``'lgbm'`` list receives the elapsed seconds.

    Returns:
        None. Results are written to disk and ``time_dict`` is mutated.
    """
    started_at = time.time()

    booster_params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',  # objective function
        'metric': 'binary_logloss',  # evaluation metric
        'num_leaves': 47,  # number of leaves per tree
        'learning_rate': 0.02,  # learning rate
        'feature_fraction': 0.75,  # fraction of features sampled when building a tree
        'bagging_fraction': 0.8,  # fraction of rows sampled when building a tree
        'bagging_freq': 5,  # k means bagging is performed every k iterations
        'verbose': 0,  # <0 fatal only, =0 errors (warnings), >0 info
        'save_binary': True,
        'min_data_in_leaf': 100,
        'max_bin': 1023,
    }

    train_set = lgbm.Dataset(X_train, y_train)
    valid_set = lgbm.Dataset(X_valid, y_valid, reference=train_set)

    booster = lgbm.train(
        booster_params,
        train_set,
        num_boost_round=4000,
        valid_sets=valid_set,
        valid_names='valid',
        early_stopping_rounds=20,
        verbose_eval=False,
    )
    # Both prediction passes use the iteration selected by early stopping
    best_iter = booster.best_iteration

    # Validation predictions for this fold
    valid_scores = booster.predict(X_valid, num_iteration=best_iter)
    valid_frame = pd.DataFrame({'y_true':y_valid,'y_pred':valid_scores})
    valid_frame.to_csv('./model_store/lgbm_model/lgbm_pred_{}.csv'.format(fold_round), index=False)

    # Test predictions for this fold (X_test and test_ids come from the notebook scope)
    test_scores = booster.predict(X_test, num_iteration=best_iter)
    fold_submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':test_scores})
    fold_submission.to_csv('./model_store/lgbm_model/lgbm_submission_{}.csv'.format(fold_round), index=False)

    booster.save_model('./model_store/lgbm_model/lgbm_{}.model'.format(fold_round))

    time_dict['lgbm'].append(time.time() - started_at)

In [75]:
# Train the deep network for one fold; save the weights, the training history and the predictions
def train_DNN(seq_train_1, seq_train_2, X_train, y_train,
              seq_valid_1, seq_valid_2, X_valid, y_valid,
              fold_round, time_dict):
    """Train the network built by ``build_model_lstm_cnn_fs_v2`` on one fold.

    The model takes three inputs (two sequence arrays plus the feature
    matrix). Training uses early stopping on ``val_loss`` and checkpoints the
    best weights, which are reloaded before predicting. Files written under
    ``model_store/dnn_model/`` (each suffixed with ``fold_round``): the
    checkpointed weights, the training history, the validation predictions
    and the predictions for the notebook-level test data.

    Reads from notebook scope: ``embedding_matrix``, ``num_words``,
    ``test_pad_1``, ``test_pad_2``, ``X_test`` and ``test_ids``.

    Args:
        seq_train_1, seq_train_2: training sequence arrays for the two inputs.
        X_train, y_train: training feature matrix and labels.
        seq_valid_1, seq_valid_2: validation sequence arrays.
        X_valid, y_valid: validation feature matrix and labels.
        fold_round: fold identifier used in the output file names.
        time_dict: dict whose ``'DNN'`` list receives the elapsed seconds.

    Returns:
        None. Results are written to disk and ``time_dict`` is mutated.
    """
    start = time.time()
    sequence_length = seq_train_1.shape[1]
    embedding_dim = embedding_matrix.shape[1]

    model = build_model_lstm_cnn_fs_v2(
        # Take the feature width from the argument actually trained on,
        # not from the notebook-level X, so the function has no hidden
        # dependency on that global.
        feature_num=X_train.shape[1],
        num_words=num_words,
        embedding_dim=embedding_dim,
        embedding_matrix=embedding_matrix,
        max_sequence_length=sequence_length,
        rate_drop_lstm=0.2,
        rate_drop_dense=0.4
    )

    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    best_model_path = 'model_store/dnn_model/dnn_model_{}.h5'.format(fold_round)
    # Only the best weights seen so far are kept on disk
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

    # NOTE(review): the original call passed class_weight='auto'. Keras
    # documents class_weight as an optional dict, so the string is not a
    # supported value (older releases ignore it, newer ones reject it).
    # It is omitted here, i.e. no class re-weighting is applied - confirm
    # this matches the intended training setup.
    hist = model.fit(
        [seq_train_1, seq_train_2, X_train], y_train,
        validation_data=([seq_valid_1, seq_valid_2, X_valid], y_valid),
        epochs=50,
        batch_size=512,
        shuffle=True,
        callbacks=[early_stopping, model_checkpoint],
        verbose=0)

    # Persist the per-epoch metrics recorded during training
    df_hist = pd.DataFrame(hist.history)
    df_hist.to_csv('model_store/dnn_model/dnn_model_history_{}.csv'.format(fold_round), index=False)

    # Predict with the checkpointed best weights rather than the last epoch's
    model.load_weights(best_model_path)
    y_pred = model.predict([seq_valid_1, seq_valid_2, X_valid], batch_size=2048, verbose=0)
    df_pred = pd.DataFrame({'y_true':y_valid,'y_pred':y_pred.ravel()})
    df_pred.to_csv('./model_store/dnn_model/dnn_pred_{}.csv'.format(fold_round), index=False)

    # Test predictions for this fold (test inputs come from the notebook scope)
    y_pred_test = model.predict([test_pad_1, test_pad_2, X_test], batch_size=2048, verbose=0)
    submission = pd.DataFrame({"test_id": test_ids, "is_duplicate": y_pred_test.ravel()})
    submission.to_csv('./model_store/dnn_model/dnn_submission_{}.csv'.format(fold_round), index=False)

    time_dict['DNN'].append(time.time() - start)

In [69]:
# 10-fold stratified cross-validation splitter (shuffled, fixed random_state)
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=2019,
)

In [76]:
# Elapsed training seconds per model; each train_* call appends one entry per fold.
# (The original literal was missing the comma between the two entries, which
# made the whole cell a SyntaxError.)
time_dict = {
    'lgbm': [],
    'DNN': [],
}
y = np.array(labels)
for fold_round, fold_indexs in enumerate(skf.split(X, y)):
    print('Round {} start···························'.format(fold_round))
    train_index, valid_index = fold_indexs

    # Training part of this fold
    seq_train_1 = train_pad_1[train_index]
    seq_train_2 = train_pad_2[train_index]
    X_train = X[train_index]
    y_train = y[train_index]

    # Validation part of this fold
    seq_valid_1 = train_pad_1[valid_index]
    seq_valid_2 = train_pad_2[valid_index]
    X_valid = X[valid_index]
    y_valid = y[valid_index]

    # Train LightGBM
    print('LightGBM train start·····')
    train_lgbm(X_train, y_train, X_valid, y_valid, fold_round, time_dict)
    # Train the DNN
    print('DNN train start·····')
    train_DNN(seq_train_1, seq_train_2, X_train, y_train,
              seq_valid_1, seq_valid_2, X_valid, y_valid,
              fold_round, time_dict)

# Save the per-fold timings, one column per model
df_time = pd.DataFrame(time_dict)
df_time.to_csv('model_store/10fold_time_dnn.csv', index=False)
# Average the per-fold test predictions of each model into a single submission.
# The fold count is taken from the splitter instead of a hard-coded 10 so the
# loop range and the divisor cannot drift from the number of per-fold files
# written during cross-validation.
n_folds = skf.get_n_splits()
lgbm_submission = pd.DataFrame({'test_id':test_ids})
dnn_submission = pd.DataFrame({'test_id':test_ids})
lgbm_submission['is_duplicate'] = 0
dnn_submission['is_duplicate'] = 0
for i in range(n_folds):
    # Accumulate the predictions each fold's model produced for the test set
    lgbm_part = pd.read_csv('./model_store/lgbm_model/lgbm_submission_{}.csv'.format(i))
    dnn_part = pd.read_csv('./model_store/dnn_model/dnn_submission_{}.csv'.format(i))
    lgbm_submission['is_duplicate'] = lgbm_submission.is_duplicate + lgbm_part.is_duplicate
    dnn_submission['is_duplicate'] = dnn_submission.is_duplicate + dnn_part.is_duplicate

# Turn the accumulated sums into means
lgbm_submission['is_duplicate'] /= n_folds
dnn_submission['is_duplicate'] /= n_folds
lgbm_submission.to_csv('submission/lgbm_10stack_submission.csv', index=False)
dnn_submission.to_csv('submission/dnnV2_10stack_submission.csv', index=False)
# Convert the distribution of the averaged predictions.
# NOTE(review): the two constants look like a target positive rate (0.174...)
# and a training positive rate (0.369...) - confirm their provenance.
a = 0.174264424749 / 0.369197853026
b = (1 - 0.174264424749) / (1 - 0.369197853026)


def _convert_distribution(p):
    """Map a single probability p to a*p / (a*p + b*(1 - p))."""
    return a * p / (a * p + b * (1 - p))


lgbm_submission['is_duplicate'] = lgbm_submission['is_duplicate'].apply(_convert_distribution)
dnn_submission['is_duplicate'] = dnn_submission['is_duplicate'].apply(_convert_distribution)
lgbm_submission.to_csv('submission/lgbm_10stack_submission_trans.csv', index=False)
dnn_submission.to_csv('submission/dnnV2_10stack_submission_trans.csv', index=False)

Round 0 start···························
DNN train start·····
Round 1 start···························
DNN train start·····
Round 2 start···························
DNN train start·····
Round 3 start···························
DNN train start·····
Round 4 start···························
DNN train start·····
Round 5 start···························
DNN train start·····
Round 6 start···························
DNN train start·····
Round 7 start···························
DNN train start·····
Round 8 start···························
DNN train start·····
Round 9 start···························
DNN train start·····
