In [1]:
import numpy as np
import pandas as pd
import scipy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import lightgbm as lgbm

%matplotlib inline

In [2]:
# Read the raw training table, replacing every missing cell with an empty string.
train = pd.read_csv('dataset/train.csv').fillna('')
# Target column used as the label vector for the classifier.
y = train['is_duplicate']

In [3]:
# Load the precomputed feature tables, one CSV per feature group.
# The files are read in the order listed and bound to the names on the left.
(
    fs_basic,
    fs_fuzz,
    fs_w2v_gnews,
    fs_tfidf,
    fs_w2v_glove,
    fs_graph,
    fs_freq,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'feature_store/train_feature_basic.csv',
        'feature_store/train_feature_fuzz.csv',
        'feature_store/train_feature_w2v_gnews.csv',
        'feature_store/train_feature_tfidf.csv',
        'feature_store/train_feature_w2v_glove.csv',
        'feature_store/train_feature_graph.csv',
        'feature_store/train_feature_freq.csv',
    )
)

In [4]:
# Fill missing values: NaNs in the word2vec and TF-IDF feature tables
# are replaced with 0, modifying each table in place.
for feature_table in (fs_w2v_gnews, fs_w2v_glove, fs_tfidf):
    feature_table.fillna(0, inplace=True)

In [26]:
# No explicit column names are provided; this value is passed as
# `feature_name` to both lgbm.Dataset(...) calls in the training cell.
feature_names = None

In [5]:
# Build the design matrix by placing all feature groups side by side
# (column-wise), in this fixed order.
feature_blocks = [
    np.asarray(block)
    for block in (fs_basic, fs_fuzz, fs_w2v_gnews, fs_tfidf, fs_w2v_glove, fs_freq, fs_graph)
]
X = np.concatenate(feature_blocks, axis=1)
# Handle outliers: overwrite +/-inf entries with 0 (NaNs are left untouched).
inf_mask = np.isinf(X)
X[inf_mask] = 0

---

### LightGBM 算法

In [19]:
# LightGBM hyper-parameters for the binary classifier on `is_duplicate`.
params = {
    'boosting_type': 'gbdt',  # boosting type: gradient-boosted decision trees
    'objective': 'binary',  # objective function: binary classification
    'metric': 'binary_logloss',  # evaluation metric reported on the validation set
    'num_leaves': 47,  # number of leaves per tree
    'learning_rate': 0.02,  # learning rate
    'feature_fraction': 0.75,  # fraction of features sampled when building a tree
    'bagging_fraction': 0.8,  # fraction of rows sampled when building a tree
    'bagging_freq': 5,  # k means bagging is performed every k iterations
    'verbose': 0,  # <0: fatal only, =0: errors (warnings), >0: info
    'save_binary': True,
    'min_data_in_leaf': 100,
    'max_bin': 1023,
}

# Hold out 20% of the rows for validation / early stopping.
# The original cell split `mm_X`, a name that is never defined in this
# notebook (hidden kernel state), so the cell failed on a fresh kernel.
# The assembled feature matrix `X` is used instead.
# NOTE(review): `mm_X` was presumably a min-max-scaled copy of X; tree splits
# should be insensitive to per-feature rescaling, but confirm the metric.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

lgbm_train = lgbm.Dataset(X_train, y_train, feature_name=feature_names)
# `reference` makes the validation set reuse the training set's binning.
lgbm_valid = lgbm.Dataset(X_test, y_test, reference=lgbm_train, feature_name=feature_names)

# Train for up to 5000 rounds, stopping once the validation log-loss has not
# improved for 20 rounds; progress is logged every 200 rounds.
# NOTE(review): newer LightGBM releases replace `early_stopping_rounds` and
# `verbose_eval` with callbacks — confirm against the installed version.
bst = lgbm.train(params, lgbm_train, num_boost_round=5000,
                 valid_sets=[lgbm_valid], valid_names=['valid'],
                 early_stopping_rounds=20, verbose_eval=200)

Training until validation scores don't improve for 20 rounds.
[200]	valid's binary_logloss: 0.280147
[400]	valid's binary_logloss: 0.262643
[600]	valid's binary_logloss: 0.257706
[800]	valid's binary_logloss: 0.255385
[1000]	valid's binary_logloss: 0.253837
[1200]	valid's binary_logloss: 0.25279
[1400]	valid's binary_logloss: 0.251856
[1600]	valid's binary_logloss: 0.251167
[1800]	valid's binary_logloss: 0.250642
[2000]	valid's binary_logloss: 0.250081
[2200]	valid's binary_logloss: 0.249656
[2400]	valid's binary_logloss: 0.249262
[2600]	valid's binary_logloss: 0.248836
[2800]	valid's binary_logloss: 0.24853
[3000]	valid's binary_logloss: 0.248289
[3200]	valid's binary_logloss: 0.248069
Early stopping, best iteration is:
[3208]	valid's binary_logloss: 0.248056
