In [1]:
import time
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import KFold



In [2]:
# 加上词向量特征
w2v_feature = pd.read_csv('./data/w2v_feature.csv', header=0, encoding='utf-8')

In [3]:
# 加上freq_feature
# freq_feature = pd.read_csv('./data/freq_feature.csv', header=0, encoding='utf-8')

In [4]:
all_data = pd.read_csv('./data/all_data', sep=' ', header=None, encoding='utf-8')
all_data.columns = ['id', 'sent1', 'sent2', 'label']
feature_data = pd.read_csv('./data/feature_table.csv', header=0, encoding='utf-8')

In [5]:
feature_data = pd.concat([feature_data, w2v_feature], axis=1)

In [6]:
# feature_data.drop(columns=['q1_hash', 'q2_hash'], inplace=True)

In [7]:
feature_data.columns.values.tolist()
feature_data.head()

Unnamed: 0,shared_word,tfidf_shared,tfidf_dif,word_len1,word_len2,char_len1,char_len2,length_dif,length_dif_rate,common_words,...,dup_sent_3,dup_sent_4,ngram_jac_1,ngram_jac_2,ngram_jac_3,ngram_di_1,ngram_di_2,ngram_di_3,w2v_cos,w2v_idf_cos
0,0.545455,0.577535,11.627644,4,4,7,8,1,0.875,4,...,9,1,0.375,0.25,0.125,0.545455,0.4,0.222222,0.817394,0.810542
1,0.580645,0.669126,7.722772,5,5,11,10,1,0.909091,6,...,1,1,0.409091,0.26087,0.125,0.580645,0.413793,0.222222,0.87986,0.918389
2,0.774194,0.669747,24.605322,6,7,10,13,3,0.769231,9,...,1,1,0.666667,0.318182,0.173913,0.8,0.482759,0.296296,0.921079,0.909509
3,0.62069,0.539571,0.362998,7,6,11,10,1,0.909091,6,...,4,1,0.473684,0.35,0.25,0.642857,0.518519,0.4,0.876567,0.839398
4,0.521739,0.307486,8.16671,5,5,8,10,2,0.8,5,...,8,1,0.352941,0.3125,0.266667,0.521739,0.47619,0.421053,0.896401,0.748673


In [8]:
all_x = feature_data.values
all_y = all_data.label.values

In [9]:
# Feature selection based on ANOVA-f score 特征选择(降维)
# selectK = SelectKBest(f_classif, k=30)
# selectK.fit(all_x, all_y)
# all_x = selectK.transform(all_x)

In [10]:
x_test = all_x[:10000]
y_test = all_y[:10000]
x = all_x[10000:]
y = all_y[10000:]

In [41]:
xgb_train = xgb.DMatrix(data=x, label=y)
xgb_test = xgb.DMatrix(data=x_test, label=y_test)

In [42]:
# F1值度量方法
def threshold(i):
    if i > 0.20:
        return 1.0
    else:
        return 0.0
def f1_metric(y_pred, train_data):
    y_true = train_data.get_label()
    #y_pred = np.round(y_pred)
    y_pred = list(map(threshold, y_pred))
    return 'F1', f1_score(y_true, y_pred)

In [43]:
# parameters = {
#             'booster':'gbtree',
#             'objective':'binary:logistic',
#             'eta': list(np.linspace(1, 9, 17)),
#             'max_depth':list(range(3, 10, 1)),
#             'subsample':list(np.linspace(0.5, 1, 6)),
#             'min_child_weight': list(range(1,10,1)),
#             'colsample_bytree':list(np.linspace(0.5, 1, 6)),
#             'scale_pos_weight':list(np.linspace(0, 0.5, 6)),
#             'eval_metric':'logloss',
#             'gamma':list(np.linspace(0, 1, 11))          
# }
parameters = {
            'booster':'gbtree',
            'objective':'binary:logistic',
            'eta':0.2,
            'max_depth':10,
            'subsample':1.0,
            'min_child_weight':2,
            'colsample_bytree':0.8,
            'scale_pos_weight':0.5,
            'eval_metric':'logloss',
            'gamma':0.2,            
            'lambda':0
}

In [None]:
watchlist = [(xgb_train,'train'),(xgb_test,'val')]
xgb_model = xgb.train(params=parameters,
                      dtrain=xgb_train,
                      num_boost_round=5000,
                      evals=watchlist,
                      early_stopping_rounds=100,
                      feval=f1_metric,
                      )

In [11]:
cnn_model = pd.read_csv('./training_data/cnn_model.csv', header=0)  # cnn预测结果

In [12]:
rnn_model = pd.read_csv('./training_data/rnn_model.csv', header=0)  # rnn预测结果

In [13]:
ntrain = len(x)
ntest = len(x_test)
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
k_fold = KFold(len(x), n_folds=NFOLDS, random_state=SEED)

In [14]:
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)

In [15]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(k_fold):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [16]:
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [17]:
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [18]:
y_train = y
x_train = x
x_test = x_test

In [19]:
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees

In [20]:
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest

  warn("Warm-start fitting without increasing n_estimators does not "


In [30]:
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 

In [31]:
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

In [None]:
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier