In [1]:
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import sklearn
import xgboost as xgb
from sklearn.metrics import classification_report

In [2]:
# Training split: tab-separated file without a header row, so the column
# names are assigned by hand right after reading.
train_data = pd.read_csv("../data/ori_data/train_process.csv", sep="\t", header=None)
train_data.columns = ["label", "sent_1_seg", "sent_2_seg", "id"]
trainL = train_data["label"].values

# Dev split: same layout. The frame is called `test_data`, but it is read
# from the dev file and its labels are exposed as `devL`.
test_data = pd.read_csv("../data/ori_data/dev_process.csv", sep="\t", header=None)
test_data.columns = ["label", "sent_1_seg", "sent_2_seg", "id"]
devL = test_data["label"].values

In [3]:
import pickle

In [5]:
# load the tfkdl data matrix
# The files are opened through context managers so the handles are closed
# as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
# NOTE(review): `train_matrix` / `dev_matrix` are reassigned by the
# make_matrix cell further down, so these values only matter if that cell
# is skipped.
with open("../data/ori_data/dr100.tfkdl.train.matrix", "rb") as train_file:
    train_matrix = pickle.load(train_file)
with open("../data/ori_data/dr100.tfkdl.dev.matrix", "rb") as dev_file:
    dev_matrix = pickle.load(dev_file)

In [4]:
# load the tfidf vector
# The files are opened through context managers so the handles are closed
# as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open("../data/ori_data/ngram1.train.tfidf.pca.data", "rb") as train_file:
    pca_train_corpus = pickle.load(train_file)
with open("../data/ori_data/ngram1.dev.tfidf.pca.data", "rb") as dev_file:
    dev_pca_tfidf_vector = pickle.load(dev_file)

In [5]:
def make_matrix(datasetM):
    """
    Build one feature row per pair from a matrix of two stacked halves.

    Row ``i`` of the first half of ``datasetM`` is paired with row ``i`` of
    the second half. Each output row is the element-wise sum of the two
    rows followed by their element-wise absolute difference.

    :param datasetM: 2-D array-like with ``2 * n_pairs`` rows. If the row
        count is odd, the final row has no partner and is ignored.
    :return: numpy array of shape ``(n_pairs, 2 * dim)``; an input without
        any complete pair yields an empty array of shape ``(0, 2 * dim)``.
    """
    vectors = np.asarray(datasetM)
    n_pairs = vectors.shape[0] // 2

    # Slice both halves once and combine them with whole-array operations
    # instead of looping over the rows in Python.
    first_half = vectors[:n_pairs]
    second_half = vectors[n_pairs:2 * n_pairs]

    return np.concatenate(
        (first_half + second_half, np.abs(first_half - second_half)),
        axis=1,
    )

# Turn the stacked tf-idf/PCA vectors of each split into per-pair feature
# rows. This rebinds `train_matrix` and `dev_matrix`.
train_matrix, dev_matrix = (
    make_matrix(stacked_vectors)
    for stacked_vectors in (pca_train_corpus, dev_pca_tfidf_vector)
)

In [10]:
# specify parameters via map
xgb_params = {
    "max_depth": 5,
    "learning_rate": 0.01,
    "n_estimators": 1000,
    "silent": 0,
    "objective": "binary:logistic",
    "booster": "gbtree",
    "n_jobs": 1,
    "nthread": None,
    "gamma": 0,
    "min_child_weight": 2,
    "max_delta_step": 0,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "colsample_bylevel": 1,
    "reg_alpha": 0,
    "reg_lambda": 1,
    "scale_pos_weight": 4,
    "base_score": 0.5,
    "random_state": 0,
    "seed": None,
    "missing": None,
}
xgb_classifier = xgb.XGBClassifier(**xgb_params)

# AUC is reported on both splits every round; per the training log, early
# stopping follows the last entry (the dev split).
evaluation_sets = [(train_matrix, trainL), (dev_matrix, devL)]
xgb_classifier.fit(
    train_matrix,
    trainL,
    eval_set=evaluation_sets,
    early_stopping_rounds=10,
    eval_metric="auc",
)

[0]	validation_0-auc:0.614158	validation_1-auc:0.59649
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.639898	validation_1-auc:0.615047
[2]	validation_0-auc:0.648081	validation_1-auc:0.621706
[3]	validation_0-auc:0.657643	validation_1-auc:0.628596
[4]	validation_0-auc:0.657117	validation_1-auc:0.625219
[5]	validation_0-auc:0.659886	validation_1-auc:0.625843
[6]	validation_0-auc:0.661741	validation_1-auc:0.627239
[7]	validation_0-auc:0.663142	validation_1-auc:0.627727
[8]	validation_0-auc:0.664263	validation_1-auc:0.628399
[9]	validation_0-auc:0.66477	validation_1-auc:0.629619
[10]	validation_0-auc:0.665416	validation_1-auc:0.630043
[11]	validation_0-auc:0.666788	validation_1-auc:0.630884
[12]	validation_0-auc:0.668078	validation_1-auc:0.63241
[13]	validation_0-auc:0.670553	validation_1-auc:0.634313
[14]	validation_0-auc:0.670894	validation_1-auc:0.635478
[

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=5, min_child_weight=2, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=4, seed=None, silent=0,
       subsample=0.8)

In [11]:
# make prediction
# XGBClassifier.predict() already returns hard class labels, so applying a
# 0.5 cut-off to its output changes nothing. Use predict_proba() instead if
# a tunable decision threshold is wanted.
# NOTE(review): depending on the installed xgboost version, predict() may
# use every trained tree rather than the early-stopping best iteration --
# confirm for this environment.
preds = xgb_classifier.predict(dev_matrix)
pred_label = preds.astype(int)

# Per-class precision / recall / F1 on the dev split.
print(classification_report(devL, pred_label))

             precision    recall  f1-score   support

          0       0.88      0.78      0.83     16751
          1       0.35      0.53      0.42      3744

avg / total       0.78      0.73      0.75     20495



  if diff:


In [None]:
# (removed) A stray `pickle.load()` call with no file argument lived here.
# It was never executed and raises TypeError, which breaks Restart & Run All.

In [13]:
# Load the additional feature matrices for the train and dev splits. They
# are concatenated with the tf-idf pair features in a later cell.
# NOTE(review): if these ".data" files are pickled arrays rather than .npy
# files, recent NumPy versions need allow_pickle=True -- confirm.
feature_added_train_datamatrix, feature_added_dev_datamatrix = (
    np.load(feature_path)
    for feature_path in (
        "../data/ori_data/train.featurematrix.data",
        "../data/ori_data/dev.featurematrix.data",
    )
)

In [14]:
# Show the (rows, columns) dimensions of the loaded train feature matrix.
feature_added_train_datamatrix.shape

(81981, 74)

In [7]:
# Put the tf-idf pair features and the additional features side by side
# (column-wise) for each split; both parts must have the same row count.
train_feature_parts = [train_matrix, feature_added_train_datamatrix]
dev_feature_parts = [dev_matrix, feature_added_dev_datamatrix]

whole_train_datamatrix = np.concatenate(train_feature_parts, axis=1)
whole_dev_datamatrix = np.concatenate(dev_feature_parts, axis=1)

In [8]:
# Check the (rows, columns) dimensions of the combined train matrix.
print(whole_train_datamatrix.shape)

(81981, 274)


In [9]:
# specify parameters via map
xgb_params = {
    "max_depth": 7,
    "learning_rate": 0.01,
    "n_estimators": 1000,
    "silent": 0,
    "objective": "binary:logistic",
    "booster": "gbtree",
    "n_jobs": 1,
    "nthread": None,
    "gamma": 0,
    "min_child_weight": 2,
    "max_delta_step": 0,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "colsample_bylevel": 1,
    "reg_alpha": 0,
    "reg_lambda": 1,
    "scale_pos_weight": 4,
    "base_score": 0.5,
    "random_state": 0,
    "seed": None,
    "missing": None,
}
xgb_classifier = xgb.XGBClassifier(**xgb_params)

# AUC is reported on both splits every round; per the training log, early
# stopping follows the last entry (the dev split).
evaluation_sets = [(whole_train_datamatrix, trainL), (whole_dev_datamatrix, devL)]
xgb_classifier.fit(
    whole_train_datamatrix,
    trainL,
    eval_set=evaluation_sets,
    early_stopping_rounds=50,
    eval_metric="auc",
)

[0]	validation_0-auc:0.770342	validation_1-auc:0.725927
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.78272	validation_1-auc:0.735597
[2]	validation_0-auc:0.787168	validation_1-auc:0.739238
[3]	validation_0-auc:0.788947	validation_1-auc:0.742101
[4]	validation_0-auc:0.790352	validation_1-auc:0.743393
[5]	validation_0-auc:0.791071	validation_1-auc:0.744348
[6]	validation_0-auc:0.791839	validation_1-auc:0.744129
[7]	validation_0-auc:0.793152	validation_1-auc:0.744504
[8]	validation_0-auc:0.793885	validation_1-auc:0.744871
[9]	validation_0-auc:0.795561	validation_1-auc:0.745871
[10]	validation_0-auc:0.795785	validation_1-auc:0.745999
[11]	validation_0-auc:0.796141	validation_1-auc:0.746095
[12]	validation_0-auc:0.797379	validation_1-auc:0.747022
[13]	validation_0-auc:0.799463	validation_1-auc:0.749416
[14]	validation_0-auc:0.799786	validation_1-auc:0.749627

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=7, min_child_weight=2, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=4, seed=None, silent=0,
       subsample=0.9)

In [10]:
# make prediction
# XGBClassifier.predict() already returns hard class labels, so applying a
# 0.5 cut-off to its output changes nothing. Use predict_proba() instead if
# a tunable decision threshold is wanted.
# NOTE(review): depending on the installed xgboost version, predict() may
# use every trained tree rather than the early-stopping best iteration --
# confirm for this environment.
preds = xgb_classifier.predict(whole_dev_datamatrix)
pred_label = preds.astype(int)

# Per-class precision / recall / F1 on the dev split.
print(classification_report(devL, pred_label))

             precision    recall  f1-score   support

          0       0.90      0.82      0.86     16751
          1       0.43      0.61      0.50      3744

avg / total       0.82      0.78      0.79     20495



  if diff:


In [19]:
# Persist the fitted classifier. The context manager guarantees the file is
# flushed and closed once the dump finishes. Pickle protocol 2 is kept as in
# the original call.
with open("../data/m_result/tfidf_featured.xgb.model", "wb") as model_file:
    pickle.dump(xgb_classifier, model_file, 2)

In [18]:
# Display the predictions returned by the most recent predict() call.
preds

array([0, 0, 1, ..., 0, 0, 1])