In [1]:
import numpy as np
import scipy
import sklearn
import xgboost as xgb
import re
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Both splits are tab-separated files without a header row.
read_options = {"sep": "\t", "header": None}
column_names = ["label", "sent_1_seg", "sent_2_seg", "id"]

train_data = pd.read_csv("../data/ori_data/train_process.csv", **read_options)
# `test_data` is read from the dev split file.
test_data = pd.read_csv("../data/ori_data/dev_process.csv", **read_options)

# Give both frames the same four column names (each frame gets its own list).
train_data.columns = list(column_names)
test_data.columns = list(column_names)

# Label columns as plain arrays for training / evaluation.
trainL, devL = train_data["label"].values, test_data["label"].values

In [3]:
import pickle

In [5]:
# load the tfkdl data matrix
# The files are opened in context managers so the handles are closed right
# after unpickling (previously they were left open).
# NOTE(review): pickle can execute arbitrary code while loading; only use
# files produced by a trusted pipeline.
with open("../data/ori_data/dr100.tfkdl.train.matrix", "rb") as train_file:
    train_matrix = pickle.load(train_file)
with open("../data/ori_data/dr100.tfkdl.dev.matrix", "rb") as dev_file:
    dev_matrix = pickle.load(dev_file)

In [4]:
# load the tfidf vector with ngram1
# (alternative input, currently disabled)
# pca_train_corpus = pickle.load(open("../data/ori_data/ngram1.train.tfidf.pca.data", "rb"))
# dev_pca_tfidf_vector = pickle.load(open("../data/ori_data/ngram1.dev.tfidf.pca.data", "rb"))


# load the tfidf vector with ngram1 hmm
# Context managers close the file handles as soon as unpickling finishes.
# NOTE(review): pickle can execute arbitrary code while loading; only use
# files produced by a trusted pipeline.
with open("../data/ori_data/ngram1.hmm.train.tfidf.pca.data", "rb") as train_file:
    pca_train_corpus = pickle.load(train_file)
with open("../data/ori_data/ngram1.hmm.dev.tfidf.pca.data", "rb") as dev_file:
    dev_pca_tfidf_vector = pickle.load(dev_file)


In [16]:
def make_matrix(datasetM):
    """
    Build pairwise feature rows from a matrix that stacks two halves.

    Row ``i`` of the first half is paired with row ``half + i`` of the input,
    where ``half = datasetM.shape[0] // 2``. For every pair the element-wise
    sum and the element-wise absolute difference are concatenated into a
    single output row.

    :param datasetM: array-like with at least two dimensions whose first axis
        holds the stacked rows. If the row count is odd, the final row is
        ignored (same as pairing only the first ``2 * half`` rows).
    :return: ``numpy.ndarray`` with ``half`` rows; for 2-D input of width ``d``
        the shape is ``(half, 2 * d)``, including when ``half`` is 0.
    """
    datasetM = np.asarray(datasetM)
    half = datasetM.shape[0] // 2

    # Slice both halves once instead of indexing row by row in a Python loop.
    first_half = datasetM[:half]
    second_half = datasetM[half:2 * half]

    pair_sum = first_half + second_half
    pair_diff = np.abs(first_half - second_half)

    # Joining along axis 1 of the stacked halves equals concatenating each
    # pair's sum and difference vectors one row at a time.
    return np.concatenate((pair_sum, pair_diff), axis=1)

# Build the pairwise matrices for both splits (train first, then dev).
# Note: this rebinds train_matrix / dev_matrix, replacing whatever was
# assigned to those names before this cell ran.
train_matrix, dev_matrix = map(make_matrix, (pca_train_corpus, dev_pca_tfidf_vector))

In [6]:
# Hyper-parameters are collected in a dict and unpacked into the constructor.
xgb_params = dict(
    # boosting setup
    objective='binary:logistic', booster='gbtree',
    n_estimators=1000, learning_rate=0.01, base_score=0.5,
    # tree shape and regularisation
    max_depth=5, min_child_weight=2, gamma=0, max_delta_step=0,
    reg_alpha=0, reg_lambda=1,
    # row / column sampling
    subsample=0.8, colsample_bytree=0.8, colsample_bylevel=1,
    # class weighting
    scale_pos_weight=4,
    # runtime, logging, seeding and missing-value settings
    silent=0, n_jobs=1, nthread=None, random_state=0, seed=None, missing=None,
)
xgb_classifier = xgb.XGBClassifier(**xgb_params)

# AUC is reported for both sets; the training log shows that the last one
# (the dev set) drives early stopping.
evaluation_sets = [(train_matrix, trainL), (dev_matrix, devL)]
xgb_classifier.fit(
    train_matrix,
    trainL,
    eval_set=evaluation_sets,
    early_stopping_rounds=10,
    eval_metric="auc",
)

[0]	validation_0-auc:0.615046	validation_1-auc:0.601535
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.63937	validation_1-auc:0.615548
[2]	validation_0-auc:0.64658	validation_1-auc:0.618825
[3]	validation_0-auc:0.65493	validation_1-auc:0.626999
[4]	validation_0-auc:0.655318	validation_1-auc:0.626508
[5]	validation_0-auc:0.656266	validation_1-auc:0.624766
[6]	validation_0-auc:0.660297	validation_1-auc:0.628387
[7]	validation_0-auc:0.660348	validation_1-auc:0.627862
[8]	validation_0-auc:0.66044	validation_1-auc:0.627524
[9]	validation_0-auc:0.661792	validation_1-auc:0.627537
[10]	validation_0-auc:0.66209	validation_1-auc:0.626628
[11]	validation_0-auc:0.664281	validation_1-auc:0.628578
[12]	validation_0-auc:0.666219	validation_1-auc:0.630878
[13]	validation_0-auc:0.668218	validation_1-auc:0.632617
[14]	validation_0-auc:0.668289	validation_1-auc:0.633654
[15

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=5, min_child_weight=2, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=4, seed=None, silent=0,
       subsample=0.8)

In [7]:
# make prediction
from sklearn.metrics import classification_report

# XGBClassifier.predict() already returns hard class labels (the report lists
# classes 0 and 1), so the former `>= 0.5` threshold was a no-op and is gone.
# Use predict_proba() instead if probability scores are needed.
preds = xgb_classifier.predict(dev_matrix)
pred_label = preds.astype(int)

# Per-class precision / recall / F1 on the dev labels.
print(classification_report(devL, pred_label))

             precision    recall  f1-score   support

          0       0.88      0.77      0.82     16751
          1       0.34      0.52      0.41      3744

avg / total       0.78      0.73      0.75     20495



  if diff:


In [10]:
# load the ngram1 feature vector
# Context managers close the file handles as soon as unpickling finishes.
# NOTE(review): pickle can execute arbitrary code while loading; only use
# files produced by a trusted pipeline.
with open("../data/ori_data/ngram1.train.featurematrix.data", "rb") as train_file:
    feature_added_train_datamatrix = pickle.load(train_file)
with open("../data/ori_data/ngram1.dev.featurematrix.data", "rb") as dev_file:
    feature_added_dev_datamatrix = pickle.load(dev_file)

In [6]:
# Load the pre-computed train/dev feature matrices with np.load.
# (This cell only loads them; the concatenation with the other matrix
# happens in a separate cell.)
train_feature_path = "../data/ori_data/train.featurematrix.data"
dev_feature_path = "../data/ori_data/dev.featurematrix.data"
feature_added_train_datamatrix = np.load(train_feature_path)
feature_added_dev_datamatrix = np.load(dev_feature_path)

In [None]:
# Load the ngram1.hmm feature matrices for the train and dev splits.
# Context managers close the file handles as soon as unpickling finishes.
# NOTE(review): pickle can execute arbitrary code while loading; only use
# files produced by a trusted pipeline.
with open("../data/ori_data/ngram1.hmm.train.featurematrix.data", "rb") as train_file:
    feature_added_train_datamatrix = pickle.load(train_file)
with open("../data/ori_data/ngram1.hmm.dev.featurematrix.data", "rb") as dev_file:
    feature_added_dev_datamatrix = pickle.load(dev_file)

In [11]:
# Display the dimensions of the loaded train feature matrix as a sanity check.
feature_added_train_datamatrix.shape

(81981, 74)

In [12]:
# Append the extra feature columns to the right of each split's base matrix.
train_parts = (train_matrix, feature_added_train_datamatrix)
dev_parts = (dev_matrix, feature_added_dev_datamatrix)

whole_train_datamatrix = np.concatenate(train_parts, axis=1)
whole_dev_datamatrix = np.concatenate(dev_parts, axis=1)

In [13]:
# Print the dimensions of the combined train matrix to confirm the column-wise join.
print(whole_train_datamatrix.shape)

(81981, 274)


In [14]:
# Hyper-parameters are collected in a dict and unpacked into the constructor.
xgb_params = dict(
    # boosting setup
    objective='binary:logistic', booster='gbtree',
    n_estimators=1000, learning_rate=0.1, base_score=0.5,
    # tree shape and regularisation
    max_depth=5, min_child_weight=2, gamma=0, max_delta_step=0,
    reg_alpha=0, reg_lambda=1,
    # row / column sampling
    subsample=0.9, colsample_bytree=0.9, colsample_bylevel=1,
    # class weighting
    scale_pos_weight=4,
    # runtime, logging, seeding and missing-value settings
    silent=0, n_jobs=1, nthread=None, random_state=0, seed=None, missing=None,
)
xgb_classifier = xgb.XGBClassifier(**xgb_params)

# AUC is reported for both sets; the training log shows that the last one
# (the dev set) drives early stopping.
evaluation_sets = [(whole_train_datamatrix, trainL), (whole_dev_datamatrix, devL)]
xgb_classifier.fit(
    whole_train_datamatrix,
    trainL,
    eval_set=evaluation_sets,
    early_stopping_rounds=50,
    eval_metric="auc",
)

[0]	validation_0-auc:0.747963	validation_1-auc:0.722341
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.756832	validation_1-auc:0.729823
[2]	validation_0-auc:0.760618	validation_1-auc:0.732827
[3]	validation_0-auc:0.762475	validation_1-auc:0.735096
[4]	validation_0-auc:0.764515	validation_1-auc:0.73677
[5]	validation_0-auc:0.766666	validation_1-auc:0.738196
[6]	validation_0-auc:0.768416	validation_1-auc:0.740328
[7]	validation_0-auc:0.769761	validation_1-auc:0.742004
[8]	validation_0-auc:0.771088	validation_1-auc:0.743265
[9]	validation_0-auc:0.772601	validation_1-auc:0.744408
[10]	validation_0-auc:0.774219	validation_1-auc:0.745888
[11]	validation_0-auc:0.775359	validation_1-auc:0.746719
[12]	validation_0-auc:0.777202	validation_1-auc:0.748186
[13]	validation_0-auc:0.779985	validation_1-auc:0.750937
[14]	validation_0-auc:0.781292	validation_1-auc:0.75186


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.9, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=2, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=4, seed=None, silent=0,
       subsample=0.9)

In [15]:
# make prediction
from sklearn.metrics import classification_report

# XGBClassifier.predict() already returns hard class labels (the report lists
# classes 0 and 1), so the former `>= 0.5` threshold was a no-op and is gone.
# Use predict_proba() instead if probability scores are needed.
preds = xgb_classifier.predict(whole_dev_datamatrix)
pred_label = preds.astype(int)

# Per-class precision / recall / F1 on the dev labels.
print(classification_report(devL, pred_label))

             precision    recall  f1-score   support

          0       0.90      0.81      0.85     16751
          1       0.42      0.61      0.50      3744

avg / total       0.81      0.77      0.79     20495



  if diff:


In [19]:
# Persist the trained classifier with pickle protocol 2. The context manager
# makes sure the file is flushed and closed once the dump completes.
with open("../data/m_result/tfidf_featured.xgb.model", "wb") as model_file:
    pickle.dump(xgb_classifier, model_file, 2)

In [20]:
# Display the shape of the probability output for the dev matrix.
xgb_classifier.predict_proba(whole_dev_datamatrix).shape

(20495, 2)

In [21]:
# Display the dimensions of the combined dev matrix.
whole_dev_datamatrix.shape

(20495, 274)