In [15]:
from __future__ import division
import xml.etree.ElementTree as ET
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import sys
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
from collections import Counter
import numpy as np

In [16]:
# Parse the train and test XML dumps (paths are relative to the notebook's
# working directory) and keep both the trees and their root elements.
tree_train = ET.parse(
    '../data/raw/news_sentiment_romip2012/train/news_eval_train.xml')
tree_test = ET.parse(
    '../data/raw/news_sentiment_romip2012/test/news_eval_test.xml')
root_train, root_test = tree_train.getroot(), tree_test.getroot()

In [17]:
# Map the raw <evaluation> marks to integer class ids. Records carrying any
# other mark are left out of the data set.
evs = {'0': 2, '+': 0, '-': 1}


def _collect_labelled(root, label_ids):
    """Return ``(texts, labels)`` for the children of ``root`` with a known mark.

    Each child is expected to hold a ``speech`` element (the text) and an
    ``evaluation`` element (the mark). The mark is stripped of surrounding
    whitespace and looked up in ``label_ids``; children whose mark is not a
    key of ``label_ids`` are skipped. Children lacking either element, or
    whose element has no text, are skipped as well instead of raising
    ``AttributeError``.
    """
    texts, labels = [], []
    for child in tqdm(root):
        speech_node = child.find('speech')
        mark_node = child.find('evaluation')
        if speech_node is None or mark_node is None:
            continue
        speech, mark = speech_node.text, mark_node.text
        if speech is None or mark is None:
            continue
        mark = mark.strip()
        if mark in label_ids:
            texts.append(speech)
            labels.append(label_ids[mark])
    return texts, labels


train_data, train_target = _collect_labelled(root_train, evs)
test_data, test_target = _collect_labelled(root_test, evs)

100%|██████████| 4260/4260 [00:00<00:00, 63549.83it/s]
100%|██████████| 5500/5500 [00:00<00:00, 62495.73it/s]


In [18]:
# Sanity check of the labels: first the distinct class ids present in each
# split, then the per-class counts (train first, test second in both cases).
for labels in (train_target, test_target):
    print(np.unique(labels))
for labels in (train_target, test_target):
    print(Counter(labels))

[0 1 2]
[0 1 2]
Counter({1: 1864, 0: 1115, 2: 914})
Counter({1: 1890, 0: 1448, 2: 1235})


In [5]:
# Baseline: bag-of-words counts -> term-frequency normalisation (idf
# re-weighting switched off via use_idf=False) -> linear SVM.
baseline_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC()),
])
# Pipeline.fit returns the fitted pipeline, so fit and predict can be chained.
predicted = baseline_clf.fit(train_data, train_target).predict(test_data)

In [6]:
# Keep only the (prediction, truth) pairs in which both labels are 0 or 1;
# any pair involving class 2 on either side is dropped before scoring.
new_target = []
new_predicted = []
for guess, truth in zip(predicted, test_target):
    if guess in (0, 1) and truth in (0, 1):
        new_target.append(truth)
        new_predicted.append(guess)

In [7]:
# F1 on the filtered 0/1 pairs, computed with f1_score's default settings.
score = f1_score(new_target, new_predicted)
print('LinearSVC %s' % score)

LinearSVC 0.811444921316


In [8]:
# Turn the raw texts into TF-IDF features: the vectoriser and idf weights are
# fitted on the training texts only and then applied unchanged to the test
# texts.
# NOTE: this rebinds train_data / test_data from the raw texts to the
# transformed features, so re-running this cell (or the baseline cell above)
# afterwards would feed already-transformed data to CountVectorizer; re-run
# the data-loading cell first.
conv = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
])
train_data = conv.fit_transform(train_data)
test_data = conv.transform(test_data)

In [9]:
# Train a booster through the native xgboost API with the multi-class softmax
# objective over 3 classes. No num_boost_round is passed, so xgb.train uses
# its default number of rounds.
dtrain = xgb.DMatrix(train_data, label=train_target)
param = {
    'objective': 'multi:softmax',
    'num_class': 3,
}
bst = xgb.train(param, dtrain)

In [10]:
# Wrap the test features in a DMatrix (no labels are attached) and predict
# with the booster trained above.
dtest = xgb.DMatrix(test_data)
ypred = bst.predict(dtest)

In [11]:
# Restrict evaluation to pairs where both the booster's prediction and the
# true label are class 0 or class 1; pairs touching class 2 are discarded.
new_target, new_predicted = [], []
for idx, guess in enumerate(ypred):
    truth = test_target[idx]
    if truth in (0, 1) and guess in (0, 1):
        new_target.append(truth)
        new_predicted.append(guess)

In [12]:
# F1 of the default-parameter booster on the filtered 0/1 pairs.
score = f1_score(new_target, new_predicted)
print('xgboost %s' % score)

xgboost 0.747465645416


In [13]:
def modelfit(alg, train_data, train_target, useTrainCV=True, cv_folds=5,
             early_stopping_rounds=50, num_class=None):
    """Fit ``alg`` on the training data and print a train-set F1 report.

    Parameters
    ----------
    alg : estimator exposing ``fit`` and ``predict`` and, when ``useTrainCV``
        is true, also ``get_xgb_params``, ``get_params`` and ``set_params``
        (the notebook passes an ``XGBClassifier``).
    train_data, train_target : training features and class labels.
    useTrainCV : when true, run ``xgb.cv`` first and set ``n_estimators`` on
        ``alg`` to the number of rounds (rows) in the returned result.
    cv_folds : number of folds passed to ``xgb.cv``.
    early_stopping_rounds : early-stopping patience passed to ``xgb.cv``.
    num_class : number of classes handed to ``xgb.cv``. When ``None`` it is
        taken as the number of distinct values in ``train_target``
        (presumably labels should then be 0..num_class-1 -- confirm against
        the xgboost documentation).

    Returns ``None``; the selected round count and the F1 score are printed.
    ``alg`` is modified in place (``set_params`` and ``fit``).
    """
    if useTrainCV:
        if num_class is None:
            num_class = len(set(train_target))
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = num_class
        xgtrain = xgb.DMatrix(train_data, label=train_target)
        # The estimator's current n_estimators is the upper bound on rounds.
        cvresult = xgb.cv(
            xgb_param, xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds, metrics='merror',
            early_stopping_rounds=early_stopping_rounds, show_stdv=True)
        print('n_estimators (num_boost_round) %s' % cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(train_data, train_target, eval_metric='merror')

    # Predict training set (class labels only; probabilities are not needed
    # for the report, so predict_proba is no longer called).
    dtrain_predictions = alg.predict(train_data)

    # Score only the pairs where both prediction and truth are class 0 or 1.
    new_target = []
    new_predicted = []
    for guess, truth in zip(dtrain_predictions, train_target):
        if guess in (0, 1) and truth in (0, 1):
            new_target.append(truth)
            new_predicted.append(guess)

    # Print model report:
    print("\nModel Report")
    print("f1score(Train) : %.4g" % metrics.f1_score(new_target, new_predicted))

In [14]:
%%time
# First XGBoost model with hand-picked starting parameters. n_estimators is
# only an upper bound here: modelfit (with its default useTrainCV=True)
# replaces it with the round count found by xgb.cv before fitting.
# Later cells in this notebook try max_depth=16 / min_child_weight=3 instead.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb1, train_data, train_target)

n_estimators (num_boost_round) 293

Model Report
f1score(Train) : 0.9484
CPU times: user 1min 48s, sys: 2.93 s, total: 1min 51s
Wall time: 56.2 s


In [15]:
# Predict test labels with the fitted xgb1 wrapper; it is given the feature
# matrix directly, so no DMatrix is built here.
ypred = xgb1.predict(test_data)

In [16]:
# Evaluation subset: skip every pair in which the prediction or the true
# label is not class 0 or 1, and collect the remaining pairs for scoring.
new_target = []
new_predicted = []
for guess, truth in zip(ypred, test_target):
    if not (guess in (0, 1) and truth in (0, 1)):
        continue
    new_target.append(truth)
    new_predicted.append(guess)

In [17]:
# F1 of xgb1 (CV-selected number of trees) on the filtered 0/1 pairs.
score = f1_score(new_target, new_predicted)
print('xgboost %s' % score)

xgboost 0.766085059978


In [None]:
%%time
# Coarse grid over tree depth and minimum child weight.
param_test1 = {
    'max_depth': range(1, 20, 2),
    'min_child_weight': range(1, 10, 2),
}
# The target has three classes, so plain scoring='f1' (binary averaging) is
# ambiguous; name the weighted average explicitly with 'f1_weighted'.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softmax', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test1, scoring='f1_weighted', iid=False, cv=5,
    verbose=100)
gsearch1.fit(train_data, train_target)

print(gsearch1.grid_scores_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] max_depth=1, min_child_weight=1 .................................
[CV] ........ max_depth=1, min_child_weight=1, score=0.443422 -   1.6s
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    1.6s
[CV] max_depth=1, min_child_weight=1 .................................
[CV] ........ max_depth=1, min_child_weight=1, score=0.416910 -   1.4s
[Parallel(n_jobs=1)]: Done   2 jobs       | elapsed:    3.0s
[CV] max_depth=1, min_child_weight=1 .................................
[CV] ........ max_depth=1, min_child_weight=1, score=0.466848 -   1.5s
[Parallel(n_jobs=1)]: Done   3 jobs       | elapsed:    4.5s
[CV] max_depth=1, min_child_weight=1 .................................
[CV] ........ max_depth=1, min_child_weight=1, score=0.415190 -   1.5s
[Parallel(n_jobs=1)]: Done   4 jobs       | elapsed:    6.0s
[CV] max_depth=1, min_child_weight=1 .................................
[CV] ........ max_depth=1, min_child_weight=1, score=0.

In [None]:
%%time
# Finer grid over tree depth (14-16) and minimum child weight (2-4).
param_test2 = {
    'max_depth': [14, 15, 16],
    'min_child_weight': [2, 3, 4],
}
# The target has three classes, so plain scoring='f1' (binary averaging) is
# ambiguous; name the weighted average explicitly with 'f1_weighted'.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softmax', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test2, scoring='f1_weighted', iid=False, cv=5,
    verbose=100)
gsearch1.fit(train_data, train_target)

print(gsearch1.grid_scores_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

In [None]:
%%time
# Search over minimum child weight only, with max_depth fixed at 16.
param_test2b = {
    'min_child_weight': [3, 5, 7, 9],
}
# The target has three classes, so plain scoring='f1' (binary averaging) is
# ambiguous; name the weighted average explicitly with 'f1_weighted'.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=16,
        min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softmax', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test2b, scoring='f1_weighted', iid=False, cv=5,
    verbose=100)
gsearch1.fit(train_data, train_target)

print(gsearch1.grid_scores_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

In [None]:
%%time
# Search over gamma in {0.0, 0.1, 0.2, 0.3, 0.4} with max_depth=16 and
# min_child_weight=3 fixed.
param_test3 = {
    'gamma': [i/10.0 for i in range(0, 5)],
}
# The target has three classes, so plain scoring='f1' (binary averaging) is
# ambiguous; name the weighted average explicitly with 'f1_weighted'.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=16,
        min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softmax', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test3, scoring='f1_weighted', iid=False, cv=5,
    verbose=100)
gsearch1.fit(train_data, train_target)

print(gsearch1.grid_scores_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

In [None]:
# Refit with max_depth=16 / min_child_weight=3 / gamma=0.0; modelfit
# re-selects the number of trees by cross-validation (n_estimators is the
# upper bound) and prints the train-set F1.
xgb2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=16,
    min_child_weight=3,
    gamma=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb2, train_data, train_target)

In [None]:
%%time
# Grid over row and column subsampling ratios, each in {0.6, 0.7, 0.8, 0.9}.
param_test4 = {
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)],
}
# The target has three classes, so plain scoring='f1' (binary averaging) is
# ambiguous; name the weighted average explicitly with 'f1_weighted'.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=16,
        min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softmax', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test4, scoring='f1_weighted', iid=False, cv=5,
    verbose=100)
gsearch1.fit(train_data, train_target)

print(gsearch1.grid_scores_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

In [None]:
# Refit with subsample / colsample_bytree raised to 0.9; modelfit re-selects
# the number of trees by cross-validation and prints the train-set F1.
xgb2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=16,
    min_child_weight=3,
    gamma=0.0,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='multi:softmax',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb2, train_data, train_target)

In [None]:
%%time
# Finer grid over the subsampling ratios: each in {0.85, 0.90, 0.95}.
param_test5 = {
    'subsample': [i/100.0 for i in range(85, 100, 5)],
    'colsample_bytree': [i/100.0 for i in range(85, 100, 5)],
}
# The target has three classes, so plain scoring='f1' (binary averaging) is
# ambiguous; name the weighted average explicitly with 'f1_weighted'.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=16,
        min_child_weight=3, gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='multi:softmax', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test5, scoring='f1_weighted', iid=False, cv=5,
    verbose=100)
gsearch1.fit(train_data, train_target)

print(gsearch1.grid_scores_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

In [None]:
# Refit with subsample / colsample_bytree raised to 0.95; modelfit re-selects
# the number of trees by cross-validation and prints the train-set F1.
xgb2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=16,
    min_child_weight=3,
    gamma=0.0,
    subsample=0.95,
    colsample_bytree=0.95,
    objective='multi:softmax',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb2, train_data, train_target)

In [None]:
%%time
# Search over reg_alpha with subsample / colsample_bytree at 0.9.
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
# The target has three classes, so plain scoring='f1' (binary averaging) is
# ambiguous; name the weighted average explicitly with 'f1_weighted'.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=16,
        min_child_weight=3, gamma=0, subsample=0.9, colsample_bytree=0.9,
        objective='multi:softmax', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test6, scoring='f1_weighted', iid=False, cv=5,
    verbose=100)
gsearch1.fit(train_data, train_target)

print(gsearch1.grid_scores_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

In [None]:
%%time
# Same reg_alpha search as the previous cell, but with subsample /
# colsample_bytree at 0.95 (this rebinds param_test6 and gsearch1).
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
# The target has three classes, so plain scoring='f1' (binary averaging) is
# ambiguous; name the weighted average explicitly with 'f1_weighted'.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=16,
        min_child_weight=3, gamma=0, subsample=0.95, colsample_bytree=0.95,
        objective='multi:softmax', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test6, scoring='f1_weighted', iid=False, cv=5,
    verbose=100)
gsearch1.fit(train_data, train_target)

print(gsearch1.grid_scores_)
print(gsearch1.best_params_)
print(gsearch1.best_score_)

In [None]:
%%time
# Final refit with the same parameters as the 0.95-subsample model but a
# higher ceiling of 5000 rounds for modelfit's cross-validated selection of
# n_estimators.
xgb2 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=5000,
    max_depth=16,
    min_child_weight=3,
    gamma=0.0,
    subsample=0.95,
    colsample_bytree=0.95,
    objective='multi:softmax',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb2, train_data, train_target)

In [None]:
# Predict test labels with the tuned model fitted in the preceding cell.
# The original called xgb1.predict here, which would re-evaluate the first,
# untuned model and ignore all the tuning done for xgb2.
ypred = xgb2.predict(test_data)

In [None]:
# Build the evaluation subset: walk predictions and true labels in parallel
# and keep a pair only when both sides are class 0 or class 1.
new_target, new_predicted = [], []
for truth, guess in zip(test_target, ypred):
    both_binary = guess in (0, 1) and truth in (0, 1)
    if both_binary:
        new_target.append(truth)
        new_predicted.append(guess)

In [None]:
# F1 on the filtered 0/1 pairs for the predictions produced just above.
score = f1_score(new_target, new_predicted)
print('xgboost %s' % score)