In [1]:
import numpy as np
import pickle
# Display NumPy floats with 4 decimal places in fixed-point form (no scientific notation).
np.set_printoptions(precision=4,suppress=True)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw documents and their labels from the pickled dumps.
# Context managers make sure both file handles are closed after reading.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('documents.pl', 'rb') as documents_file:
    documents = pickle.load(documents_file)
with open('labels.pl', 'rb') as labels_file:
    labels = pickle.load(labels_file)

# FEATURES

In [3]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Count 1- to 3-grams (accents stripped), ignoring terms seen in fewer than 3 documents.
# The corpus is passed to fit_transform only: the first constructor parameter of
# CountVectorizer is `input` (a mode string), not the data, and current scikit-learn
# rejects positional constructor arguments altogether.
BoW = CountVectorizer(strip_accents='unicode', ngram_range=(1,3), min_df=3)
X_bag = BoW.fit_transform(documents)

# Re-weight the raw counts with TF-IDF. fit_transform already fits the transformer,
# so a separate .fit() beforehand would only repeat the same work.
tf_transformer = TfidfTransformer()
X_bag = tf_transformer.fit_transform(X_bag)

In [4]:
# Turn the labels into a NumPy array so they support element-wise comparison
# and boolean-mask indexing in the filtering step.
labels = np.array(labels)

In [5]:
# Drop every sample whose label is the string 'nan'; the same boolean mask is
# applied to the labels and to the rows of the feature matrix so they stay aligned.
has_label = labels != 'nan'
Y, X_bag = labels[has_label], X_bag[has_label]

In [6]:
# Sanity check: number of samples remaining after the 'nan' labels were dropped.
Y.shape

(35126,)

In [7]:
from sklearn import preprocessing

# Learn the label vocabulary directly from Y; LabelEncoder.fit reduces its input
# to the sorted set of distinct values itself.
le = preprocessing.LabelEncoder()
le.fit(Y)

LabelEncoder()

In [8]:
# Replace each label in Y with its integer class code from the fitted encoder.
# NOTE(review): Y is overwritten in place, so re-running this cell on the already
# encoded values will presumably fail with unseen-label errors - confirm, and
# prefer Restart & Run All.
Y = le.transform(Y)

In [9]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# CLASSIFIERS

In [10]:
import xgboost as xgb
# Code for cross-validation
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20.
# The legacy module is tried first so that old installs keep the exact behaviour
# they had; on current installs the same names come from sklearn.model_selection.
# NOTE: model_selection.StratifiedKFold has a different constructor
# (n_splits=..., then .split(X, y)) than the legacy class.
try:
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import StratifiedKFold
except ImportError:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import StratifiedKFold



In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

## XGBOOST

In [20]:
# Hyper-parameters handed to xgb.train for the multiclass model
params = {
    'objective': 'multi:softmax',      # multiclass objective
    'eval_metric': 'mlogloss',         # multiclass log-loss, reported on the watchlist
    'eta': 0.07,                       # learning rate
    'max_depth': 30,
    'min_child_weight': 6,
    'lambda': 1,                       # L2 regularisation weight
    'nthread': 4,
    'num_class': len(np.unique(Y)),    # one class per distinct encoded label
}

In [None]:
# Stratified k-fold cross-validation of the xgboost model.
# The splitter is imported from sklearn.model_selection here because the legacy
# sklearn.cross_validation module (whose StratifiedKFold took the labels in its
# constructor) no longer exists in current scikit-learn releases.
from sklearn.model_selection import StratifiedKFold

# Create k folds
k = 3
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

# One row per fold: [macro F1, accuracy, accuracy on samples whose true class is 1]
results = np.zeros((k,3))

# iterate over the k folds
for i, (train_ind, test_ind) in enumerate(kfold.split(X_bag, Y)):
    X_train, X_val, Y_train, Y_val = X_bag[train_ind], X_bag[test_ind], Y[train_ind], Y[test_ind]

    print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape)
    D_train = xgb.DMatrix(X_train, label=Y_train)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    # Up to 400 boosting rounds, logging every 10; training stops early once the
    # validation metric has not improved for 50 rounds.
    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)
    # NOTE(review): depending on the xgboost version, predict() may use every
    # trained tree rather than only those up to the best iteration - confirm.
    predictions = bst.predict(data=D_val).round()
    # The target is multiclass, so f1_score needs an explicit averaging mode;
    # the default average='binary' raises ValueError here. Macro-averaging
    # weights every class equally.
    f1 = f1_score(Y_val, predictions, average='macro')
    acc = accuracy_score(Y_val, predictions)
    # Accuracy restricted to the samples whose true (encoded) class is 1.
    # NOTE(review): presumably class 1 is the "Prio" class - verify against le.classes_.
    prio_acc = accuracy_score(Y_val[Y_val==1], predictions[Y_val==1])
    results[i] = [f1, acc, prio_acc]
    print('f1 score: {}'.format(f1))
    # acc is a fraction in [0, 1]; format it as a real percentage instead of
    # appending a literal '\%' to the raw fraction.
    print('accuracy: {:.2%}'.format(acc))
    print('Prio accuracy: {}'.format(prio_acc))
    print(confusion_matrix(Y_val, predictions))



(23410, 321767) (11716, 321767) (23410,) (11716,)
[0]	train-mlogloss:2.94462	valid-mlogloss:2.95007
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[10]	train-mlogloss:1.91953	valid-mlogloss:1.97359
[20]	train-mlogloss:1.54844	valid-mlogloss:1.64698
[30]	train-mlogloss:1.34631	valid-mlogloss:1.48078
[40]	train-mlogloss:1.22103	valid-mlogloss:1.38682
[50]	train-mlogloss:1.13716	valid-mlogloss:1.32992
[60]	train-mlogloss:1.07683	valid-mlogloss:1.29467


In [None]:
# Per-metric mean and variance across the folds (rows of `results` are folds).
results.mean(axis=0), results.var(axis=0)