In [21]:
import numpy as np
import pickle
np.set_printoptions(precision=4,suppress=True)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [40]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a context manager so the handle is closed as soon
    as loading finishes, even if unpickling raises.

    NOTE(security): ``pickle.load`` can execute arbitrary code while
    unpickling; only use this on files from a trusted source.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Pickled inputs used by the rest of the notebook.
documents = _load_pickle('documents.pl')
labels = _load_pickle('labels.pl')
tags = _load_pickle('tags.pl')

# FEATURES

In [45]:
# Read one stop word per line. Blank lines (including the empty entry that a
# trailing newline would otherwise produce) are dropped and surrounding
# whitespace is stripped so every entry is a clean token.
# NOTE(review): the file is decoded with the platform default encoding;
# presumably it is UTF-8 (German umlauts) -- confirm and pass encoding= if so.
with open('./stopwords-de.txt') as f:
    stopwords = [line.strip() for line in f if line.strip()]

In [46]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Count 1- to 3-grams, skipping the loaded stop words and any n-gram that
# occurs in fewer than 3 documents.
# The corpus is handed to fit_transform only: the first constructor parameter
# of CountVectorizer is `input` (a mode such as 'content'), not the data.
# NOTE(review): accent stripping happens before stop-word filtering, so stop
# words that contain accented characters may never match -- consider
# normalising `stopwords` the same way.
BoW = CountVectorizer(stop_words=stopwords, strip_accents='unicode', ngram_range=(1, 3), min_df=3)
X_counts = BoW.fit_transform(documents)

# Re-weight the raw counts with TF-IDF. A single fit_transform both learns the
# IDF weights and applies them, so no separate fit() call is needed.
tf_transformer = TfidfTransformer()
X_bag = tf_transformer.fit_transform(X_counts)

In [47]:
# Binary target: 1 where the tag equals 'Prio-Fall', 0 otherwise.
is_prio = np.asarray(tags) == 'Prio-Fall'
Y = is_prio.astype(int)

In [48]:
# Share of samples in the positive class (Y == 1).
np.mean(Y)

0.1450676982591876

In [49]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# CLASSIFIERS

In [50]:
import xgboost as xgb
# Code for cross-validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import f1_score

## XG BOOST 1A

In [None]:
# Booster settings for run 1A, passed to xgb.train.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.05,
    'max_depth': 7,
    'lambda': 1.3,
}

In [None]:
# Stratified k-fold cross-validation of the booster, with random oversampling
# of the training split in every fold.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20; on
# newer versions use sklearn.model_selection.StratifiedKFold(n_splits=k, ...)
# and iterate over .split(X_bag, Y).
k = 5
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

# One F1 score per fold.
results = np.zeros(k)

for i, (train_ind, test_ind) in enumerate(kfold):
    X_train, X_val = X_bag[train_ind], X_bag[test_ind]
    Y_train, Y_val = Y[train_ind], Y[test_ind]

    # Oversample the training split only, so the validation fold keeps its
    # original class distribution.
    ros = RandomOverSampler(random_state=0)
    X_resampled, Y_resampled = ros.fit_sample(X_train, Y_train)

    D_train = xgb.DMatrix(X_resampled, label=Y_resampled)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    # The last watchlist entry ('valid') drives early stopping.
    # NOTE(review): the same fold is then used for scoring, so the reported
    # metrics are likely optimistic -- consider a separate early-stopping split.
    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)

    # Round predicted probabilities to hard 0/1 labels.
    predictions = bst.predict(data=D_val).round()

    # f1_score expects (y_true, y_pred) in that order.
    results[i] = f1_score(Y_val, predictions)
    print('f1 score: {}'.format(results[i]))
    print('accuracy: {}%'.format(np.mean(predictions == Y_val) * 100))

[0]	train-logloss:0.684786	valid-logloss:0.685085
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.631128	valid-logloss:0.633862
[20]	train-logloss:0.604485	valid-logloss:0.609447
[30]	train-logloss:0.587953	valid-logloss:0.59431
[40]	train-logloss:0.576828	valid-logloss:0.584374
[50]	train-logloss:0.568891	valid-logloss:0.577605
[60]	train-logloss:0.562475	valid-logloss:0.572332
[70]	train-logloss:0.557057	valid-logloss:0.567929
[80]	train-logloss:0.552206	valid-logloss:0.564199
[90]	train-logloss:0.548292	valid-logloss:0.561225
[100]	train-logloss:0.544725	valid-logloss:0.558601
[110]	train-logloss:0.541464	valid-logloss:0.556111
[120]	train-logloss:0.538231	valid-logloss:0.553831
[130]	train-logloss:0.535306	valid-logloss:0.551573
[140]	train-logloss:0.532362	valid-logloss:0.549521
[150]	train-logloss:0.529876	valid-logloss:0.547642
[160]	train-logloss:0.527297	

In [None]:
# Mean and variance of the values stored in `results`.
results.mean(), results.var()

## XG BOOST 1B

In [None]:
# Booster settings for run 1B, passed to xgb.train.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.02,
    'max_depth': 5,
}

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Stratified k-fold cross-validation of the booster, countering the class
# imbalance with balanced per-sample weights.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20; on
# newer versions use sklearn.model_selection.StratifiedKFold(n_splits=k, ...)
# and iterate over .split(X_bag, Y).
k = 3

# Y must stay the 0/1 target vector built from `tags`: the 'binary:logistic'
# objective and binary f1_score both need it. OneHotEncoder.fit() returns the
# encoder itself, so assigning its result to Y would break the fold split and
# the indexing below.
# NOTE(review): if a multi-class run on `labels` was intended, it needs
# integer-encoded labels and a multi-class objective instead -- confirm.
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

# One F1 score per fold.
results = np.zeros(k)

for i, (train_ind, test_ind) in enumerate(kfold):
    X_train, X_val = X_bag[train_ind], X_bag[test_ind]
    Y_train, Y_val = Y[train_ind], Y[test_ind]

    # One weight per training sample, inversely proportional to the frequency
    # of its class, handed to the booster through the DMatrix.
    sample_weights = class_weight.compute_sample_weight('balanced', Y_train)

    D_train = xgb.DMatrix(X_train, label=Y_train, weight=sample_weights)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    # The last watchlist entry ('valid') drives early stopping.
    # NOTE(review): the same fold is then used for scoring, so the reported
    # metrics are likely optimistic -- consider a separate early-stopping split.
    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)

    # Round predicted probabilities to hard 0/1 labels.
    predictions = bst.predict(data=D_val).round()

    # f1_score expects (y_true, y_pred) in that order.
    results[i] = f1_score(Y_val, predictions)
    print('f1 score: {}'.format(results[i]))
    print('accuracy: {}%'.format(np.mean(predictions == Y_val) * 100))