In [1]:
import numpy as np
import pickle
np.set_printoptions(precision=4,suppress=True)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
documents = pickle.load(open('documents.pl','rb'))
labels = pickle.load(open('labels.pl','rb'))
tags = pickle.load(open('tags.pl','rb'))

# FEATURES

In [3]:
with open('./stopwords-de.txt') as f:
    stopwords = f.read().split('\n')

In [4]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer

CountVectorizer()
BoW = CountVectorizer(documents, stop_words=stopwords, strip_accents='unicode', ngram_range=(1,3), min_df=3)

In [5]:
X_bag = BoW.fit_transform(documents)

In [6]:
X_bag[0].shape

(1, 367992)

In [7]:
Y = np.array(np.array(tags)=='Prio-Fall', dtype=int)

In [8]:
sum(Y)/len(Y)

0.14504915987966835

In [13]:
print(sorted(Counter(Y).items()))

[(0, 93217), (1, 15815)]


In [12]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X_bag, Y)
from collections import Counter
print(sorted(Counter(y_resampled).items()))

[(0, 93217), (1, 93217)]


# CLASSIFIERS

In [15]:
import xgboost as xgb
# Code for cross-validation
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import f1_score



## XG BOOST 1A

In [14]:
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 5

In [None]:
# Create 2 folds
k = 3
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

results = np.zeros(k)

# iterate over two folds
for i, (train_ind, test_ind) in enumerate(kfold):
    X_train, X_val, Y_train, Y_val = X_bag[train_ind], X_bag[test_ind], Y[train_ind], Y[test_ind]
    
    D_train = xgb.DMatrix(X_train, label=Y_train)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    weights = class_weight.compute_sample_weight('balanced', [0,1], Y_train)
    weightss = []
    for y in Y_train:
        weightss.append(weights[y])

    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)
    predictions = bst.predict(data=D_val).round()
    results[i] = f1_score(predictions, Y_val, )
    print('f1 score: {}'.format(results[i]))
    print('accuracy: {}\%'.format(sum(predictions==Y_val)/len(Y_val)*100))

[0]	train-logloss:0.682238	valid-logloss:0.682267
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.593489	valid-logloss:0.593768
[20]	train-logloss:0.531532	valid-logloss:0.532108
[30]	train-logloss:0.487235	valid-logloss:0.488025
[40]	train-logloss:0.454939	valid-logloss:0.455922
[50]	train-logloss:0.431154	valid-logloss:0.432307
[60]	train-logloss:0.413446	valid-logloss:0.414819
[70]	train-logloss:0.400243	valid-logloss:0.401775
[80]	train-logloss:0.390154	valid-logloss:0.391952
[90]	train-logloss:0.382498	valid-logloss:0.384482
[100]	train-logloss:0.376485	valid-logloss:0.378778
[110]	train-logloss:0.371759	valid-logloss:0.374354
[120]	train-logloss:0.368129	valid-logloss:0.370959
[130]	train-logloss:0.365171	valid-logloss:0.368195
[140]	train-logloss:0.362806	valid-logloss:0.36608
[150]	train-logloss:0.360865	valid-logloss:0.364339
[160]	train-logloss:0.359201	

In [None]:
np.mean(results), np.var(results)

# XGBOOST 1B

In [None]:
# Set our parameters for xgboost
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 5

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Create 2 folds
k = 3
enc = OneHotEncoder()
Y = enc.fit(labels)
kfold = StratifiedKFold(Y, k, shuffle=True, random_state=0)

results = np.zeros(k)

# iterate over two folds
for i, (train_ind, test_ind) in enumerate(kfold):
    X_train, X_val, Y_train, Y_val = X_bag[train_ind], X_bag[test_ind], Y[train_ind], Y[test_ind]
    
    D_train = xgb.DMatrix(X_train, label=Y_train)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    weights = class_weight.compute_sample_weight('balanced', [0,1], Y_train)
    weightss = []
    for y in Y_train:
        weightss.append(weights[y])

    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)
    predictions = bst.predict(data=D_val).round()
    results[i] = f1_score(predictions, Y_val, )
    print('f1 score: {}'.format(results[i]))
    print('accuracy: {}\%'.format(sum(predictions==Y_val)/len(Y_val)*100))