In [1]:
import pandas as pd
import glob
# Widen pandas' display limits so long text columns and wide frames are
# shown without truncation.
for _option, _value in (
    ("display.max_rows", 30),
    ("display.max_columns", 1000),
    ("display.max_colwidth", 1000),
    ("display.width", 2000),
):
    pd.set_option(_option, _value)

In [2]:
import numpy as np
import pickle
# Print arrays with 4 decimal places and without scientific notation.
np.set_printoptions(suppress=True, precision=4)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards."""
    # NOTE(security): pickle can execute arbitrary code while loading; only
    # use this on files that come from a trusted source.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Pre-computed inputs for the notebook. Each file is opened in a `with` block
# so the handle is closed deterministically instead of being leaked.
documents = _load_pickle('documents.pl')
labels = _load_pickle('labels.pl')
tags = _load_pickle('tags.pl')

# FEATURES

In [5]:
# Read the stop-word list, one word per line.
# splitlines() handles both "\n" and "\r\n" endings, and blank lines are
# dropped so that a trailing newline does not add an empty-string stop word.
# NOTE(review): the file is read with the platform default encoding; pass
# encoding=... explicitly once the file's encoding is confirmed.
with open('./stopwords-de.txt') as f:
    stopwords = [line.strip() for line in f.read().splitlines() if line.strip()]

In [6]:
# Bag-of-words vectorizer: word 1- to 3-grams, accents stripped, stop words
# removed, and n-grams that occur in fewer than 3 documents dropped.
from sklearn.feature_extraction.text import CountVectorizer

# The corpus is NOT a constructor argument: the first constructor parameter
# of CountVectorizer is `input` (and parameters are keyword-only in current
# scikit-learn), so the documents are supplied later via fit_transform().
BoW = CountVectorizer(stop_words=stopwords, strip_accents='unicode', ngram_range=(1, 3), min_df=3)

In [None]:
# Learn the n-gram vocabulary from `documents` and build the document-term
# count matrix (one row per document) in a single pass.
X_bag = BoW.fit_transform(documents)

In [None]:
# Shape of the first document's feature row. Presumably used to read off the
# vocabulary size (second entry); `X_bag.shape` would also show the number of
# documents.
X_bag[0].shape

In [None]:
# Binary target: 1 where the document's tag is 'Prio-Fall', 0 otherwise.
is_prio_fall = np.array(tags) == 'Prio-Fall'
Y = np.array(is_prio_fall, dtype=int)

In [None]:
# Fraction of positive ('Prio-Fall') samples, i.e. the class balance.
# Y is a 0/1 integer array, so its mean is the positive ratio; the ndarray
# method is vectorised, unlike the builtin sum() which iterates element by
# element in Python.
Y.mean()

# XGBoost

In [None]:
import xgboost as xgb

# Booster configuration passed to xgb.train().
params = {
    'objective': 'binary:logistic',  # binary classification, probability output
    'eval_metric': 'logloss',
    'max_delta_step': 1,
    'eta': 0.02,
    'max_depth': 10,
}

In [None]:
# Stratified k-fold cross-validation of the XGBoost classifier.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same
# utilities live in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import class_weight
from sklearn.metrics import f1_score

# Number of stratified folds.
k = 3
# With the model_selection API the splitter is configured first and the data
# are handed to .split(), instead of passing the labels to the constructor.
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

# One F1 score per fold.
results = np.zeros(k)

# Iterate over the k folds.
for i, (train_ind, test_ind) in enumerate(kfold.split(X_bag, Y)):
    X_train, X_val = X_bag[train_ind], X_bag[test_ind]
    Y_train, Y_val = Y[train_ind], Y[test_ind]

    # Per-sample weights that balance the two classes within this fold.
    # compute_sample_weight takes (class_weight, y) and already returns one
    # weight per sample, so no manual per-label lookup is needed.
    sample_weights = class_weight.compute_sample_weight('balanced', Y_train)

    # The weights are attached to the training matrix; the original computed
    # them but never passed them on, so they had no effect on training.
    D_train = xgb.DMatrix(X_train, label=Y_train, weight=sample_weights)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    # The validation set is listed last in the watchlist.
    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, num_boost_round=400, evals=watchlist,
                    early_stopping_rounds=50, verbose_eval=10)

    # NOTE(review): after early stopping the returned booster may contain
    # rounds past the best one; consider restricting prediction to
    # bst.best_iteration -- confirm against the installed xgboost version.
    predictions = bst.predict(data=D_val).round()

    # f1_score expects (y_true, y_pred).
    results[i] = f1_score(Y_val, predictions)
    print('f1 score: {}'.format(results[i]))
    print('accuracy: {}%'.format(np.mean(predictions == Y_val) * 100))

In [None]:
# Mean and variance of the per-fold F1 scores.
results.mean(), results.var()