In [25]:
import numpy as np
import pickle
# Display floats with 4 decimals and suppress scientific notation for small values.
np.set_printoptions(precision=4,suppress=True)
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
N_SAMPLES = 1000  # number of leading records kept from each pickle file


def _load_pickle_head(path, limit=N_SAMPLES):
    """Return the first `limit` items of the object pickled at `path`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read, instead of being left to the garbage collector.
    """
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)[:limit]


documents = _load_pickle_head('documents.pl')
tags = _load_pickle_head('tags.pl')
labels = _load_pickle_head('labels.pl')

# FEATURES

In [27]:
import spacy
import keras

In [29]:
import os

# Load spaCy's German pipeline.
nlp = spacy.load('de')

# FastText: replace the pipeline's vocab with the one stored on disk.
# The location can be overridden with the FASTTEXT_VOCAB_DIR environment
# variable; the default is the original author's local path, so existing
# behaviour is unchanged when the variable is not set.
FASTTEXT_VOCAB_DIR = os.environ.get(
    'FASTTEXT_VOCAB_DIR',
    '/Users/oguzserbetci/.local/share/virtualenvs/mlhackathon--icYboxr/lib/python3.6/site-packages/spacy/data/de-fast/vocab/',
)
nlp.vocab.from_disk(FASTTEXT_VOCAB_DIR)

<spacy.vocab.Vocab at 0x12f631248>

In [30]:
# Embed every document as a list of per-token vectors (one vector per spaCy token).
token_vectors = [[tok.vector for tok in nlp(document)] for document in documents]

# Documents may contain different numbers of tokens, so the nested list can be
# ragged. np.array() on ragged input raises ValueError on NumPy >= 1.24 (and
# silently switches between an object array and a 3-D float array on older
# versions). Build a 1-D object array explicitly so that docs[i] is always the
# list of token vectors of document i and docs[i][j] is a single vector.
docs = np.empty(len(token_vectors), dtype=object)
for i, vectors in enumerate(token_vectors):
    docs[i] = vectors

In [37]:
# Sanity check: show the vector stored for the third token of the first document.
docs[0][2]

array([-0.119 , -0.0328, -0.2378,  0.1706, -0.2219,  0.092 , -0.7   ,
       -0.1201,  0.5505,  0.4706, -0.0538, -0.2432, -0.0371, -0.0529,
       -0.5739,  0.0293, -0.0719, -0.1114, -0.6196, -0.2961,  0.6649,
        0.4769, -0.0018, -0.1757, -0.0676, -0.4982, -0.0188,  0.2169,
       -0.1715, -0.2993,  0.5301,  0.0784,  0.1276, -0.2313, -0.227 ,
       -0.1562,  0.2866,  0.4701,  0.108 ,  0.0181,  0.401 , -0.147 ,
       -0.0786, -0.1445,  0.169 , -0.14  , -0.0652, -0.4676, -0.2353,
       -0.166 ,  0.1309,  0.3498,  0.2842,  0.0272,  0.1394, -0.3507,
       -0.176 ,  0.0944,  0.1464, -0.3408,  0.2305,  0.3354, -0.0541,
       -0.2709, -0.4125,  0.4232, -0.132 , -0.1587,  0.0677,  0.1926,
       -0.1195, -0.2901, -0.1141,  0.445 , -0.2425, -0.2975,  0.1088,
       -0.0694, -0.1729,  0.4553,  0.1636, -0.65  , -0.0381,  0.0194,
       -0.2564, -0.1079,  0.1931,  0.1039, -0.6595, -0.2412, -0.3128,
        0.0627, -0.4009,  0.1297, -0.0957,  0.3742,  0.2748,  0.6883,
       -0.5009, -0.1

In [4]:
# Binary target: 1 where the tag equals 'Prio-Fall', 0 everywhere else.
is_prio_fall = np.array(tags) == 'Prio-Fall'
Y = np.array(is_prio_fall, dtype=int)

In [5]:
# Share of positive (1) entries in Y.
Y.sum() / len(Y)

0.1450676982591876

In [6]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# MODEL

In [None]:
from keras.models import Model
from keras import backend as K
from keras.layers import LSTM, Input, Dense, Activation, Add, Reshape, Lambda, Concatenate, \
                         TimeDistributed, Bidirectional, Masking

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model

from keras.optimizers import Adam
from keras.callbacks import TensorBoard

from keras.models import load_model

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.utils import class_weight
from tqdm import tqdm

In [None]:
def _one_hot_and_pad(sequences, num_classes=10):
    """One-hot encode 1-based class ids per sequence and pad to a common length.

    Each id c in a sequence becomes a one-hot row with a 1 at index c - 1,
    e.g. the ids 1, 1, 2 give rows starting [1,0,0,..], [1,0,0,..], [0,1,0,..]
    (each row has `num_classes` entries).

    Returns a tuple `(padded, lengths)`: `padded` is the zero-padded integer
    array produced by `pad_sequences`, `lengths` holds the length of every
    sequence before padding.
    """
    one_hot = np.array([to_categorical(np.array(seq) - 1, num_classes=num_classes)
                        for seq in sequences])
    lengths = [len(rows) for rows in one_hot]
    padded = pad_sequences(one_hot, dtype=int, truncating='post', padding='post')
    return padded, lengths


# Pad with trailing zeros up to the longest sequence. No `maxlen` is passed,
# so nothing is actually truncated (`truncating='post'` has no effect here).
# NOTE(review): `documents` is fed to nlp() elsewhere in this notebook, i.e. it
# looks like raw text; pad_sequences with dtype=float needs numeric sequences,
# so this probably should use the token vectors instead -- confirm.
X = pad_sequences(documents, dtype=float, truncating='post',padding='post')

# NOTE(review): `tags` is compared against the string 'Prio-Fall' elsewhere,
# while the encoding below subtracts 1 from every entry -- confirm that
# `labels` / `tags` really hold sequences of 1-based integer ids here.
Yl, Yl_ind = _one_hot_and_pad(labels)
Yt, Yt_ind = _one_hot_and_pad(tags)

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

## XGBoost

In [17]:
# XGBoost training configuration, passed unchanged to xgb.train().
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.07,
    'max_depth': 50,
    'min_child_weight': 6,
    'lambda': 1,
    'nthread': 4,
}

In [None]:
# Stratified k-fold cross-validation of the XGBoost classifier.
# Imported here because this cell uses the sklearn.model_selection splitter
# API (n_splits + .split()); the old positional form StratifiedKFold(Y, k, ...)
# belongs to the removed sklearn.cross_validation module.
from sklearn.model_selection import StratifiedKFold

k = 3
kfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

# One row per fold: [F1, accuracy, accuracy on the positive (Prio) samples].
results = np.zeros((k, 3))

# Iterate over the k folds.
for i, (train_ind, test_ind) in enumerate(kfold.split(X_bag, Y)):
    X_train, X_val = X_bag[train_ind], X_bag[test_ind]
    Y_train, Y_val = Y[train_ind], Y[test_ind]

    # Oversample only the training fold; the validation fold keeps its
    # original class distribution.
    # NOTE(review): newer imbalanced-learn releases name this method
    # `fit_resample` -- confirm against the installed version.
    ros = RandomOverSampler(random_state=0)
    X_resampled, Y_resampled = ros.fit_sample(X_train, Y_train)
    print(X_resampled.shape, X_val.shape, Y_resampled.shape, Y_val.shape)

    D_train = xgb.DMatrix(X_resampled, label=Y_resampled)
    D_val = xgb.DMatrix(X_val, label=Y_val)

    watchlist = [(D_train, 'train'), (D_val, 'valid')]
    bst = xgb.train(params, D_train, 400, watchlist, early_stopping_rounds=50, verbose_eval=10)
    # Round the predicted scores to hard 0/1 labels.
    predictions = bst.predict(data=D_val).round()

    # Store the score under its own name: rebinding `f1_score` would replace
    # the imported function with a float and break the next fold.
    f1 = f1_score(Y_val, predictions)
    acc = accuracy_score(Y_val, predictions)
    # Accuracy restricted to the true positives of the validation fold.
    prio_acc = accuracy_score(Y_val[Y_val==1], predictions[Y_val==1])
    results[i] = [f1, acc, prio_acc]
    print('f1 score: {}'.format(f1))
    # `acc` is a fraction, so no percent sign (the old '\%' was an invalid escape).
    print('accuracy: {}'.format(acc))
    print('Prio accuracy: {}'.format(prio_acc))
    print(confusion_matrix(Y_val, predictions))

(124348, 321767) (36363, 321767) (124348,) (36363,)
[0]	train-logloss:0.668219	valid-logloss:0.672347
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[10]	train-logloss:0.516264	valid-logloss:0.548634
[20]	train-logloss:0.445863	valid-logloss:0.495861
[30]	train-logloss:0.406458	valid-logloss:0.468739


In [None]:
# Mean and variance across folds, computed separately for each metric.
# `results` has one row per fold and one column per metric
# ([F1, accuracy, Prio accuracy]); reducing without an axis would average
# the three different metrics into one meaningless number.
np.mean(results, axis=0), np.var(results, axis=0)