In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.base import TransformerMixin,BaseEstimator
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [4]:
import os

In [5]:
os.chdir('dineral')

In [6]:
from internaldata import Database, Classifier



In [7]:
clf = Classifier()

In [8]:
os.chdir('..')

In [9]:
db=Database()
data = db.load_data()

In [10]:
data.head()

Unnamed: 0,Hash,Datum,Text,Lastschrift,Deleted,Kategorie
0,3c3550b40b42a3367a2cb4b29d28412d,2014-01-03,01-37897-1 100000013241880130751593563\nAssur...,176.75,False,Krankenkasse
1,868f4d0ba29c129f408f979f3a9f8e0e,2014-01-03,BARGELDBEZUG\nVOM 01.01.2014\nKARTEN NR. 64186...,200.0,True,
2,6848fb24b03ed71e56d98fd8f3af3401,2014-01-03,Zürcher Kantonalbank CH2300700350040352767 K...,35.0,False,Anschaffungen
3,5c42bb51a03c0c035f074a9a9b8ddcbb,2014-01-03,KAUF/DIENSTLEISTUNG\nVOM 31.12.2013\nKARTEN NR...,58.0,False,Ausgang
4,cf97222b176cb612026a061b0757d230,2014-01-05,ÜBERTRAG\nAUS KONTO 92-900275-2\nSCHOCH TOBIAS...,-1000.0,False,Sparen


In [11]:
class Scaler(BaseEstimator,TransformerMixin):
    
    def fit(self, X, y=None, **fit_params):
        self._factor = np.max(np.abs(X))
        return self
    
    def transform(self, X, y=None, **fit_params):
        return np.matrix((X/self._factor)).T

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

In [12]:
class ItemSelector(BaseEstimator,TransformerMixin):
    
    def __init__(self, key=0):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data):
        return data[self.key]

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

In [13]:
class DenseTransformer(BaseEstimator,TransformerMixin):

        def transform(self, X, y=None, **fit_params):
            return X.todense()

        def fit_transform(self, X, y=None, **fit_params):
            self.fit(X, y, **fit_params)
            return self.transform(X)

        def fit(self, X, y=None, **fit_params):
            return self

In [14]:
features = FeatureUnion([('text',Pipeline([('select',ItemSelector('Text')),
                                           ('vect',CountVectorizer(analyzer='char_wb',lowercase=True, strip_accents='unicode')),
                                           ('trans',TfidfTransformer(use_idf=True)),
                                           ('dense',DenseTransformer())])),
                         ('number',Pipeline([('select',ItemSelector('Lastschrift')),('scale',Scaler())]))])

In [15]:
features

FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(steps=[('select', ItemSelector(key='Text')), ('vect', CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
...])), ('number', Pipeline(steps=[('select', ItemSelector(key='Lastschrift')), ('scale', Scaler())]))],
       transformer_weights=None)

In [16]:
data['Kategorie'] = data.Kategorie.cat.add_categories([u'Delete'])

In [17]:
data.Kategorie[data.Kategorie.isnull()]=u'Delete'

In [18]:
data.Kategorie[data.Deleted]=np.nan

In [19]:
categories = data.Kategorie.cat.categories
target = data.Kategorie.cat.codes

In [22]:
data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.05)

In [23]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    xtrain = features.fit_transform(data_train)
    xtest = features.fit_transform(data_test)

In [25]:
from __future__ import division, print_function, absolute_import

import tflearn
import tensorflow as tf
from tflearn.layers import input_data, merge, dropout, fully_connected, regression
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb


  return f(*args, **kwds)


hdf5 is not supported on this machine (please install/reinstall h5py for optimal experience)


In [26]:
tf.logging.set_verbosity(tf.logging.INFO)

In [27]:
ytrain = to_categorical(target_train, nb_classes=categories.shape[0])

In [28]:
ytest = to_categorical(target_test, nb_classes=categories.shape[0])

In [29]:
MAX_DOCUMENT_LENGTH = 30

# create vocabulary
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
vocab_processor.fit(data_train['Text'])

<tensorflow.contrib.learn.python.learn.preprocessing.text.VocabularyProcessor at 0x7f02184c0630>

In [30]:
xtrain = np.array(list(vocab_processor.transform(data_train['Text'])))

In [31]:
ytrain.shape

(2250, 26)

In [60]:
# Building convolutional network
tf.reset_default_graph()
network = input_data(shape=[None, MAX_DOCUMENT_LENGTH], name='input')
network = tflearn.embedding(network, input_dim=5000, output_dim=32)
network = conv_1d(network, 16, 3, padding='valid', activation='relu', regularizer="L2")
#network = conv_1d(network, 16, 2, padding='valid', activation='relu', regularizer="L2")
#network = merge([branch1, branch2], mode='concat', axis=1)
#network = tf.expand_dims(network, 2)
#network = global_max_pool(network)
network = fully_connected(network, ytrain.shape[1], activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.01,
                     loss='categorical_crossentropy', name='target')

In [61]:
# Training
model = tflearn.DNN(network, tensorboard_verbose=0)
model.fit(xtrain, ytrain, n_epoch =  30, shuffle=True, validation_set=0.1, show_metric=True, batch_size=50)

Training Step: 1229  | total loss: [1m[32m0.03246[0m[0m | time: 0.253s
| Adam | epoch: 030 | loss: 0.03246 - acc: 0.9908 -- iter: 2000/2025
Training Step: 1230  | total loss: [1m[32m0.03087[0m[0m | time: 1.261s
| Adam | epoch: 030 | loss: 0.03087 - acc: 0.9908 | val_loss: 1.29302 - val_acc: 0.7333 -- iter: 2025/2025
--


In [48]:
xtest = np.array(list(vocab_processor.transform(data_test['Text'])))

In [49]:
preds = categories[model.predict_label(xtest)[:,0]]
print(metrics.classification_report(categories[target_test],preds))

                   precision    recall  f1-score   support

    Anschaffungen       0.67      0.67      0.67         3
       Ausbildung       0.00      0.00      0.00         0
          Ausgang       0.64      0.64      0.64        11
       Bekleidung       0.50      0.40      0.44         5
           Bussen       0.00      0.00      0.00         0
           Bücher       0.50      1.00      0.67         1
           Delete       1.00      0.69      0.82        13
        Eishockey       0.58      1.00      0.74         7
            Essen       0.87      0.82      0.84        33
         Gebühren       1.00      1.00      1.00         3
        Geschenke       0.50      0.33      0.40         9
     Krankenkasse       1.00      1.00      1.00         2
             Lohn       1.00      0.67      0.80         3
          Medizin       0.00      0.00      0.00         0
            Miete       1.00      0.33      0.50         6
     Mobiltelefon       1.00      1.00      1.00       

  'recall', 'true', average, warn_for)


In [128]:
preds = categories[model.predict_label(xtrain)[:,0]]
print(metrics.classification_report(categories[target_train],preds))

                   precision    recall  f1-score   support

    Anschaffungen       0.93      0.93      0.93        54
       Ausbildung       0.95      0.95      0.95        20
          Ausgang       0.99      0.98      0.98       251
       Bekleidung       0.93      0.92      0.92        60
           Bussen       1.00      1.00      1.00         4
           Bücher       1.00      1.00      1.00        13
           Delete       0.99      0.99      0.99       201
        Eishockey       0.96      0.98      0.97        81
            Essen       0.97      0.99      0.98       522
         Gebühren       1.00      0.97      0.99        38
        Geschenke       0.97      0.97      0.97        80
         Hochzeit       0.96      0.88      0.92        25
     Krankenkasse       1.00      1.00      1.00        58
             Lohn       1.00      0.98      0.99        66
          Medizin       1.00      0.96      0.98        24
            Miete       0.95      0.98      0.96       