In [1]:
import pandas as pd
import numpy as np

In [2]:
import warnings

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

In [4]:
from sklearn.base import TransformerMixin,BaseEstimator

In [5]:
from sklearn.decomposition import pca

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
from sklearn import svm
import sklearn.ensemble as ens

In [9]:
import os

In [10]:
os.chdir('dineral')

In [11]:
from internaldata import Database, Classifier

In [12]:
clf = Classifier()

In [13]:
os.chdir('..')

In [14]:
db=Database()
data = db.load_data()

In [15]:
data.head()

Unnamed: 0,Datum,Deleted,Hash,Kategorie,Lastschrift,Text
0,1971-01-01,True,xxx,Deleted,0.0,Dummy
1,2016-01-04,False,72073c2b5a265cc22dfd83e09a0ce151,Möbel,676.33,E-Banking Auftrag (Kontoübertrag) \nTobias Sch...
2,2016-01-29,False,6d189ab59af509847b56e06d8990da7c,Miete,-1400.0,Postvergütung von Schoch Tobias
3,2016-01-29,False,dc7509b8a034a5511ea0cccd851ca45f,Miete,-1400.0,Postvergütung von Alos Colomer Nuria
4,2016-02-01,False,fccd4a6a1962305abccc92e88f4128d9,Miete,2420.0,E-Banking Dauerauftrag an Ronald Schmid


In [16]:
class Scaler(BaseEstimator,TransformerMixin):
    
    def fit(self, X, y=None, **fit_params):
        self._factor = np.max(np.abs(X))
        return self
    
    def transform(self, X, y=None, **fit_params):
        return np.matrix((X/self._factor)).T

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

In [17]:
class ItemSelector(BaseEstimator,TransformerMixin):
    
    def __init__(self, key=0):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data):
        return data[self.key]

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

In [18]:
class DenseTransformer(BaseEstimator,TransformerMixin):

        def transform(self, X, y=None, **fit_params):
            return X.todense()

        def fit_transform(self, X, y=None, **fit_params):
            self.fit(X, y, **fit_params)
            return self.transform(X)

        def fit(self, X, y=None, **fit_params):
            return self

## Feature Pipeline

In [19]:
features = FeatureUnion([('text',Pipeline([('select',ItemSelector('Text')),
                                           ('vect',CountVectorizer(analyzer='char_wb',lowercase=True, strip_accents='unicode')),
                                           ('trans',TfidfTransformer(use_idf=True)),
                                           ('dense',DenseTransformer())])),
                         ('number',Pipeline([('select',ItemSelector('Lastschrift')),('scale',Scaler())]))])

In [20]:
features = FeatureUnion([('text',Pipeline([('select',ItemSelector('Text')),
                                           ('vect',CountVectorizer(analyzer='char_wb',lowercase=True, strip_accents='unicode')),
                                           ('trans',TfidfTransformer(use_idf=True)),
                                           ('dense',DenseTransformer())]))])

In [21]:
features = Pipeline([('vect',CountVectorizer(ngram_range=(4,5),analyzer='char_wb',lowercase=True, strip_accents='unicode')),
                     ('trans',TfidfTransformer(use_idf=True))])

### prepare data

In [22]:
data['Kategorie'] = data.Kategorie.cat.add_categories([u'Delete'])

In [23]:
data.Kategorie[data.Kategorie.isnull()]=u'Delete'

In [24]:
data.Kategorie[data.Deleted]=np.nan

In [25]:
categories = data.Kategorie.cat.categories
target = data.Kategorie.cat.codes

### test train split

In [26]:
data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.05)

extract features

In [27]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    xtrain = features.fit_transform(data_train)
    xtest = features.fit_transform(data_test)

In [34]:
text_clf = Pipeline([('features',features),('clf',SGDClassifier())])

In [35]:
text_clf.fit(data_train, target_train, alpha=0.003)

ValueError: not enough values to unpack (expected 2, got 1)

In [36]:
text_clf.fit(data_train)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


ValueError: bad input shape ()

In [37]:
parameters = {
              'features__vect__ngram_range':[(i,j) for j in range(3,7) for i in range(1,j)],
              'clf__alpha': np.linspace(3e-3,1e-4,20)
             }

In [38]:
text_clf.fit(data_train['Text'],target_train)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Pipeline(steps=[('features', Pipeline(steps=[('vect', CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(4, 5), preprocessor=None...   penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False))])

## Grid search

In [None]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    gs_clf = GridSearchCV(text_clf,parameters,cv=5,verbose=True, n_jobs=8)
    gs_clf.fit(data_train['Text'],target_train)

Fitting 5 folds for each of 280 candidates, totalling 1400 fits


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   11.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   26.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:   52.8s


In [None]:
print(gs_clf.best_params_)

In [None]:
preds = gs_clf.predict(data_test['Text'])

In [None]:
preds_all = gs_clf.predict(data['Text'])

In [None]:
print(metrics.classification_report(categories[target_test],categories[preds]))

In [63]:
pd.DataFrame(metrics.confusion_matrix(categories[target_test],categories[preds],labels=categories),columns=categories,index=categories)

Unnamed: 0,Anschaffungen,Ausbildung,Ausgang,Bekleidung,Bussen,Bücher,Eishockey,Essen,Gebühren,Geschenke,...,Reisen/Ausflüge,Schulden,Sparen,Spenden,Sport allgemein,Steuern,Transport,Vorsorge,Wellness/Coiffeur,Delete
Anschaffungen,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
Ausbildung,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Ausgang,0,0,14,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
Bekleidung,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
Bussen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Bücher,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Eishockey,0,0,0,0,0,0,4,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Essen,0,0,3,0,0,0,0,28,0,0,...,0,0,0,0,0,0,0,0,0,0
Gebühren,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Geschenke,0,0,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [63]:
import datetime

In [64]:
text_clf = gs_clf.best_estimator_
text_clf.classes_names = categories

text_clf.TRAINING_DATE = datetime.datetime.now()
text_clf.TRAINING_SAMPLES = data_train.shape[0]
text_clf.TEST_SAMPLES = data_test.shape[0]
text_clf.SCORE = metrics.precision_recall_fscore_support(target_test,preds)

In [65]:
import pickle
with open(clf.properties,"wb+") as fp:
    pickle.dump(text_clf,fp)