In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
r = "..\"

 Volume in drive D is Data
 Volume Serial Number is 8665-F573

 Directory of D:\Projects\datathon-receipt-bank\src

25.03.2017 Ј.  12:05    <DIR>          .
25.03.2017 Ј.  12:05    <DIR>          ..
25.03.2017 Ј.  12:03    <DIR>          .ipynb_checkpoints
25.03.2017 Ј.  12:05             1я084 Main.ipynb
               1 File(s)          1я084 bytes
               3 Dir(s)  514я577я948я672 bytes free


In [10]:
dataset_path = '../big_data/rb-suppliers-challange/receipts.csv'
texts_path = '../big_data/rb-suppliers-challange/texts/'

In [11]:
dataset = pd.read_csv(dataset_path, index_col=['id'])
print("Initial shape:", dataset.shape)

dataset = dataset[~dataset.supplier_name.isnull()]
dataset = dataset[~dataset.currency_code.isnull()]
np.random.seed(1)
shuffled = dataset.reindex(np.random.permutation(dataset.index))
print("Shape after filter:", dataset.shape)
print("Columns names:")
print(dataset.columns.values)

unique_supplier_names = shuffled.supplier_name.unique()
unique_currency_codes = shuffled.currency_code.unique()

print("Dataset size {0}".format(len(shuffled)))
print("Unique supplier names {0}".format(len(unique_supplier_names)))
print("Unique currency names {0}".format(len(unique_currency_codes)))

print("Currency codes:")
print(unique_currency_codes)

Initial shape: (9009, 15)
Shape after filter: (9009, 15)
Columns names:
['id.1' 'created_at' 'currency_code' 'total_amount' 'vat_amount' 'date'
 'due_date' 'invoice_number' 'received_via' 'supplier_name' 'ocr_method'
 'manual_review' 'account_default_currency' 'payment_type' 'document_type']
Dataset size 9009
Unique supplier names 1948
Unique currency names 20
Currency codes:
['AUD' 'USD' 'GBP' 'CAD' 'NZD' 'EUR' 'ZAR' 'HKD' 'BGN' 'AED' 'JPY' 'RUB'
 'SGD' 'BRL' 'UAH' 'IDR' 'THB' 'CNY' 'INR' 'NOK']


In [12]:
def find_class_frequencies(dataset, group_col='currency_code'):
    codes = dataset.groupby(group_col).size()
    return codes

currency_codes = find_class_frequencies(dataset)
print('Unique currencies:', len(currency_codes))
print(currency_codes)

currency_names = currency_codes.index.values
currency_counts = currency_codes.values
# plt.bar(range(len(currency_names.tolist())), currency_counts.tolist(), align='edge', alpha=0.5, tick_label=currency_names.tolist())
# plt.xticks(rotation=70);

Unique currencies: 20
currency_code
AED       7
AUD     800
BGN     294
BRL       1
CAD      88
CNY       1
EUR     443
GBP    5658
HKD       9
IDR       1
INR       1
JPY       1
NOK       1
NZD     362
RUB       3
SGD      11
THB       1
UAH       2
USD    1215
ZAR     110
dtype: int64


In [14]:
suppliers = find_class_frequencies(dataset, 'supplier_name')
print('Unique suppliers:', len(currency_codes))
print(suppliers)

suppliers_names = suppliers.index.values
suppliers_counts = suppliers.values
# plt.bar(range(len(suppliers_names)), suppliers_counts, align='edge', alpha=0.5);

Unique suppliers: 20
supplier_name
155 Bar and Kitchen                      2
2Talk                                    1
333 Estates                              1
51st State Tavern                        2
57 Hotel                                 1
7-Eleven                                 8
75 Paris                                 1
A & J Road Services                      1
A Teck Or Tambo                          1
A.W. Curtis                              1
ACE Envelopes                            1
ACR Sales & Marketing                    1
AMT Coffee                               9
ASDA                                     3
ATM                                      4
AVIS                                     2
Aagrah Leopold LLP                       1
Aaroport Services                        1
Abellio Stansted Express                 1
Abode Machester                          2
Accor Hotels                             2
Adam Zweig                               2
Adams              

In [15]:
from os import listdir
from os.path import isfile, join
import json

In [None]:
file_names = [f.split(".")[0] for f in listdir(texts_path) if isfile(join(texts_path, f)) and f.endswith('.json')]
ids = np.array(list(set([int(f) for f in file_names]).intersection(set(dataset.index.values))))
texts = pd.Series(['' for i in ids], index=ids)
for index in ids:
    json_data = open(texts_path + str(index) + '.json', encoding="utf8")
    data = json.load(json_data)
    try:
        texts[index] = data[0]['textAnnotations'][0]['description']
    except:
        texts[index] = ''
dataset['texts'] = texts

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [None]:
np.random.seed(1)
shuffled = dataset.reindex(np.random.permutation(dataset.index))

test_slice = int(len(shuffled) * (10 / 100))
training_set = shuffled[test_slice:]
test_set = shuffled[:test_slice]

print("Training set size {0}".format(len(training_set)))
freq = find_class_frequencies(training_set)
print(freq)
print("Trest set size {0}".format(len(test_set)))
freq = find_class_frequencies(test_set)
print(freq)

In [None]:
classifier = Pipeline([
    ('counts', CountVectorizer(max_features=2000000, min_df=2, ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(class_weight='balanced', random_state=1))])

classifier.fit(training_set['texts'], training_set['currency_code'])
print("Test set score {0}".format(classifier.score(test_set['texts'],  test_set['currency_code'])))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
classifier = Pipeline([
    ('counts', CountVectorizer(max_features=2000000, min_df=2, ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=64, n_jobs=4, random_state=1))])

classifier.fit(training_set['texts'], training_set['supplier_name'])
print("Test set score {0}".format(classifier.score(test_set['texts'],  test_set['supplier_name'])))

In [None]:
result_set = test_set.copy()
result_set['prediction'] = classifier.predict(result_set['texts'])

In [None]:
result_set[['supplier_name', 'prediction']]