In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
dataset_path = '../../big_data/rb-suppliers-challange/receipts.csv'
texts_path = '../../big_data/rb-suppliers-challange/texts/'

In [6]:
dataset = pd.read_csv(dataset_path, index_col=['id'])

dataset = dataset[~dataset.supplier_name.isnull()]
dataset = dataset[~dataset.currency_code.isnull()]
np.random.seed(1)
shuffled = dataset.reindex(np.random.permutation(dataset.index))

unique_supplier_names = shuffled.supplier_name.unique()
unique_currency_codes = shuffled.currency_code.unique()

In [7]:
from os import listdir
from os.path import isfile, join
import json

In [8]:
file_names = [f.split(".")[0] for f in listdir(texts_path) if isfile(join(texts_path, f)) and f.endswith('.json')]
ids = np.array(list(set([int(f) for f in file_names]).intersection(set(dataset.index.values))))
texts = pd.Series(['' for i in ids], index=ids)
for index in ids:
    json_data = open(texts_path + str(index) + '.json', encoding="utf8")
    data = json.load(json_data)
    try:
        texts[index] = data[0]['textAnnotations'][0]['description']
    except:
        texts[index] = ''
dataset['texts'] = texts

In [9]:
import re

In [10]:
prices = pd.Series([[] for i in ids], index=ids)
for index in ids:
    text = dataset['texts'][index]
    total_amount = dataset['total_amount'][index]
    lines = text.split('\n')
    current_prices = []
    for line in lines:
        found_prices = re.findall('\d[\d\.,]+', line)
        found_digits = re.findall('\d', line)
        if len(found_prices) >= 1:
            for item in found_prices:
                try:
                    current_price = float(item)
                except:
                    break
                
                new_price = dict()
                new_price['value'] = current_price
                new_price['line'] = line
                new_price['is_total'] = 1 if current_price == total_amount else 0
                new_price['has_dot'] = '.' in line
                new_price['has_comma'] = ',' in line
                new_price['has_total_word'] = 'total' in line.lower()
                new_price['digits_count'] = len(found_digits)
                
                current_prices.append(new_price)
    
    for i, price in enumerate(current_prices):
        if len(current_prices) == 1:
            price['order'] = 1.00
        else:
            price['order'] = round(i / (len(current_prices) - 1), 2)
        
    prices[index] = current_prices
    
dataset['prices'] = prices

In [11]:
X_data = []
Y_data = []

for id_index in ids:
    for price_index, price in enumerate(dataset['prices'][id_index]):
        # new_index = str(id_index) + '_' + str(price_index)
        
        features = []
        features.append(price['has_dot'])
        features.append(price['has_comma'])
        features.append(price['has_total_word'])
        features.append(price['digits_count'])
        features.append(price['order'])
        X_data.append(features)
        
        Y_data.append(1 if price['is_total'] else 0)

In [13]:
test_items_count = int(len(X_data) * (10 / 100))
X_train =  np.array(X_data[test_items_count:])
X_test = np.array(X_data[:test_items_count])
Y_train = np.array(Y_data[test_items_count:])
Y_test = np.array(Y_data[:test_items_count])

In [14]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [15]:
total_amount_classifier = RandomForestClassifier()
total_amount_classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [17]:
correct = 0
predictions_for_total = []
predictions_for_non_total = []
for x_t, y_t in zip(X_test, Y_test):
    predicted = total_amount_classifier.predict_proba(x_t.reshape(1, -1))
    if y_t == 1:
        predictions_for_total.append(predicted[1])
    else:
        predictions_for_non_total.append(predicted[1])
            
print('Average for totals: ', float(sum(predictions_for_total)) / len(predictions_for_total))
print('Average for non totals: ', float(sum(predictions_for_total)) / len(predictions_for_non_total))

TypeError: only length-1 arrays can be converted to Python scalars