In [107]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from os import listdir
from os.path import isfile, join
import json

In [108]:
dataset_path = '../../big_data/rb-suppliers-challange/receipts.csv'
texts_path = '../../big_data/rb-suppliers-challange/texts/'

In [124]:
dataset = pd.read_csv(dataset_path, index_col=['id'])

dataset = dataset[~dataset.supplier_name.isnull()]
dataset = dataset[~dataset.currency_code.isnull()]
np.random.seed(1)
shuffled = dataset.reindex(np.random.permutation(dataset.index))

unique_supplier_names = shuffled.supplier_name.unique()
unique_currency_codes = shuffled.currency_code.unique()

In [125]:
file_names = [f.split(".")[0] for f in listdir(texts_path) if isfile(join(texts_path, f)) and f.endswith('.json')]
ids = np.array(list(set([int(f) for f in file_names]).intersection(set(dataset.index.values))))
texts = pd.Series(['' for i in ids], index=ids)
for index in ids:
    json_data = open(texts_path + str(index) + '.json', encoding="utf8")
    data = json.load(json_data)
    try:
        texts[index] = data[0]['textAnnotations'][0]['description']
    except:
        texts[index] = ''
dataset['texts'] = texts

In [126]:
import re
from itertools import combinations

In [127]:
prices = pd.Series([[] for i in ids], index=ids)
for index in ids:
    text = dataset['texts'][index]
    total_amount = dataset['total_amount'][index]
    lines = text.split('\n')
    current_prices = []
    for line_index, line in enumerate(lines):
        found_prices = re.findall('\d+[\.,][\d\.,]+', line)
        found_digits = re.findall('\d', line)
        for item in found_prices:
            try:
                current_price = float(item)
            except:
                continue

            new_price = dict()
            new_price['value'] = current_price
            new_price['is_total'] = 1 if current_price == total_amount else 0
            new_price['digits_count'] = len(found_digits)
            
            sourrounding_text = ''
            if line_index > 0:
                sourrounding_text += lines[line_index-1] + ' '
            sourrounding_text += lines[line_index] + ' '
            if line_index + 1 < len(lines):
                sourrounding_text += lines[line_index+1]
                
            new_price['text'] = sourrounding_text

            current_prices.append(new_price)
                               
    prices[index] = current_prices
    
dataset['prices'] = prices

In [128]:
def get_possible_sums(items):
    result = set()
    filtered = list(item for item in items if item > 0)
    
    start_combinations = max(2, len(filtered) - 5)
    end_combinations = len(filtered) + 1
    
    for length in range(start_combinations, end_combinations):
        for combination in combinations(filtered, length):
            summed = sum(combination)
            # round to avoid float errors
            result.add(round(summed, 3))
    
    return result

In [129]:
for index in ids:
    current_prices = dataset['prices'][index]
    
    possible_sums = get_possible_sums(list(p['value'] for p in current_prices))
    
    for i, price in enumerate(current_prices):
        price['order'] = round(i / (max(len(current_prices) - 1, 1)), 2)
        price['can_be_sum'] = price['value'] in possible_sums

In [130]:
TEXT_INDEX = 0

def get_features_array(price):
    features = []
    features.append(price['text'])
    features.append(price['digits_count'])
    features.append(price['order'])
    features.append(price['can_be_sum'])
    return features

In [131]:
test_items_count = int(len(ids) * (10 / 100))

train_ids = ids[test_items_count:]
X_train = []
Y_train = []
for train_index in train_ids:
    for price in dataset['prices'][train_index]:
        X_train.append(get_features_array(price))
        Y_train.append(1 if price['is_total'] else 0)
        
test_ids = ids[:test_items_count]

In [132]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline

In [133]:
train_texts = list(item[TEXT_INDEX] for item in X_train)

tfidf_lines_classifier = Pipeline([
    ('counts', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())])
tfidf_lines_classifier.fit(train_texts, Y_train)

Pipeline(steps=[('counts', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        str...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [134]:
for index in ids:
    texts = list(str(item['text']) for item in dataset['prices'][index])
    if len(texts) == 0:
        continue
    
    predicted = tfidf_lines_classifier.predict_proba(np.array(texts))
    for price_index, price in enumerate(dataset['prices'][index]):
        price['text'] = float(predicted[price_index][1])

In [135]:
texts = list(str(item[TEXT_INDEX]) for item in X_train)
predicted = tfidf_lines_classifier.predict_proba(np.array(texts))

for price_index, price in enumerate(X_train):
    price[TEXT_INDEX] = float(predicted[price_index][1])

In [136]:
total_amount_classifier = RandomForestClassifier()
total_amount_classifier.fit(np.array(X_train), np.array(Y_train))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [137]:
def predict_totals(receipt_prices, max_top_n):
    predicted_totals = []
    for price in receipt_prices:
        features = get_features_array(price)
        features = np.array(features)
        predicted = total_amount_classifier.predict_proba(features.reshape(1, -1))
        predicted_totals.append((price['value'], predicted[0][1]))
                              
    sorted_predictions = sorted(predicted_totals, key=lambda tup: tup[1], reverse = True)
    top_n_list = list(range(1, max_top_n + 1))
    return tuple(list(tup[0] for tup in sorted_predictions[:top_n]) for top_n in top_n_list)

In [138]:
max_top_n = 3

total_counts = list(0 for _ in range(max_top_n))
correct_counts = list(0 for _ in range(max_top_n))

for index in test_ids:
    total_amount = dataset['total_amount'][index]
    pridictions_tup = predict_totals(dataset['prices'][index], max_top_n)
    
    for top_n, pred_list in enumerate(pridictions_tup):
        total_counts[top_n] += 1
        if total_amount in pred_list:
            correct_counts[top_n] += 1
        
for pred_index, pred_result in enumerate(zip(correct_counts, total_counts)):
    print('TOP', str(pred_index + 1), 'Accuracy:', round(pred_result[0] / pred_result[1], 3))

TOP 1 Accuracy: 0.532
TOP 2 Accuracy: 0.606
TOP 3 Accuracy: 0.621


In [139]:
#BASELINE!

import random

def predict_random_totals(receipt_prices, max_top_n):
    predicted_totals = []
    for price in receipt_prices:
        predicted_totals.append((price['value'], random.random()))
                              
    sorted_predictions = sorted(predicted_totals, key=lambda tup: tup[1], reverse = True)
    top_n_list = list(range(1, max_top_n + 1))
    return tuple(list(tup[0] for tup in sorted_predictions[:top_n]) for top_n in top_n_list)

max_top_n = 3

total_counts = list(0 for _ in range(max_top_n))
correct_counts = list(0 for _ in range(max_top_n))

for index in test_ids:
    total_amount = dataset['total_amount'][index]
    pridictions_tup = predict_random_totals(dataset['prices'][index], max_top_n)
    
    for top_n, pred_list in enumerate(pridictions_tup):
        total_counts[top_n] += 1
        if total_amount in pred_list:
            correct_counts[top_n] += 1
        
for pred_index, pred_result in enumerate(zip(correct_counts, total_counts)):
    print('TOP', str(pred_index + 1), 'Accuracy:', round(pred_result[0] / pred_result[1], 3))

TOP 1 Accuracy: 0.33
TOP 2 Accuracy: 0.414
TOP 3 Accuracy: 0.494


In [99]:
correct = 0
predictions_for_total = []
predictions_for_non_total = []
for x_t, y_t in zip(np.array(X_test), np.array(Y_test)):
    predicted = total_amount_classifier.predict_proba(x_t.reshape(1, -1))[0][1]
    if y_t == 1:
        predictions_for_total.append(predicted)
    else:
        predictions_for_non_total.append(predicted)
            
print('Average for totals: ', float(sum(predictions_for_total)) / len(predictions_for_total))
print('Average for non totals: ', float(sum(predictions_for_total)) / len(predictions_for_non_total))

Average for totals:  0.6688385352919202
Average for non totals:  0.21464159298766491
