# Demonstrate Data Pipeline

### Load modules

In [31]:
import re
import json
import random
from dotenv import load_dotenv

load_dotenv() 

from convfinqa.document_processing import execute_document_processing
from convfinqa.query_processing import execute_query_processing
from convfinqa.calculate import execute_calculation

### Load Data

In [32]:
# Load data
train_js = json.load(open("data/train.json", "r"))

In [33]:
# Get 4 examples of question / steps pairs
random.seed(10)

random_sample = random.sample(train_js, 4)

test_js = [i for i in train_js if i not in random_sample]

# Small sample to test MVP

In [34]:
# Get type 1 documents
random.seed(3)

type_1s = [t for t in test_js if t.get("qa", None) is not None]
sub_sample = random.sample(type_1s, 50)

# Preprocess document

In [69]:
# Process documents
processed_documents = [execute_document_processing(sample) for sample in sub_sample]

# Generate calculation steps

In [36]:
# Questions
questions = [q["qa"]["question"] for q in sub_sample] 

In [68]:
# Fetch calculations
calculations = [execute_query_processing(question=questions[idx], data=processed_document) for idx, processed_document in enumerate(processed_documents)]

# Calculate & Compare

In [61]:
def compare_numbers_with_factors(answer, calculation):
    # Check if they are equal after rounding
    if round(answer, 2) == round(calculation, 2):
        return True
    
    # Check if they are off by a factor of 10
    factors = [0.1, 0.01, 1, 10, 100, 0.001, 1000]
    for factor in factors:
        if round(answer, 2) == round(calculation * factor, 2):
            return True
        if round(answer, 2) == round(calculation / factor, 2):
            return True
    
    # Check if they are negatives of each other
    if round(answer, 2) == round(-calculation, 2):
        return True
    
    return False


# Compare answers
answers = [q["qa"]["answer"] for q in sub_sample] 

counter = 0
rough_counter = 0
for idx, calculation in enumerate(calculations):
    calculation = execute_calculation(calculation)
    number = float(re.findall(r'\d+(?:\.\d+)?', answers[idx])[0]) if re.findall(r'\d+(?:\.\d+)?', answers[idx]) else 0

    print(f"Answer: {number}, Calculated: {calculation}")

    number = round(number)
    calculation = round(calculation)

    if number == calculation:
        counter += 1

    if compare_numbers_with_factors(number, calculation):
        rough_counter += 1


Answer: 54.0, Calculated: 0.521487204249155
Answer: 36.7, Calculated: 36.666666666666664
Answer: 733.35, Calculated: 866250000.0
Answer: 15.1, Calculated: 15.11627906976744
Answer: 11.5, Calculated: -0.11349437734277384
Answer: 11.3, Calculated: -11.349437734277384
Answer: 26.0, Calculated: 25.98670825986708
Answer: 21.5, Calculated: 0.21480000000000005
Answer: 881.7, Calculated: 881.7443999999999
Answer: 20.2, Calculated: 20202.0
Answer: 13.2, Calculated: 0.13183385322607125
Answer: 52.2, Calculated: 0.5224930277676731
Answer: 83.6, Calculated: 1631.0
Answer: 154.0, Calculated: 154.0
Answer: 3.4, Calculated: 33720.0
Answer: 22.9, Calculated: 22.9
Answer: 4.75, Calculated: 0.00475
Answer: 0.6, Calculated: -0.5611035286309944
Answer: 28125000.0, Calculated: 29812500.0
Answer: 5.5, Calculated: 5.501618122977356
Answer: 27000000.0, Calculated: -27.0
Answer: 20.5, Calculated: 20.6
Answer: 6.1, Calculated: 6.134205200062276
Answer: 10.41, Calculated: 109608.0
Answer: 137.8, Calculated: 137.

In [65]:
# Percentage of numbers that are exactly equal
(counter / 50)*100

30.0

In [66]:
# Where numbers are off by a factor of 10
(rough_counter / 50)*100

34.0

In [67]:
# Combined
((counter + rough_counter) / 50)*100

64.0