# Import Data

Download the data `train.json` and place it in the "data folder".

In [1]:
import os
train_file = os.path.join("..", "data", "train.json")

In [2]:
import json

with open(train_file) as f:
    train_data = json.load(f)

In [3]:
len(train_data)

3037

# Prepare the Dataframe

In [5]:
import pandas as pd

rows = []
for datapoint in train_data:
    keys = [key for key in datapoint.keys() if "qa" in key]
    for key in keys:
        row = {}
        row["pre_text"] = " ".join(datapoint['pre_text'])
        row["table"] = str(datapoint['table_ori'])
        row["post_text"] = " ".join(datapoint['post_text'])
        row["question"] = datapoint[key]['question']
        row["answer"] = datapoint[key]['answer']
        rows.append(row)

df_train = pd.DataFrame(rows)

In [6]:
len(df_train)

3965

Since the set is quite large, we select a random sample from it with which we will play. This should keep times and costs under control.

In [7]:
seed_value = 99
df_sample = df_train.sample(n=100, random_state=seed_value)

# Testing Framework

To evaluate a model's output, we will run it against all entries of `df_sample`, getting the output for each row, and comparing the prediciton with the `answer` column (after some cleaning, such as removing characters such as "%" and rounding the numbers to 1 decimal place). We create a column `score`, which is $1$ if output and answer coincide, and 0 otherwise. Finally, we will simply compute the average of the the `score` column, to get the % of questions that were answered correctly.

In [8]:
import re
import numpy as np
import time

class TestingFramework:

    def __init__(self, data, model):
        self.data = data.copy()
        self.model = model

    def evaluate(self):
        self.data[['row_output', 'time', 'cost']] = self.data.apply(self.model.predict, axis=1, result_type='expand')
        self.data["clean_output"] = self.data["row_output"].apply(self.extract_float_from_string)
        self.data["clean_answer"] = self.data["answer"].apply(self.extract_float_from_string)
        
        self.data["score"] = np.where(self.data["clean_output"] == self.data["clean_answer"], 1, 0)

    # UTILITIES

    def extract_float_from_string(self, input_string):
        float_pattern = r'[-+]?\d*\.\d+|\d+'
        matches = re.findall(float_pattern, input_string)
        floats = [float(match) for match in matches]
        if len(floats) > 0:
            return np.around(floats[0], 1)
        else:
            return np.nan

    def get_scores(self):
        score_statistics = self.data["score"].describe()
        time_statistics = self.data["time"].describe()
        cost_statistics = self.data["cost"].describe()

        statistics = pd.concat([score_statistics, time_statistics, cost_statistics], axis=1)
        return statistics

Note that the `evaluate` function relies on the model having a method `predict` that, given a row of the dataframe, returns the output, the time it took to compute it, and the cost (for non-open-source models).

Let's test thel class with a couple of "trivial" models:

In [9]:
class Trivial:

    def __init__(self, name, desired_score):
        self.name = name
        self.desired_score = desired_score

    def predict(self, x):

        start_time = time.time()
        
        coin_toss = np.random.choice([0, 1], size=1, p=[1-self.desired_score, self.desired_score])
        if coin_toss == 1:
            output = x["answer"]
        else:
            output = ""
            
        end_time = time.time()
        
        elapsed_time = end_time - start_time
        cost = 0

        return output, elapsed_time, cost

In [10]:
cheater = Trivial("cheater", 1)
testing_cheater = TestingFramework(df_sample, cheater)

testing_cheater.evaluate()
testing_cheater.get_scores()

Unnamed: 0,score,time,cost
count,100.0,100.0,100.0
mean,1.0,5.3e-05,0.0
std,0.0,4.9e-05,0.0
min,1.0,1.2e-05,0.0
25%,1.0,4e-05,0.0
50%,1.0,4.3e-05,0.0
75%,1.0,5.3e-05,0.0
max,1.0,0.000443,0.0


In [11]:
fiftyfifty = Trivial("fifty-fifty", 0.5)
testing_fiftyfifty = TestingFramework(df_sample, fiftyfifty)

testing_fiftyfifty.evaluate()
testing_fiftyfifty.get_scores()

Unnamed: 0,score,time,cost
count,100.0,100.0,100.0
mean,0.48,4.3e-05,0.0
std,0.502117,3e-05,0.0
min,0.0,3e-05,0.0
25%,0.0,3.4e-05,0.0
50%,0.0,3.7e-05,0.0
75%,1.0,4.1e-05,0.0
max,1.0,0.000306,0.0


Ok, it works well. We can now use the framework to test actual models!

# Llama2

To get a free baseline, let's start with an open source model, such as Llama2.

In [12]:
from llama_cpp import Llama

In [13]:
class llama2LLM:

    def __init__(self, name, model):
        self.name = name
        self.model = model

    def predict(self, x):
        
        start_time = time.time()
        output, cost = self.create_chat_completion(x)
        end_time = time.time()
        
        elapsed_time = end_time - start_time

        return output, elapsed_time, cost

    # UTILITIES

    def create_chat_completion(self, x):

        instructions = "Answer the user's question returning only the numerical value: do not provide any textual comments."
            
        question = f"QUESTION: {x['question']}"
        context = f"INFORMATION:\n'''\n{x['table']}\n'''"

        query = question + "\n\n" + context

        try:
            
            response = self.model.create_chat_completion(
                messages = [
                    {
                        "role": "system",
                        "content": instructions
                    },
                    {
                        "role": "user",
                        "content": query
                    }
                ]
            )
            
            output = response["choices"][0]["message"]["content"]
            cost = 0

            return output, cost
        
        except:

            return "", 0

A model that can run locally in a reasonable amount of time is 7b-Q2. You can download it from [https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q2_K.gguf].

In [14]:
model_path = "../models/llama2/llama-2-7b-chat.Q2_K.gguf"
model = Llama(model_path=model_path, chat_format="llama-2")

llm = llama2LLM("7b-Q2_K", model)
testing = TestingFramework(df_sample, llm)

testing.evaluate()

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../models/llama2/llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32 

In [15]:
df_scores = testing.get_scores()
df_scores

Unnamed: 0,score,time,cost
count,100.0,100.0,100.0
mean,0.0,42.759319,0.0
std,0.0,19.459063,0.0
min,0.0,0.000705,0.0
25%,0.0,27.947745,0.0
50%,0.0,40.444605,0.0
75%,0.0,59.778341,0.0
max,0.0,84.137024,0.0


The time taken is very large, and the results are exceedingly bad. What is the reason for this? Let's take a look at the row output:

In [16]:
testing.data["row_output"]

3599      Sure! Here are the numerical values for the ...
10        Sure! According to the information provided ...
3410      The numerical value for the percentage of ch...
521       The percentage change in the allowance for d...
2679      Sure! The percent of growth in total net rev...
                              ...                        
1310                                                -$9.3
1711      The 2010 balance for securities purchased un...
3567      Sure! The percentage change in Entergy New O...
1431      Sure! The growth rate of debt to capital rat...
2867      The percentage change in total operating exp...
Name: row_output, Length: 100, dtype: object

We see that the LLM does not follow the instruction of outputting just a number. Moreover, a deeper analysis reveals that the computations described in the output are usually incomplete, so using another LLM to simply extract the numerical result from the text is not an option.

# OpenAI

In this section we consider OpenAI's models, expected to be better (both in terms of accuracy and time elapsed), though they required paying.

While the code below has been generated using my personal OpenAI API key, I am not sharing it in this code. To run the code, you will need your own API key.

In [17]:
import openai

os.environ['OPENAI_API_KEY'] = # your API here

In [18]:
class openaiLLM:

    def __init__(self, name, client, model, prices=[0, 0]):
        self.name = name
        self.client = client
        self.model = model
        self.price_1kprompt, self.price_1kcompletion = prices

    def predict(self, x):
        
        start_time = time.time()
        output, cost = self.create_chat_completion(x)
        end_time = time.time()
        
        elapsed_time = end_time - start_time

        return output, elapsed_time, cost

    # UTILITIES

    def create_chat_completion(self, x):

        instructions = "Answer the user's question returning only the numerical value: do not provide any textual comments."
            
        question = f"QUESTION: {x['question']}"
        context = f"INFORMATION:\n'''\n{x['pre_text']}\n\nTABLE:\n{x['table']}\n\n{x['post_text']}\n'''"

        query = question + "\n\n" + context

        response = self.client.chat.completions.create(
            model=self.model,
            temperature=0,
            messages = [
                {
                    "role": "system",
                    "content": instructions
                },
                {
                    "role": "user", "content": query
                }
            ]
        )

        output = response.choices[0].message.content
        cost = (self.price_1kprompt * response.usage.prompt_tokens + self.price_1kcompletion * response.usage.completion_tokens)/1000

        return output, cost

### GPT-3.5

Let's start with the cheaper model, GPT-3.5.

In [19]:
client = openai.OpenAI()
model = "gpt-3.5-turbo-1106"
prices = [0.001, 0.002]

llm = openaiLLM("gpt-3.5", client, model, prices)
testing = TestingFramework(df_sample, llm)

testing.evaluate()

In [20]:
df_scores = testing.get_scores()
df_scores

Unnamed: 0,score,time,cost
count,100.0,100.0,100.0
mean,0.05,0.906318,0.000982
std,0.219043,0.357965,0.000388
min,0.0,0.408353,0.000261
25%,0.0,0.614724,0.000769
50%,0.0,0.819332,0.000965
75%,0.0,1.02474,0.001149
max,1.0,2.253693,0.002858


In [21]:
print("total time: ", df_scores["time"]["count"] * df_scores["time"]["mean"])
print("total cost: ", df_scores["cost"]["count"] * df_scores["cost"]["mean"])

total time:  90.631844997406
total cost:  0.098215


We see that only 5% of the questions are answered correctly. Let's see how the situation changes if we adopt a more advanced model.

### GPT-4

GPT-4 is currently the most powerful of OpenAI's models.

In [22]:
client = openai.OpenAI()
model = "gpt-4-1106-preview"
prices = [0.01, 0.03]

llm = openaiLLM("gpt-4", client, model, prices)
testing = TestingFramework(df_sample, llm)

testing.evaluate()

In [23]:
df_scores = testing.get_scores()
df_scores

Unnamed: 0,score,time,cost
count,100.0,100.0,100.0
mean,0.39,0.998892,0.009829
std,0.490207,0.430136,0.003875
min,0.0,0.488925,0.00264
25%,0.0,0.817771,0.007698
50%,0.0,0.908447,0.00973
75%,1.0,1.024689,0.011545
max,1.0,3.071766,0.02871


In [24]:
print("total time: ", df_scores["time"]["count"] * df_scores["time"]["mean"])
print("total cost: ", df_scores["cost"]["count"] * df_scores["cost"]["mean"])

total time:  99.88923597335815
total cost:  0.98286


As expected, the situation got much better, with 39% of questions answered correctly, though the price is also gone up.

Let's take a look at the row output:

In [25]:
testing.data["row_output"]

3599        341
10      $45,161
3410      1.25%
521        16.5
2679       77.8
         ...   
1310     -88.57
1711    202,002
3567    -10.01%
1431       5.3%
2867     178.43
Name: row_output, Length: 100, dtype: object

We see that now the instruction of returning only numbers is satisfied. Upon a deeper inspection, it turns out that most of the wrong cases are due to either incomplete computations or arbitrary (and wrong) approximations of the correct output. This is expected, as LLM are known to not be great at mathematical tasks. To improve on this, we can make use of a chain of prompts.

# OpenAI with Chain of Prompts

In [31]:
class openai_chainLLM:

    def __init__(self, name, client, model, prices=[0, 0]):
        self.name = name
        self.client = client
        self.model = model
        self.price_1kprompt, self.price_1kcompletion = prices

    def predict(self, x):
        
        start_time = time.time()
        output, cost = self.create_chat_completion(x)
        end_time = time.time()
        
        elapsed_time = end_time - start_time

        return output, elapsed_time, cost

    # UTILITIES

    def create_chat_completion(self, x):

        # first step

        instructions = "Answer the user's question using the information provided. Present a step-by-step reasoning."
        question = f"QUESTION: {x['question']}"
        context = f"INFORMATION:\n'''\n{x['pre_text']}\n\nTABLE:\n{x['table']}\n\n{x['post_text']}\n'''"
        query = question + "\n\n" + context
        output1, cost1 = self.api_call(instructions, query)
        
        # second step
        
        instructions = "Answer the user's question from the context provided. Return only the numerical value: no textual comments."    
        question = f"QUESTION: {x['question']}"
        context = f"CONTEXT:\n'''\n{output1}\n'''"
        query = question + "\n\n" + context
        output2, cost2 = self.api_call(instructions, query)

        cost = cost1 + cost2

        return output2, cost

    def api_call(self, instructions, query):

        response = self.client.chat.completions.create(
            model=self.model,
            temperature=0,
            messages = [
                {
                    "role": "system",
                    "content": instructions
                },
                {
                    "role": "user", "content": query
                }
            ]
        )

        output = response.choices[0].message.content
        cost = (self.price_1kprompt * response.usage.prompt_tokens + self.price_1kcompletion * response.usage.completion_tokens)/1000

        return output, cost

In [32]:
client = openai.OpenAI()
model = "gpt-4-1106-preview"
prices = [0.01, 0.03]

llm = openai_chainLLM("gpt-4", client, model, prices)
testing = TestingFramework(df_sample, llm)

testing.evaluate()

In [33]:
df_scores = testing.get_scores()
df_scores

Unnamed: 0,score,time,cost
count,100.0,100.0,100.0
mean,0.48,18.182599,0.020415
std,0.502117,8.070635,0.004419
min,0.0,7.024198,0.01061
25%,0.0,11.780622,0.017225
50%,0.0,16.213748,0.020255
75%,1.0,23.16421,0.022225
max,1.0,41.637337,0.04254


In [34]:
print("total time: ", df_scores["time"]["count"] * df_scores["time"]["mean"])
print("total cost: ", df_scores["cost"]["count"] * df_scores["cost"]["mean"])

total time:  1818.2598712444308
total cost:  2.0415


While the price and the time required for a single answer went up (as expected), we see an improvement in the percentage of correct answers, which is now 48%. This indicates that the chain-of-prompt approach is potentially a good way to go. Further improvment (in terms of lowering cost) could be achieved by implementing RAG as a pre-selection of the information to be passed in the context for the first prompt (e.g., we might find that, for a question, the table is enough and thus we might decide not present the `pre_text` and the `post_text` to the LLM). However, given that these texts are not excessively long, we believe that not much would be gained in this specific case.