# LLM Strict JSON Tabular Data Interpolator
- Created by John Tan Chong Min
- https://github.com/tanchongmin/strictjson
- 21 Aug 2023
- Exploring how to adapt the Strict JSON Framework to tabular data

In [1]:
import os
import openai
import json
import re

# API Keys
# Publish the key through the process environment, then hand the same value to the OpenAI client.
os.environ.update({'OPENAI_API_TOKEN': '<Your API key>'})
openai.api_key = os.environ.get('OPENAI_API_TOKEN')

# Strict Output Formatting
- Use when you want to force the function output to be in JSON format
- Helps a lot with minimizing unnecessary explanations of ChatGPT, and ensuring all output fields are there

In [2]:
def strict_output(system_prompt, user_prompt, output_format, default_category = "", output_value_only = False,
                  model = 'gpt-3.5-turbo', temperature = 0, num_tries = 2, verbose = False):
    '''Ask an OpenAI chat model for a reply and check that it matches the desired json format.

    Uses rule-based iterative feedback: when a reply fails to parse or fails a check, the
    reply and the error message are appended to the system prompt of the next attempt so
    the model can self-correct.

    Parameters:
        system_prompt: text placed at the start of the system message.
        user_prompt: the user message (converted with str()). If it is a list, the reply
            must be a list of json and a list is returned.
        output_format: dict mapping each output key to a description of its value.
            Keys containing < or > are dynamic and are not checked. If a value is a list,
            the field is treated as a classification over the elements of that list.
        default_category: for classification fields, replaces any answer that is not one
            of the listed choices. An empty value disables the replacement.
        output_value_only: if True, each json is replaced by the list of its values
            (or by the single value when there is only one).
        model, temperature: passed to openai.ChatCompletion.create.
        num_tries: maximum number of attempts.
        verbose: if True, print the prompts and the (quote-normalized) reply of each attempt.

    Returns:
        The checked json (or a list of them when user_prompt is a list).
        Returns an empty dict if no attempt produced a valid output.
    '''

    # if the user input is in a list, we also process the output as a list of json
    list_input = isinstance(user_prompt, list)
    format_text = str(output_format)
    # if the output format contains dynamic elements of < or >, then add to the prompt to handle dynamic elements
    dynamic_elements = '<' in format_text
    # if the output format contains list elements of [ or ], then we add to the prompt to handle lists
    list_output = '[' in format_text

    # The formatting instructions are identical for every attempt, so build them once.
    # Written as explicit literals: the backslash is escaped (a bare "\ " is an invalid
    # escape sequence) and the space before the line break is kept visible.
    output_format_prompt = (
        f"\nYou are to output the following in json format: {output_format}. \n"
        "Do not put quotation marks or escape character \\ in the output fields."
    )

    if list_output:
        output_format_prompt += "\nIf output field is a list, classify output into the best element of the list."

    # if output_format contains dynamic elements, process it accordingly
    if dynamic_elements:
        output_format_prompt += (
            "\nAny text enclosed by < and > indicates you must generate content to replace it. "
            "Example input: Go to <location>, Example output: Go to the garden"
            "\nAny output key containing < and > indicates you must generate the key name to replace it. "
            "Example input: {'<location>': 'description of location'}, "
            "Example output: {'school': 'a place for education'}"
        )

    # if input is in a list format, ask it to generate json in a list
    if list_input:
        output_format_prompt += "\nGenerate a list of json, one json for each input element."

    # start off with no error message
    error_msg = ''

    for _ in range(num_tries):
        system_content = system_prompt + output_format_prompt + error_msg

        # Use OpenAI to get a response
        response = openai.ChatCompletion.create(
            temperature = temperature,
            model = model,
            messages = [
                {"role": "system", "content": system_content},
                {"role": "user", "content": str(user_prompt)},
            ],
        )

        # json requires double quotes, so turn every single quote into a double quote...
        res = response['choices'][0]['message']['content'].replace('\'', '"')

        # ...then restore quotes sitting between two word characters, i.e. apostrophes in text
        res = re.sub(r"(\w)\"(\w)", r"\1'\2", res)

        if verbose:
            print('System prompt:', system_content)
            print('\nUser prompt:', str(user_prompt))
            print('\nGPT response:', res)

        # try-catch block to ensure output format is adhered to
        try:
            output = json.loads(res)
            if list_input:
                if not isinstance(output, list):
                    raise Exception("Output format not in a list of json")
            else:
                output = [output]

            # check for each element in the output list that the format is correctly adhered to
            for index, item in enumerate(output):
                for key, expected in output_format.items():
                    # unable to ensure accuracy of dynamic output header, so skip it
                    if '<' in key or '>' in key:
                        continue
                    # if output field missing, raise an error
                    if key not in item:
                        raise Exception(f"{key} not in json output")
                    # only classification fields (list of choices) need further processing
                    if not isinstance(expected, list):
                        continue

                    value = item[key]
                    # ensure output is not a list
                    if isinstance(value, list):
                        if not value:
                            # give the model an actionable message instead of an IndexError
                            raise Exception(f"{key} is an empty list")
                        value = value[0]
                    # output the default category (if any) if GPT is unable to identify the category
                    if value not in expected and default_category:
                        value = default_category
                    # if the output is a description format, get only the label;
                    # non-string answers (e.g. numeric categories) have no label to split
                    if isinstance(value, str) and ':' in value:
                        value = value.split(':')[0]
                    item[key] = value

                # if we just want the values for the outputs
                if output_value_only:
                    values = list(item.values())
                    # just output without the list if there is only one element
                    output[index] = values[0] if len(values) == 1 else values

            return output if list_input else output[0]

        except Exception as e:
            # feed the invalid reply and the reason back to the model on the next attempt
            error_msg = f"\n\nResult: {res}\n\nError message: {str(e)}"
            print("An exception occurred:", str(e))
            print("Current invalid json format:", res)

    return {}

## Overall Open-ended generation
- **system_prompt**: Describe the role you want GPT to take on, e.g. "You are a \<purpose in life\>"
- **user_prompt**: The user input. Later, when we use it as a function, this is the function input
- **output_format**: JSON format with the key as the output key, and the value as the output description
    - The output keys will be preserved exactly, while GPT will generate content to match the description of the value as best as possible

#### Example Usage
```python
res = strict_output(system_prompt = 'You are a classifier',
                    user_prompt = 'It is a beautiful day',
                    output_format = {"Sentiment": "Type of Sentiment",
                                    "Tense": "Type of Tense"})
                                    
print(res)
```

#### Example output
```{'Sentiment': 'Positive', 'Tense': 'Present'}```


## Generating more tabular data

In [9]:
import pandas as pd

# Starting table. The same data could instead be loaded with pd.read_csv('jobs_data.csv').
# Missing cells are written as the literal string 'NaN'.
start_info = [
    dict(zip(('Name', 'Gender', 'Occupation', 'Occupation Description', 'Salary'), row))
    for row in (
        ('John', 'Male', 'NaN', 'Works to develop Large Language Models', 'NaN'),
        ('NaN', 'Female', 'Nurse', 'NaN', '5000'),
        ('Penny', 'NaN', 'Founder', 'NaN', 'NaN'),
    )
]
df = pd.DataFrame(data=start_info)

# Display the DataFrame
df

Unnamed: 0,Name,Gender,Occupation,Occupation Description,Salary
0,John,Male,,Works to develop Large Language Models,
1,,Female,Nurse,,5000.0
2,Penny,,Founder,,


# Data Imputation
- Impute data

In [12]:
# Send the table to the model and request one output field, 'Filled Data',
# described as the same data with its missing values generated.
res = strict_output(
    system_prompt = (
        "\nYou are given rows of Tabular Data with missing values denoted as NaN. "
        "\nYou must not truncate the data.\n"
    ),
    user_prompt = df,
    output_format = {
        "Filled Data": "<generate missing values for the csv data in the same format as given>",
    },
)

print(res)
# Build a table from whatever was returned under 'Filled Data'
my_df = pd.DataFrame(data = res['Filled Data'])

{'Filled Data': [{'Name': 'John', 'Gender': 'Male', 'Occupation': 'Software Engineer', 'Occupation Description': 'Works to develop Large Language Models', 'Salary': '10000'}, {'Name': 'Emma', 'Gender': 'Female', 'Occupation': 'Nurse', 'Occupation Description': 'Provides medical care to patients', 'Salary': '5000'}, {'Name': 'Penny', 'Gender': 'Teacher', 'Occupation': 'Founder', 'Occupation Description': 'Starts and manages a company', 'Salary': '8000'}]}


In [13]:
# Display the DataFrame built from the 'Filled Data' field of the model output
my_df

Unnamed: 0,Name,Gender,Occupation,Occupation Description,Salary
0,John,Male,Software Engineer,Works to develop Large Language Models,10000
1,Emma,Female,Nurse,Provides medical care to patients,5000
2,Penny,Teacher,Founder,Starts and manages a company,8000


# Data Generation
- Generate a few more rows of data

In [20]:
# Send the table to the model and request one output field, 'Augmented Data',
# described as three more rows in the same format as the input.
res = strict_output(
    system_prompt = (
        "\nYou are given rows of Tabular Data with missing values denoted as NaN. "
        "\nYou are to generate three more rows of data. Do not repeat names."
        "\nYou must not truncate the data.\n"
    ),
    user_prompt = df,
    output_format = {
        "Augmented Data": "<generate three more rows for the csv data in the same format as given>",
    },
)

print(res)
# Build a table from whatever was returned under 'Augmented Data'
new_df = pd.DataFrame(data = res['Augmented Data'])

{'Augmented Data': [{'Name': 'Alice', 'Gender': 'Female', 'Occupation': 'Software Engineer', 'Occupation Description': 'Develops software applications', 'Salary': '8000'}, {'Name': 'Bob', 'Gender': 'Male', 'Occupation': 'Teacher', 'Occupation Description': 'Educates students', 'Salary': '6000'}, {'Name': 'Charlie', 'Gender': 'Male', 'Occupation': 'Doctor', 'Occupation Description': 'Provides medical care', 'Salary': '10000'}]}


In [21]:
# Display the DataFrame built from the 'Augmented Data' field of the model output
new_df

Unnamed: 0,Name,Gender,Occupation,Occupation Description,Salary
0,Alice,Female,Software Engineer,Develops software applications,8000
1,Bob,Male,Teacher,Educates students,6000
2,Charlie,Male,Doctor,Provides medical care,10000
