In [18]:
# import necessary libraries
import os
import logging
from pathlib import Path
from time import sleep
from typing import List

import numpy as np
import pandas as pd
import openai
from sklearn.metrics import accuracy_score

from capstone.config import CapstoneConfig
from capstone.data_access import DataClass

# Log format: timestamp, logger name, level, then the message; INFO and above.
FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)

# Absolute path of the directory one level above the current working directory.
PARENT_PATH = Path.cwd().parent.absolute()

# The API key is taken from the environment so it never appears in the notebook;
# the value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
# Build the project configuration and point its CURRENT_PATH at PARENT_PATH
# (the parent of the working directory, computed in the setup cell).
config = CapstoneConfig()
config.CURRENT_PATH = PARENT_PATH

# Data-access helper built from that configuration; a later cell reads
# `data.data_path` from it to locate the test CSV.
# NOTE(review): presumably DataClass derives its paths from config.CURRENT_PATH,
# so the assignment above has to come first - confirm in capstone.data_access.
data = DataClass(config)

In [3]:
# List the fine-tune jobs visible to this API key. The output shows, per job, its
# "status" and its "fine_tuned_model" name (null for the cancelled job).
openai.FineTune.list()

2022-11-02 19:13:19,939 - openai - INFO - message='Request to OpenAI API' method=get path=https://api.openai.com/v1/fine-tunes
2022-11-02 19:13:20,245 - openai - INFO - message='OpenAI API response' path=https://api.openai.com/v1/fine-tunes processing_ms=60 request_id=e12d9051ca4aff46347b012bbe55010f response_code=200


<OpenAIObject list at 0x11321cae0> JSON: {
  "data": [
    {
      "created_at": 1667170020,
      "fine_tuned_model": null,
      "hyperparams": {
        "batch_size": 1,
        "learning_rate_multiplier": 0.1,
        "n_epochs": 4,
        "prompt_loss_weight": 0.01
      },
      "id": "ft-BrsXzlDOnXIlOKBmEPaTAOll",
      "model": "curie",
      "object": "fine-tune",
      "organization_id": "org-aMHEQTPYWIZZ6Kcb2h4CPN3I",
      "result_files": [],
      "status": "cancelled",
      "training_files": [
        {
          "bytes": 371730,
          "created_at": 1667167174,
          "filename": "jnj_hygiene_ft_training_data_prepared.jsonl",
          "id": "file-MY2MYiaLnqqbysIfzO7dOopR",
          "object": "file",
          "purpose": "fine-tune",
          "status": "processed",
          "status_details": null
        }
      ],
      "updated_at": 1667170251,
      "validation_files": []
    },
    {
      "created_at": 1667173177,
      "fine_tuned_model": "davinci:ft-col

In [12]:
# Useful constants
# Name of the fine-tuned model, passed as `model=` to the completion requests.
MODEL = "davinci:ft-columbia-capstone-jnj:davinci-hygiene-ft-2022-10-31-00-51-43"
# Column holding the passage text that is sent to the model.
TEXT = "text"
# Column holding the reference label (used as y_true in accuracy_score).
TARGET = "completion"
# Column holding the label predicted by the model.
PREDICTED = "category"
# Column holding the probability of the predicted label.
PROBAB = "probability"

In [5]:
# Load the test set (EudraLex chapter 4 CSV) from the project's data directory
# and preview the first two rows.
test_set_path = os.path.join(data.data_path, "eudralex_chapter4_documentation.csv")
df1 = pd.read_csv(test_set_path)
df1.head(2)

Unnamed: 0,Level 1 - Header,Level 1 - Paragraph,Level 2 - Header,Level 2 - Body,Level 3 - Header,Level 3 - Body,Level 4 - Header,Level 4 - Body,text,prompt,completion
0,Chapter 4: Documentation,Legal basis for publishing the detailed guidel...,Generation and Control of Documentation,,4.1,All types of document should be defined and ad...,,,All types of document should be defined and ad...,All types of document should be defined and ad...,other
1,Chapter 4: Documentation,Legal basis for publishing the detailed guidel...,Generation and Control of Documentation,,4.2,"Documents should be designed, prepared, review...",,,"Documents should be designed, prepared, review...","Documents should be designed, prepared, review...",other


In [8]:
# Classify one passage with the fine-tuned model named by MODEL.
def get_category(text):
    """Return the category token the model generates for ``text``.

    The passage is wrapped in a "Text: ... Category:" prompt and a single token
    is requested greedily (temperature 0). The function waits five seconds
    before every request.

    Args:
        text: Passage to classify.

    Returns:
        The ``"text"`` field of the first choice in the API response.
    """
    sleep(5)  # throttle to stay under the rate limit

    request = dict(
        prompt=f"Text: {text}\nCategory:",
        temperature=0,
        max_tokens=1,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        # "RegulatoryStandardsClassifier2" is the fine-tuned engine in the internal
        # Azure OpenAI; use text-davinci-002 on the free tier.
        model=MODEL,
        logprobs=2,
        # logit_bias={"584":100, "33306":100} was left disabled.
    )
    completion = openai.Completion.create(**request)
    first_choice = completion["choices"][0]
    return first_choice["text"]

# Probability the fine-tuned model assigns to the category it predicts.
def get_prob(text):
    """Return the probability of the category token generated for ``text``.

    One completion request is sent (one token, ``logprobs=2``). The generated
    token and its log-probability are both read from that single response, so
    the probability always belongs to the prediction it is reported with and
    no second, throttled API call is spent just to look up the token.

    Args:
        text: Passage to classify; wrapped in a "Text: ... Category:" prompt.

    Returns:
        ``np.exp`` of the log-probability of the generated token.

    Raises:
        KeyError: If the generated token is missing from the returned
            ``top_logprobs`` entry for the first position.
    """
    sleep(5)  # throttle to stay under the rate limit
    response = openai.Completion.create(
        prompt=f"Text: {text}\nCategory:",
        temperature=0,
        max_tokens=1,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        # "RegulatoryStandardsClassifier2" is the fine-tuned engine in the internal
        # Azure OpenAI; use text-davinci-002 on the free tier.
        model=MODEL,
        logprobs=2,
    )
    choice = response["choices"][0]
    token = choice["text"]
    # top_logprobs[0] maps candidate tokens for the first (only) generated
    # position to their log-probabilities.
    first_position_logprobs = choice["logprobs"]["top_logprobs"][0]
    return np.exp(first_position_logprobs[token])

In [9]:
# Predict the category, and the probability of that category, for every row of
# the test set. Each apply() issues one throttled API request per row.
# Column names come from the constants cell (TEXT, PREDICTED, PROBAB) so these
# writes stay in sync with the evaluation cells that read df1[PREDICTED] and
# df1[PROBAB]; the constants cell must therefore run before this one.
df1[PREDICTED] = df1[TEXT].apply(get_category)
df1[PROBAB] = df1[TEXT].apply(get_prob)

2022-11-02 19:18:21,158 - openai - INFO - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/completions
2022-11-02 19:18:21,482 - openai - INFO - message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=127 request_id=87d25cfd641e4bcdbed158c833b2a412 response_code=200
2022-11-02 19:18:26,487 - openai - INFO - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/completions
2022-11-02 19:18:26,625 - openai - INFO - message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=85 request_id=3fbce03b2ea04d6838fdaf1a02dd0384 response_code=200
2022-11-02 19:18:31,630 - openai - INFO - message='Request to OpenAI API' method=post path=https://api.openai.com/v1/completions
2022-11-02 19:18:31,756 - openai - INFO - message='OpenAI API response' path=https://api.openai.com/v1/completions processing_ms=66 request_id=426779edde9c361d7a58904d363afbdd response_code=200
2022-11-02 19:18:36,762 - open

In [10]:
# Show the prompts at positions 25-34 in full, without the DataFrame truncation.
df1['prompt'].iloc[25:35].tolist()

['There should be written procedures for testing materials and products at different stages of manufacture, describing the methods and equipment to be used. The tests performed should be recorded.',
 'Written release and rejection procedures should be available for materials and products, and in particular for the certification for sale of the finished product by the Qualified Person(s). All records should be available to the Qualified Person. A system should be in place to indicate special observations and any changes to critical data.',
 'Records should be maintained for the distribution of each batch of a product in order to facilitate recall of any batch, if necessary.',
 'There should be written policies, procedures, protocols, reports and the associated records of actions taken or conclusions reached, where appropriate, for the following examples:/nValidation and qualification of processes, equipment and systems;/nEquipment assembly and calibration;/nTechnology transfer;/nMainten

In [16]:
# Preview the frame again: it now carries the `category` and `probability`
# columns written by the prediction cell.
df1.head(2)

Unnamed: 0,Level 1 - Header,Level 1 - Paragraph,Level 2 - Header,Level 2 - Body,Level 3 - Header,Level 3 - Body,Level 4 - Header,Level 4 - Body,text,prompt,completion,category,probability
0,Chapter 4: Documentation,Legal basis for publishing the detailed guidel...,Generation and Control of Documentation,,4.1,All types of document should be defined and ad...,,,All types of document should be defined and ad...,All types of document should be defined and ad...,other,other,0.999834
1,Chapter 4: Documentation,Legal basis for publishing the detailed guidel...,Generation and Control of Documentation,,4.2,"Documents should be designed, prepared, review...",,,"Documents should be designed, prepared, review...","Documents should be designed, prepared, review...",other,other,0.999968


In [13]:
# Distribution of the reference labels. Note the label strings carry a leading
# space (" other", " hygiene").
df1[TARGET].value_counts()

 other      31
 hygiene     1
Name: completion, dtype: int64

In [14]:
# Distribution of the predicted labels, for comparison with the reference
# distribution above.
df1[PREDICTED].value_counts()

 other      31
 hygiene     1
Name: category, dtype: int64

In [17]:
# Fraction of rows where the predicted label equals the reference label.
# Caveat: this test set is heavily imbalanced (31 " other" vs 1 " hygiene"), so
# accuracy alone says little about the minority class.
accuracy_score(df1[TARGET], df1[PREDICTED])

1.0

In [20]:
# Inspect the rows whose reference label is " hygiene" (labels keep their
# leading space), showing text, reference label, prediction and probability.
is_hygiene = df1[TARGET] == " hygiene"
df1.loc[is_hygiene, [TEXT, TARGET, PREDICTED, PROBAB]]

Unnamed: 0,text,completion,category,probability
28,"There should be written policies, procedures, ...",hygiene,hygiene,0.688206
