In [12]:
import pandas as pd
import streamlit as st
from langchain_community.llms import Ollama
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate
from langchain.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
)
from io import StringIO
import re
import numpy as np
import requests
from typing import Any, List, Mapping, Optional
from langchain_core.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain.prompts.example_selector import SemanticSimilarityExampleSelector
from langchain.embeddings import HuggingFaceEmbeddings
from chromadb.utils import embedding_functions
from chromadb import Documents, EmbeddingFunction, Embeddings
from typing import Optional, Sequence, Union, TypeVar, List, Dict, Any, Tuple, cast
from langchain.prompts.example_selector.ngram_overlap import NGramOverlapExampleSelector
from langchain.output_parsers.enum import EnumOutputParser
from enum import Enum
from gbnf_compiler import *
from random import sample 


#read in grammar file
# GBNF grammar text; CustomLLM._call sends it with every completion request.
grammar = ""
with open('osha_grammar.gbnf') as grammar_file:
    grammar = grammar_file.read()


#llm connector for Palmetto Cluster (llama.cpp ./server)
class CustomLLM(LLM):
    """LangChain LLM wrapper that forwards prompts to a llama.cpp HTTP server.

    Each call POSTs the prompt to ``endpoint`` together with the module-level
    ``grammar`` string and returns the ``content`` field of the JSON reply.

    Fields:
        n: value sent as ``n_predict`` in the request payload.
        endpoint: URL the request is POSTed to.
        temperature: value sent as ``temperature`` in the request payload.
        timeout: seconds passed to ``requests.post``; ``None`` (the default)
            waits indefinitely, matching the previous behaviour.
    """

    n: int
    # Previously hard-coded inside _call; defaults keep CustomLLM(n=...) working unchanged.
    endpoint: str = "http://localhost:8080/completion"
    temperature: float = 0.15
    timeout: Optional[float] = None

    @property
    def _llm_type(self) -> str:
        """Identifier string LangChain uses for this LLM type."""
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Send ``prompt`` to the server and return the generated text.

        Args:
            prompt: full prompt text to complete.
            stop: must be ``None``; stop sequences are rejected.
            run_manager: accepted for interface compatibility, not used.
            **kwargs: accepted for interface compatibility, not used.

        Returns:
            The ``content`` field of the JSON response, or ``""`` if absent.

        Raises:
            ValueError: if ``stop`` is provided.
            RuntimeError: if the server answers with a non-200 status.
        """
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")

        payload = {
            "prompt": prompt,
            "temperature": self.temperature,
            "cache_prompt": True,
            "n_predict": self.n,
            # Module-level grammar text read from osha_grammar.gbnf.
            "grammar": grammar,
        }
        response = requests.post(self.endpoint, json=payload, timeout=self.timeout)

        if response.status_code != 200:
            # RuntimeError is still caught by callers that handle Exception.
            raise RuntimeError(f"Error from llama.cpp server: {response.text}")
        return response.json().get("content", "")

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            "n": self.n,
            "endpoint": self.endpoint,
            "temperature": self.temperature,
        }
        
#create langchain llm instance
# n=128 is forwarded to the server as `n_predict` on every request (see CustomLLM._call).
llm = CustomLLM(n=128)


In [13]:
#prepare few-shot file
few_shot_examples = pd.read_csv('fixed.csv')
data_column = 'COMMENT1'
data_column_2 = 'COMMENT1'

# Explicit switch for few-shot prompting. The notebook previously built the
# examples and then unconditionally reset the list to [], i.e. it always ran
# zero-shot; False preserves that result without the wasted work.
USE_FEW_SHOT = False

# Every column except the comment text is treated as a label field.
headers = [col for col in few_shot_examples.columns if col != data_column_2]

# parse dataframe into examples using grammar format
# Each example pairs the comment text with one "field: value" line per header.
examples_list = []
if USE_FEW_SHOT:
    for _, row in few_shot_examples.iterrows():
        examples_list.append({
            'Comment': row[data_column],
            'Data': '\n'.join(f'{col}: {row[col]}' for col in headers),
        })

In [14]:
#new dataframe
# Load the OSHA file and keep only the 'Abstract Text' column as a one-column frame.
new_data = pd.read_csv('osha.csv')[['Abstract Text']]


#template config
# One few-shot turn: the comment as the user message, the labelled fields as the assistant reply.
qtemp = (
    "<|start_header_id|>user<|end_header_id|>Abstract Text: {Comment}\n<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>\n{Data}<|eot_id|>"
)

# Label field names followed by "Comment".
labels_s = headers + ["Comment"]

example_prompt = PromptTemplate(
    template=qtemp,
    input_variables=['Comment', 'Data'],
)

# System message describing the labelling task, the required output structure,
# and the allowed labels per field. It is passed as `prefix` to the
# FewShotPromptTemplate built right after this literal.
# The text is sent to the model verbatim, so it is deliberately left unedited here.
# NOTE(review): the header tokens look like the Llama-3 chat format — confirm
#   they match the model actually served by the llama.cpp endpoint.
# NOTE(review): the severity list mixes "1:"/"2:" with "3."/"4."/"5."; the task
#   text says to return 'unknown' when a field has no context, but Fault and Job
#   list no 'unknown' label; "identify name" and "occured" are typos. Confirm
#   against osha_grammar.gbnf before changing any wording.
sys_prompt = '''<|begin_of_text|><|start_header_id|>system<|end_header_id|>
# Task:
The events you are given are OSHA incident reports
Label the data based on the different fields.
If there is no context for the field, return 'unknown' for the field.
Because the comments are anonymized, [person_name] represents an identify name that has been anonymized.
If the comment is '(BLANK)', '(INAUDIBLE)' or '(unreadable)', just label 'unknown' for all categories unless you can extract meaningful statements.
If the comment appears incomplete or a fragment, just label 'unknown' for all categories unless you can extract meaningful statements.
Return only the label structure as follows:

# Output Structure:
Fatal: [Label]
Fault: [Label]
Job: [Label]
Incident Severity: [1-5]
Cause: [Word]

# Output Labels:
Fatal:
This field evaluates if the incident was fatal. Possible labels:
yes: The incident was fatal. Somebody died.
no: The incident was not fatal. Nobody died.
unknown: It is unclear or not recorded whether the incident was fatal or not.

Fault:
This field evaluates who was at fault for the incident. Possible labels:
employee: From the incident description, the employee was not taking correct safety measures or misusing equipment.
company: From the incident description, the company failed to ensure that adequate safety measures were taken.
other: It was not the company or employee's fault that the incident occured.

Job:
The type of job that the incident refers to. Possible Labels:
agriculture
construction
manufacturing
retail
other: If Unknown

Incident Severity:
Return a number from 1 to 5 depending on how severe the incident was.
1: Worst
2: Severe
3. Bad
4. Average
5. Not Severe
unknown: Default, if a score cannot be inferred from the event.

Cause:
This field is one word, max 13 characters. Return the cause of the incident, or 'unknown' if it is not recorded.

<|eot_id|>
'''

# Full prompt = system prefix + few-shot examples (possibly none) + the slot
# for the incident text that is filled in per row.
prompt = FewShotPromptTemplate(
    prefix=sys_prompt,
    examples=examples_list,
    example_prompt=example_prompt,
    suffix='<|start_header_id|>user<|end_header_id|>Incident: {comment}\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n',
    input_variables=["comment"],
    partial_variables={},
)

# Render once with a placeholder comment and write the result to disk.
sys_prompt = prompt.format(comment='osha')
with open("prompt_osha.txt", "w") as prompt_file:
    prompt_file.write(sys_prompt)

In [15]:
# Function to parse the returned grammar
def parse_data(output):
    """Turn "field: label" lines of model output into a one-row DataFrame.

    Lines without a ``': '`` separator are ignored. Each remaining line is
    split at the first ``': '``; the left part becomes a column name and the
    right part its single value. Surrounding whitespace (including a stray
    ``'\\r'`` from CRLF line endings) is stripped from both parts so it does
    not leak into column names or labels. If a field appears more than once,
    the last occurrence wins.

    Args:
        output: raw text returned by the model.

    Returns:
        A DataFrame with one column per parsed field and a single row, or an
        empty DataFrame when no line contains ``': '``.
    """
    data = {}

    for line in output.strip().split('\n'):
        if ': ' not in line:
            continue
        field, label = line.split(': ', 1)
        # Wrap in a list so the dict maps directly onto a single-row DataFrame.
        data[field.strip()] = [label.strip()]

    return pd.DataFrame(data)

In [16]:
# Empty frame carrying new_data's columns; parsed rows are appended to it below.
new_d = pd.DataFrame().reindex_like(new_data)
new_d = new_d.head(0)


# Perform Analysis
batch_size = 1  # Adjust batch size according to your requirements and limitations
batches = [new_data[i:i + batch_size]
            for i in range(0, new_data.shape[0], batch_size)]

# Parsed rows are collected here and concatenated once, instead of growing
# new_d with a concat per row.
parsed_rows = []
try:
    for batch in batches:
        # Build the prompts and query the model. Prompt construction is inside
        # the try so a non-string cell (e.g. NaN) skips this batch instead of
        # aborting the whole run.
        try:
            batch_comments = batch['Abstract Text'].tolist()
            batch_prompts = [prompt.format(comment=comment.strip())
                             for comment in batch_comments]
            batch_results = llm.batch(batch_prompts)
        except Exception as e:
            print(f'Error processing batch: {e}')
            continue

        # Pair every result with its own comment; indexing batch_comments[0]
        # was only correct while batch_size == 1.
        for index, comment, result in zip(batch.index, batch_comments, batch_results):
            try:
                new_row = parse_data(result)
                print(f"row {index}:")
                print("Incident Report: " + comment)
                print(result)
                print("")
                # NOTE(review): the text is stored under `data_column` (set in
                # the few-shot cell), not 'Abstract Text', so new_d's
                # 'Abstract Text' column stays empty — confirm this is intended.
                new_row[data_column] = comment
                parsed_rows.append(new_row)
            except Exception as e:
                print(f'Error analyzing row {index}: {e}')
finally:
    # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
    # so rows parsed so far are kept in new_d either way.
    if parsed_rows:
        new_d = pd.concat([new_d, *parsed_rows], ignore_index=True)

row 0:
Incident Report: At 9:00 a.m. on August 10, 2017, an employee was operating a 400 ton Bliss Coin "Knuckle" mechanical power press. The press was actuated while the employee's right hand was in the point of operation. The employee's right ring and middle fingers were amputated. Coin "Knuckle" mechanical power press. The press was actuated while the employee's right hand was in the point of operation. The employee's right ring and middle fingers were amputated. 
Fatal: yes
Fault: employee
Job: manufacturing
Incident Severity: 5
Cause: humanerror

row 1:
Incident Report: At 9:45 a.m. on July 17, 2017, an employee was using a battery operated drill to drill a hole on a wooden joist about 109 inches above the floor. The employee's right hand slipped from a drill grip, and the tip of his right hand glove got caught in the drill head. The entanglement caused amputation of the right ring finger. to drill a hole on a wooden joist about 109 inches above the floor. The employee's right han

KeyboardInterrupt: 

In [None]:
# DataFrame.replace returns a new frame rather than modifying new_d, so the
# result must be kept for the replacement to reach the CSV.
new_d = new_d.replace('', 'Unknown')
new_d.to_csv('sample_osha.csv')