In [1]:
import torch
import pdfplumber
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from IPython.display import clear_output

# Index of the currently selected CUDA device. It is passed as `device_map`
# when loading the model and as the target of `.to(device)` for the inputs.
device = torch.cuda.current_device()
# Checkpoint identifier handed to both `from_pretrained` calls below.
name = "mistralai/Mistral-7B-Instruct-v0.1"

# Load the causal language model with `load_in_4bit=True`, placed on `device`.
# NOTE(review): presumably needs a CUDA GPU and the bitsandbytes package — confirm.
model = AutoModelForCausalLM.from_pretrained(name, load_in_4bit=True, device_map=device)
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Input document. Swap the two assignments to process the other PDF.
#doc_path = "data/safety statement.pdf"
doc_path = "data/Personal Data Breach Management Process.pdf"
# File name without directory and without its final extension; used later to
# name the output file. rsplit keeps any additional dots inside the name.
doc_name = doc_path.rsplit("/", 1)[-1].rsplit(".", 1)[0]

# Collect the text of every page, then join once at the end.
page_texts = []
with pdfplumber.open(doc_path) as pdf:
    for page in pdf.pages:
        # A page without a text layer can yield no text (None in some
        # pdfplumber versions); treat it as empty instead of crashing.
        text = page.extract_text() or ""
        # Drop the literal "Safety Statement" wherever it occurs in the page text.
        # NOTE(review): presumably a repeated page header of the other PDF — confirm.
        text = text.replace("Safety Statement", "")
        page_texts.append(text)

# Join pages with a newline so the last word of one page is not fused with
# the first word of the next when the text is later split on whitespace.
doc = "\n".join(page_texts)
# Text to process. Slice `doc` here (e.g. doc[:5000]) to iterate on a sample.
samp_text = doc
word_split = samp_text.split()

# Chunking parameters, both counted in whitespace-separated words.
chunk_size = 150
overlap = 0

# Distance between the starts of two consecutive chunks.
step = chunk_size - overlap
if step <= 0:
    raise ValueError("overlap must be smaller than chunk_size")

# Number of chunks. The last chunk absorbs any remainder, so it may hold more
# than chunk_size words. A non-empty document shorter than chunk_size still
# yields exactly one chunk; the plain formula would give 0 (or a negative
# count) there and silently drop the whole document.
if word_split:
    n_chunks = max(1, (len(word_split) - chunk_size) // step + 1)
else:
    n_chunks = 0

lstd_text = []
for i in range(n_chunks):
    start = i * step
    # Every chunk but the last is exactly chunk_size words long.
    end = start + chunk_size if i < n_chunks - 1 else len(word_split)
    lstd_text.append(" ".join(word_split[start:end]))

In [7]:
# Run every text chunk through the model and collect the corrected versions.
fxd_text = []
# Marker that ends the prompt; the model may echo it at the start of its reply.
text_cue = "Corrected text:"
tplt_prompt = """I need you to fix the grammatical errors and properly separate words that are stitched together within a text. Only reply with "Corrected text:" """

for item in tqdm.tqdm(lstd_text, desc="Processing text"):

    prompt = f"""{tplt_prompt}. Original text: "{item}". Corrected text:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=chunk_size*4)

    # The generated sequence starts with the prompt tokens. Decode only what
    # comes after them rather than searching the decoded text for the cue,
    # which breaks if the cue is not reproduced exactly or appears in `item`.
    prompt_len = inputs["input_ids"].shape[1]
    output_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    corrected_text = output_text.strip()
    # Remove the cue if the model repeated it at the start of its answer.
    if corrected_text.startswith(text_cue):
        corrected_text = corrected_text[len(text_cue):].strip()

    clear_output(wait=True)     # Clear previous output
    print("Corrected Text:", corrected_text)  # Debug print
    fxd_text.append(corrected_text)

# Write one corrected chunk per entry, each followed by a newline. The explicit
# encoding keeps non-ASCII characters independent of the platform default.
# NOTE(review): entries may themselves contain commas and newlines, so despite
# the .csv extension this is not strictly parseable as CSV — confirm consumers.
with open(f"data/{doc_name}.csv", "w", encoding="utf-8") as f:
    for item in fxd_text:
        f.write("%s\n" % item)

Processing text: 100%|██████████| 21/21 [26:49<00:00, 76.65s/it]

Corrected Text: Inhalation of chemicals, cleaning chemicals, and spills should be immediately cleaned up. Chemicals, including handsanitiser 2 3, medium (flammable), and stored in a suitable location. All hazardous substances should be separated and stored in accordance with the recommendation of the SDS. Appropriate PPE should be worn as per the cleaning agent label. References Safety, Health and Welfare at Work Act 2005 as amended. Safety, Health and Welfare at Work (General Application) Regulations 2007 and amendments, Safety, Health and Welfare at Work (Chemical Agents) Regulations 2001-2021 and associated Code of Practice 2021, Safety, Health and Welfare at Work (Biological Agents) Regulations 2013 and 2020 and associated Code of Practice 2020.

Area of Risk Assessment: Remote Working

Persons affected: All staff, contractors, and visitors

Ref. Activity/Hazard Risk Controls

Likelihood Severity Risk No. Rating

15 Remote working unsafework systems

Employees’ remote workstations 


