In [1]:
import os

import pdfplumber
import torch
import tqdm
from IPython.display import clear_output
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint identifier and the index of the currently selected CUDA device.
name = "mistralai/Mistral-7B-Instruct-v0.1"
device = torch.cuda.current_device()

# Load the model with 4-bit loading enabled, placed on that device, and then
# the tokenizer that belongs to the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    name,
    load_in_4bit=True,
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Read the PDF and derive the document name (file name without its extension).
doc_path = "pdf_folder/The_advantage_of_short_paper_titles.pdf"
# rsplit strips only the final extension, so dots inside the file name survive.
doc_name = doc_path.split("/")[-1].rsplit(".", 1)[0]

page_texts = []
with pdfplumber.open(doc_path) as pdf:
    for page in pdf.pages:
        # Guard against pages that yield no text; depending on the pdfplumber
        # version extract_text() may return None rather than "" for them.
        text = page.extract_text() or ""
        text = text.replace("Safety Statement", "")
        page_texts.append(text)

# Join pages with a newline so the last word of one page is not fused with the
# first word of the next (the later whitespace split treats "\n" as a separator).
doc = "\n".join(page_texts)

# Break the text into word chunks small enough for the LLM's context window.
# An optional overlap repeats the last `overlap` words of a chunk at the start
# of the next one, so the model sees some surrounding context before fixing
# the text.
samp_text = doc
word_split = samp_text.split()

chunk_size = 100
overlap = 0
if not 0 <= overlap < chunk_size:
    # A stride of zero or less would divide by zero or never advance.
    raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

stride = chunk_size - overlap

# The trailing remainder is folded into the final chunk, so that chunk may be
# longer than chunk_size. max(..., 1) ensures a document shorter than
# chunk_size still produces one chunk instead of being dropped entirely.
if word_split:
    n_chunks = max((len(word_split) - chunk_size) // stride + 1, 1)
else:
    n_chunks = 0

lstd_text = []
for i in range(n_chunks):
    start = i * stride
    end = start + chunk_size if i < n_chunks - 1 else len(word_split)
    lstd_text.append(" ".join(word_split[start:end]))

In [3]:
# Ask the model to correct every chunk and collect the answers in order.
fxd_text = []
text_cue = "Corrected text:"
tplt_prompt = """I need you to fix the grammatical errors and properly separate words that are stitched together within a text. Only reply with "Corrected text:" """

for item in tqdm.tqdm(lstd_text, desc="Processing text"):

    prompt = f"""{tplt_prompt}. Original text: "{item}". Corrected text:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # NOTE(review): the final chunk can hold more than chunk_size words, so this
    # token budget may truncate long answers — confirm it is large enough.
    outputs = model.generate(**inputs, max_new_tokens=chunk_size*4)

    # For a causal LM, generate() returns the prompt tokens followed by the
    # continuation. Decode only the continuation instead of searching the full
    # decoded string for the cue, which breaks when the cue is not found or
    # when the chunk itself contains it.
    prompt_len = inputs["input_ids"].shape[1]
    output_text = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    corrected_text = output_text.strip()

    # Drop the cue in case the model echoes it at the start of its answer.
    if corrected_text.startswith(text_cue):
        corrected_text = corrected_text[len(text_cue):].strip()

    clear_output(wait=True)     # Clear previous output
    print("Corrected Text:", corrected_text)  # Debug print
    fxd_text.append(corrected_text)
    
# Persist the corrected chunks, one per line.
# NOTE(review): the file carries a .csv extension but the text is written
# unquoted, so commas or newlines inside a chunk are not escaped.
out_dir = "reviewed_pdfs"
# Create the output folder up front so a missing directory does not fail the
# write after the long generation run.
os.makedirs(out_dir, exist_ok=True)

# Explicit UTF-8: the text contains non-ASCII characters, and the platform
# default encoding may not be able to represent them.
with open(f"{out_dir}/{doc_name}.csv", "w", encoding="utf-8") as f:
    for item in fxd_text:
        f.write("%s\n" % item)

Processing text: 100%|██████████| 13/13 [08:49<00:00, 40.70s/it]

Corrected Text: Gonçalves, Perra, Vespignani. 2011 Modeling allometric scaling and the decreasing need for new users’ activity on Twitter networks: validation of words. Science Rep. 2, 943. (doi: 10.1038/srep00943) 35. Yogatama, Heilman, O’Connor, Dyer, Dunbar’s number. PLOS ONE 6, e22656. (doi: 10.1371/journal.pone.0022656) 36. Sakaki, Okazaki, Matsuo. 2010 Earthquake routing Twitter users: real-time event detection by community’s response to an article. In Proc. EMNLP, 9. (doi: 10.1145/22.0001) 37. Mocanu, Baronchelli, Perra, Gonçalves. 2010 Social sensors in Twitter networks. In WWW’10, 26–30 April 2010, 27–31 July 2011, Edinburgh, UK., pp. 594–604. (doi: 10.1109/WWW.2010.555895) 38. Zhang, Vespignani. 2013 The Twitter of Babel: Raleigh, NC., pp. 851–860. New York, NY: ACM. Stroudsburg, PA: Association for Computational Linguistics. 23. Preis, Moat, Bishop, Tre



