In [1]:
import torch
import pdfplumber
import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from IPython.display import clear_output

device = torch.cuda.current_device()
name = "mistralai/Mistral-7B-Instruct-v0.1"

model = AutoModelForCausalLM.from_pretrained(name, load_in_4bit=True, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
#Read pdf and extract the name
doc_path = "pdf_folder/The_advantage_of_short_paper_titles.pdf"
doc_name = doc_path.split("/")[-1].split(".")[0]
doc = ""

with pdfplumber.open(doc_path) as pdf:

    for page in pdf.pages:
        text = page.extract_text()
        text = text.replace("Safety Statement", "")
        doc += text

samp_text = doc
word_split = samp_text.split()
chunk = ""

""" 
This section breaks down the text into chunks that can be taken by an LLM (context_window).
You can include an overlap between chunks, to make sure the LLM has the entire context before 
making a prediction and fixing the text
"""

chunk_size = 150
overlap = 0
n_chunks = ((len(word_split) - chunk_size) // (chunk_size - overlap)) + 1
lstd_text = []

for i in range(n_chunks):
    start = i * (chunk_size - overlap)
    end = start + chunk_size if i < n_chunks - 1 else len(word_split)
    chunk = " ".join(word_split[start:end])
    lstd_text.append(chunk)
    chunk = ""

In [3]:
fxd_text = []
text_cue = "Corrected text:"
tplt_prompt = f"""I need you to fix the grammatical errors and properly separate words that are stitched together within a text. Only reply with "Corrected text:" """

for item in tqdm.tqdm(lstd_text, desc="Processing text"):
    
    prompt = f"""{tplt_prompt}. Original text: "{item}". Corrected text:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=chunk_size*4)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    first_time_idx = output_text.find(text_cue)
    sec_time_idx = output_text.find(text_cue, first_time_idx + len(text_cue))
    
    clear_output(wait=True)     # Clear previous output
    corrected_text = output_text[sec_time_idx + len(text_cue):].strip()
    print("Corrected Text:", corrected_text)  # Debug print
    fxd_text.append(corrected_text)
    
with open(f"reviewed_pdfs/{doc_name}.csv", "w") as f:
    for item in fxd_text:
        f.write("%s\n" % item)

Processing text:  88%|████████▊ | 7/8 [07:08<01:04, 64.51s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Corrected Text: "To receive more citations per paper (for years 2007–2011: all ts <−4.215, all ps <0.001; 2012–2013: both ts <−2.022, both ps <0.05; t-test of slope L with FDR correction). The values of the slope L are given for all years in Table 1. 3. Discussion In this study, we investigate whether the length of a scientific paper's title is related to the number of citations it receives. We analyzed the 20,000 most highly cited papers for the years 2007–2013, representing a sample size between 1.12% and 1.53% of all papers published in each of these years. Previous studies analyzing much smaller sets of papers have reported conflicting evidence, suggesting either that the relationship is positive (60%) or negative (40%). Mean quantile title length (%) Figure 3. Journals that publish papers with shorter titles receive more citations per paper. For each year in our dataset, we ranked all of the papers in terms of the number of citations received and in terms of the length of the titl

Processing text:  88%|████████▊ | 7/8 [07:09<01:01, 61.38s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.80 GiB (GPU 0; 8.00 GiB total capacity; 5.77 GiB already allocated; 0 bytes free; 6.01 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF