Run simple inference on a single PDF file

In [None]:
# Subject whose reference distribution the PDF text is compared against.
name = "CS"
# Path of the PDF file to analyse; it is passed to PdfReader further down.
pdf_path = ""

# PDF data preparation

In [None]:
# Install spaCy's large English model (en_core_web_lg); it is loaded with
# spacy.load("en_core_web_lg") in the following cell.
! python -m spacy download en_core_web_lg

In [None]:
import spacy
import re
# Load the spaCy pipeline once at module level; tokenize() below reuses it
# to split the input text into sentences.
nlp = spacy.load("en_core_web_lg")
def tokenize(text):
    """
    Split text into sentences and return the lowercase, non-numeric words of each one.

    Sentence boundaries come from the module-level spaCy pipeline ``nlp``. Words are
    extracted from the lowercased sentence text with a word-boundary regex.

    Parameters:
    text (str): A string containing one or more sentences.

    Returns:
    list: A list of lists, one inner list per sentence, holding that sentence's
          lowercase words with purely numeric strings dropped. Sentences that
          end up with no words are left out.
    """
    # Newlines are flattened to spaces first: the source text can contain line
    # breaks in the middle of a sentence, e.g.
    # "... a strong capability for\nconditioning image generation, ..."
    flattened = text.replace('\n', ' ')
    parsed = nlp(flattened)

    tokenized = []
    for sentence in parsed.sents:
        # Keep every word-like token except those made up only of digits
        kept = [
            token
            for token in re.findall(r'\b\w+\b', sentence.text.lower())
            if not token.isdigit()
        ]
        # Skip sentences that have nothing left after filtering
        if kept:
            tokenized.append(kept)
    return tokenized

In [None]:
from pypdf import PdfReader

# Open the PDF at pdf_path and concatenate the extracted text of every page.
reader = PdfReader(pdf_path)
number_of_pages = len(reader.pages)
# Iterate over the pages directly and join once, rather than indexing by
# position and growing the string with repeated +=. Each page's text is
# followed by a newline, so the resulting string matches the page-by-page
# concatenation exactly.
text = ''.join(page.extract_text() + '\n' for page in reader.pages)

# Preview the first 1000 characters as a quick check of the extraction.
print(text[:1000])

In [None]:
# Turn the extracted PDF text into a list of sentences, each a list of
# lowercase, non-numeric words, and print it for inspection.
tokenized_sentences = tokenize(text)
print(tokenized_sentences)

In [None]:
# Write the tokenized sentences to a parquet file with a single column,
# 'inference_sentence' (one row per sentence, no index column).
import pandas as pd

df = pd.DataFrame(data={'inference_sentence': tokenized_sentences})
df.to_parquet(path='tokenized_sentences.parquet', index=False)
print(df)

# Inference

In [None]:
from src.estimation import estimate_text_distribution
from src.MLE import MLE
import os


In [None]:
# Subjects accepted by this notebook; training data for the chosen one is
# read from data/training_data/<subject>/.
subjects = ["CS", "EESS", "Math", "Phys", "Stat"]
# Validate with an explicit exception rather than `assert`: assert statements
# are stripped when Python runs with -O, which would let an unsupported
# subject slip through to the file lookups below.
if name not in subjects:
    raise ValueError("Data not available for source subject")

# Estimate the distribution only when no file exists yet for this subject.
distribution_path = f"distribution/{name}.parquet"
if not os.path.exists(distribution_path):
    # The output path lives inside distribution/; create the directory first
    # in case estimate_text_distribution does not create it itself.
    os.makedirs("distribution", exist_ok=True)
    estimate_text_distribution(
        f"data/training_data/{name}/human_data.parquet",
        f"data/training_data/{name}/ai_data.parquet",
        distribution_path,
    )

In [None]:
# Instantiate MLE with the chosen subject's distribution file, then run
# inference on the tokenized sentences stored in 'tokenized_sentences.parquet'
# (written by the data-preparation section of this notebook).
model = MLE(f"distribution/{name}.parquet")
model.inference("tokenized_sentences.parquet")