In [1]:
import os
import fitz
import pathlib
import sys
import pandas as pd
from tqdm import tqdm
import numpy as np

from pdf_processing import *
from llm_callbacks import anthropic_callback, openai_callback
from parser_1 import SimpleTextRetreiver, PDFLLMParser

from dotenv import load_dotenv
from pathlib import Path

In [2]:
# !pip3 freeze > requirements.txt

In [3]:
prompt_path = "./prompts/complexity_prompt.txt"
directory = "./data/"
write_path = './output/processed_output/'
visualize_path = './output/visualizations/'

In [4]:
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

os.environ["openai_api_key"] = openai_api_key
os.environ["anthropic_api_key"] = "..."


In [5]:
if openai_api_key is not None:
    print("OpenAI API key is set.")
    # Print a masked version of the API key
    masked_key = openai_api_key[:4] + "*" * len(openai_api_key[4:-4]) + openai_api_key[-4:]
    print("Masked Key:", masked_key)
else:
    print("OpenAI API key is not set.")

OpenAI API key is set.
Masked Key: sk-S*******************************************By5n


In [6]:
import os
prompt = pathlib.Path(prompt_path).read_text()

anthropic_func = anthropic_callback(prompt, model_version='claude-3-sonnet-20240229')
# anthropic_func = anthropic_callback(prompt, model_version='claude-3-haiku-20240307')

# openai_func = openai_callback(prompt, model_version='gpt-3.5-turbo-0125')
openai_func = openai_callback(prompt, model_version='gpt-4-0125-preview')

In [7]:
parser = PDFLLMParser(directory, write_path, visualize_path, openai_func)

In [8]:
all_pdfs = list(parser._scan_directory_for_pdfs())
print(all_pdfs)

[PosixPath('data/B1-Aerodynamique_English.pdf'), PosixPath('data/B1-Aerodynamique_English2.pdf'), PosixPath('data/B1-Aerodynamique_English1.pdf'), PosixPath('data/0000077.pdf')]


In [9]:
path  =  list(parser._scan_directory_for_pdfs())[0]
print(path)

data/B1-Aerodynamique_English.pdf


In [52]:
# path  =  np.random.choice(list(parser._scan_directory_for_pdfs()))

In [54]:
visualized = parser._plot_bbs(path)

 33%|███▎      | 2/6 [00:00<00:00, 16.65it/s]

100%|██████████| 6/6 [00:00<00:00, 25.75it/s]


In [55]:
write_path = parser.analyze_files(path)

Writing llm output to output/processed_output/B1-Aerodynamique_English/scores


100%|██████████| 6/6 [00:08<00:00,  1.37s/it]

Finish doing llm inderence





In [56]:
metadata = parser.aggregate_metadata(write_path)

In [57]:
pd.read_csv(metadata, dtype={"file_name": str})

Unnamed: 0,file_path,file_name,page_number,score
0,output/processed_output/B1-Aerodynamique_Engli...,B1-Aerodynamique_English,2,0.6
1,output/processed_output/B1-Aerodynamique_Engli...,B1-Aerodynamique_English,5,0.55
2,output/processed_output/B1-Aerodynamique_Engli...,B1-Aerodynamique_English,6,0.55
3,output/processed_output/B1-Aerodynamique_Engli...,B1-Aerodynamique_English,3,0.55
4,output/processed_output/B1-Aerodynamique_Engli...,B1-Aerodynamique_English,1,0.55
5,output/processed_output/B1-Aerodynamique_Engli...,B1-Aerodynamique_English,4,0.5


In [58]:
print("Parsing easy examples")
parser.parse_pdfs(metadata, threshold=.3)

Parsing easy examples
No rows with given threshold


In [59]:
print("Parsing difficult examples")
parser.parse_pdfs(metadata, threshold=.4, is_greater=True)

Parsing difficult examples


100%|██████████| 6/6 [00:00<00:00, 36.80it/s]


PosixPath('output/processed_output/B1-Aerodynamique_English/parsed_text/complex')

SECTION: Iterate through PDFs in Data and aggregate scores in a table

In [20]:
# #Rename all pdfs to add "id" at the beginning, using simple count for now
# data_directory = Path("data")
# pdf_files = list(data_directory.glob('*.pdf'))
# # Iterate through each PDF file and rename it
# for count, pdf_file in enumerate(pdf_files, start=1):
#     #new file name
#     new_name = f"{count}.{pdf_file.name}"
#     #new file path
#     new_file_path = pdf_file.parent / new_name
#     # Rename the file
#     pdf_file.rename(new_file_path)
# print("Renaming complete.")

Renaming complete.


In [7]:
parser = PDFLLMParser(directory, write_path, visualize_path, openai_func)
data_directory = Path("data")
pdf_files = list(data_directory.glob('*.pdf'))

path  =  list(parser._scan_directory_for_pdfs())[2]
print(path)

total_pdfs = len(pdf_files)
print(total_pdfs)

data/B1-Aerodynamique_English1.pdf
4


In [10]:
#Loop
print(f'Loop will process: {total_pdfs} PDFs')
results_dict = {}
for i in range(total_pdfs):
    path  =  list(parser._scan_directory_for_pdfs())[i]
    result = parser.analyze_to_dict(path, results_dict)
    result = parser.latex_to_dict(path, results_dict)
    result = parser.images_to_dict(path, results_dict)
    result = parser.errors_to_dict(path, results_dict)

print(results_dict)

Loop will process: 4 PDFs
{'B1-Aerodynamique_English_1': {'complexity': '0.55', 'latex_count': 14, 'image_count': 4, 'error_count': 8}, 'B1-Aerodynamique_English_2': {'complexity': '0.6', 'latex_count': 25, 'image_count': 6, 'error_count': 16}, 'B1-Aerodynamique_English_3': {'complexity': '0.65', 'latex_count': 14, 'image_count': 14, 'error_count': 11}, 'B1-Aerodynamique_English_4': {'complexity': '0.55', 'latex_count': 4, 'image_count': 1, 'error_count': 0}, 'B1-Aerodynamique_English_5': {'complexity': '0.6', 'latex_count': 0, 'image_count': 3, 'error_count': 0}, 'B1-Aerodynamique_English_6': {'complexity': '0.55', 'latex_count': 20, 'image_count': 1, 'error_count': 13}, 'B1-Aerodynamique_English2_1': {'complexity': '0.65', 'latex_count': 25, 'image_count': 4, 'error_count': 16}, 'B1-Aerodynamique_English1_1': {'complexity': '0.6', 'latex_count': 14, 'image_count': 4, 'error_count': 8}, '0000077_1': {'complexity': '0.2', 'latex_count': 4, 'image_count': 0, 'error_count': 0}, '0000077_

In [11]:
# Sample structure of results_dict for clarity
# results_dict = {
#     'pdfname1_1': {'complexity': 0.5, 'latex_count': 2, 'image_count': 3},
#     'pdfname1_2': {'complexity': 0.7, 'latex_count': 1, 'image_count': 2},
# }

# Transform the nested dictionary into a list of dictionaries for DataFrame construction
data = []
for key, values in results_dict.items():
    pdf_name, page_num = key.rsplit('_', 1)
    row = {
        'pdf_name': pdf_name,
        'page_num': int(page_num),  # Convert page number to integer
        'complexity_score': values.get('complexity', None),
        'latex_count': values.get('latex_count', None),
        'image_count': values.get('image_count', None),
        'error_count': values.get('error_count', None)
    }
    data.append(row)

# Create the DataFrame
df = pd.DataFrame(data)

# Reorder DataFrame columns if necessary
df = df[['pdf_name', 'page_num', 'complexity_score', 'latex_count', 'image_count', 'error_count']]

# Display the DataFrame
print(df)

                     pdf_name  page_num complexity_score  latex_count  \
0    B1-Aerodynamique_English         1             0.55           14   
1    B1-Aerodynamique_English         2              0.6           25   
2    B1-Aerodynamique_English         3             0.65           14   
3    B1-Aerodynamique_English         4             0.55            4   
4    B1-Aerodynamique_English         5              0.6            0   
5    B1-Aerodynamique_English         6             0.55           20   
6   B1-Aerodynamique_English2         1             0.65           25   
7   B1-Aerodynamique_English1         1              0.6           14   
8                     0000077         1              0.2            4   
9                     0000077         2              0.1            5   
10                    0000077         3              0.1            4   
11                    0000077         4              0.1            2   

    image_count  error_count  
0             4    

In [17]:
#Parsing all pdfs loop
print(f'Loop will process: {total_pdfs} PDFs')
results_dict = {}
for i in range(total_pdfs):
    path  =  list(parser._scan_directory_for_pdfs())[i]
    visualized = parser._plot_bbs(path)
    parser.parse_pdfs
    write_path = parser.analyze_files(path)
    metadata = parser.aggregate_metadata(write_path)
    pd.read_csv(metadata, dtype={"file_name": str})
    print("Parsing easy examples")
    parser.parse_pdfs(metadata, threshold=.3)
    print("Parsing difficult examples")
    parser.parse_pdfs(metadata, threshold=.4, is_greater=True)

    

Loop will process: 4 PDFs


100%|██████████| 6/6 [00:00<00:00, 41.44it/s]


Writing llm output to output/processed_output/B1-Aerodynamique_English/scores


100%|██████████| 6/6 [00:13<00:00,  2.17s/it]


Finish doing llm inderence
Parsing easy examples
No rows with given threshold
Parsing difficult examples


100%|██████████| 6/6 [00:00<00:00, 26.84it/s]
100%|██████████| 1/1 [00:00<00:00, 19.75it/s]


Writing llm output to output/processed_output/B1-Aerodynamique_English2/scores


100%|██████████| 1/1 [00:01<00:00,  1.46s/it]


Finish doing llm inderence
Parsing easy examples
No rows with given threshold
Parsing difficult examples


100%|██████████| 1/1 [00:00<00:00, 17.12it/s]
100%|██████████| 1/1 [00:00<00:00, 13.21it/s]


Writing llm output to output/processed_output/B1-Aerodynamique_English1/scores


100%|██████████| 1/1 [00:01<00:00,  1.61s/it]


Finish doing llm inderence
Parsing easy examples
No rows with given threshold
Parsing difficult examples


100%|██████████| 1/1 [00:00<00:00, 23.50it/s]
100%|██████████| 4/4 [00:00<00:00, 115.02it/s]


Writing llm output to output/processed_output/0000077/scores


100%|██████████| 4/4 [00:05<00:00,  1.34s/it]


Finish doing llm inderence
Parsing easy examples


100%|██████████| 4/4 [00:00<00:00, 75.57it/s]

Parsing difficult examples
No rows with given threshold



