# Imports

In [12]:
%pip install requests

import requests
import re

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Reader Class

In [13]:
class PDFObject:
    """A single object collected by the reader.

    Attributes:
        id: Identifier passed in by the caller.
        content: Text accumulated through ``add_content``; starts empty.
        byte_position: Position value supplied at construction time.
    """

    def __init__(self, id: int, current_byte: int):
        self.id = id
        self.content: str = ""
        self.byte_position: int = current_byte

    def add_content(self, content: str):
        """Append ``content``, newline-separated from any text already stored."""
        # Only emit the separator once something has been stored, so the
        # accumulated text never starts with a newline.
        separator = "\n" if self.content else ""
        self.content = f"{self.content}{separator}{content}"

class SimplePDFReader:
    """Line-oriented state machine that walks a simplified PDF text file.

    The reader starts in the ``header`` state and moves through ``object`` /
    ``inner_object`` (once per object), ``xref``, ``trailer`` and ``end``.
    Each state is handled by the method registered in ``self.states``.

    Attributes:
        current_byte: Running offset of the line about to be read, counting
            one newline per line (``len`` of the decoded text is used, so
            this equals a byte offset only for single-byte characters).
        current_line: Index into ``lines`` of the line about to be read.
        lines: The input, already split on ``"\\n"``.
        state: Name of the current state (a key of ``states``).
        objects: Objects collected so far, in file order.
        trailer_content: Newline-joined lines captured in the trailer state.
    """

    def __init__(self, lines: list[str]):
        self.current_byte: int = 0
        self.current_line: int = 0
        self.lines: list[str] = lines
        self.state: str = "header"
        self.states = {
            "header": self.header_state,
            "object": self.object_state,
            "inner_object": self.inner_object_state,
            "xref": self.xref_state,
            # Must dispatch to trailer_state; pointing this at header_state
            # would leave the trailer handler unreachable.
            "trailer": self.trailer_state,
            "end": self.end_state
        }
        self.objects: list[PDFObject] = []
        self.trailer_content: str = ""

    def header_state(self, line: str):
        """Wait for the ``%SPDF-<major>.<minor>`` marker, then expect objects."""
        # The dot is escaped so only a literal "." separates the version parts.
        pattern = r"%SPDF-\d+\.\d+"
        if re.search(pattern, line):
            self.state = "object"

    def object_state(self, line: str):
        """Start a new object on ``<id> 0 obj`` or switch to the xref section."""
        # strip() tolerates a trailing "\r" left behind by splitting CRLF
        # text on "\n".
        if line.strip() == "xref":
            self.state = "xref"
            return

        pattern = r"(?P<Id>\d+) 0 obj"
        match = re.search(pattern, line)
        if match:
            # The regex group is a string; PDFObject.id is declared as int.
            object_id = int(match.group("Id"))
            self.objects.append(PDFObject(object_id, self.current_byte))
            self.state = "inner_object"

    def inner_object_state(self, line: str):
        """Collect body lines of the current object until ``endobj``."""
        if line.strip() == "endobj":
            self.state = "object"
            return

        self.objects[-1].add_content(line)

    def xref_state(self, line: str):
        """Handle the ``0 <count>`` line that follows ``xref``.

        When the line matches, the ``count`` entry lines are skipped by
        advancing ``current_line``. Because ``read_line`` advances once more
        after this handler returns, the line right after the entries is
        skipped as well. The state always becomes ``trailer``, whether or not
        the line matched.
        """
        pattern = r"0 (?P<LineCount>\d+)"
        match = re.search(pattern, line)
        if match:
            # range() needs an int; the regex group is a string.
            line_count = int(match.group("LineCount"))
            self.current_line += 1
            self.current_byte += 1
            for i in range(line_count):
                # The table may list more entries than objects were parsed;
                # only touch objects that actually exist.
                if i < len(self.objects):
                    # NOTE(review): every covered object receives the same
                    # current_byte value and the offsets written in the entry
                    # lines are not parsed - confirm this is intended.
                    self.objects[i].byte_position = self.current_byte
                self.current_line += 1

        self.state = "trailer"

    def trailer_state(self, line: str):
        """Accumulate trailer lines until ``startxref`` is reached."""
        if line.strip() == "startxref":
            self.state = "end"
            return

        if self.trailer_content:
            self.trailer_content += "\n"
        self.trailer_content += line

    def end_state(self, line: str):
        """Terminal state: every remaining line is ignored."""
        pass

    def read(self):
        """Process lines until ``current_line`` moves past the end of input."""
        while self.current_line < len(self.lines):
            self.read_line()

    def read_line(self):
        """Dispatch the current line to the active state and advance."""
        line = self.lines[self.current_line]

        # Empty lines carry no information for any state; they only advance
        # the counters.
        if line != "":
            # Run method for current state
            self.states[self.state](line)

        # Every line, empty or not, is followed by one "\n" separator in the
        # original text, so it must be counted to keep offsets consistent.
        self.current_byte += len(line) + 1
        self.current_line += 1

# Main Code

In [14]:
file_name = "RA1-PDF1.txt"

url = f"https://raw.githubusercontent.com/xarss/Building-Interpreters/refs/heads/main/Tests/{file_name}"

# Load test file. The timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops an HTTP error page (e.g. a 404
# body) from being fed to the parser as if it were the test file.
response = requests.get(url, timeout=30)
response.raise_for_status()
lines = response.text.split("\n")

reader = SimplePDFReader(lines)

reader.read()
# Final state of the reader; "end" means the whole file structure was walked.
reader.state

'header'