In [8]:
!pip install PyPDF2
'''pure-Python library used for manipulating PDF files
used for test extraction, merging & splitting, page manipulation, metadata & security'''


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [9]:
%%writefile src/pdf_parser.py
"""
Phase 1: PDF Parsing & Text Preprocessing
take a raw PDF file and convert it into a clean, machine-readable string of text which can be used for further analysis
"""

import re
from PyPDF2 import PdfReader

'''extracts the raw text from each page and joins it all together into one long string'''
def extract_pdf_text(pdf_path: str) -> str:
    text = ""
    reader = PdfReader(pdf_path)

    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"

    return text

'''removes unnecessary spaces, non-ascii characters, trims the leading or trailing spaces'''
def clean_text(text: str) -> str:
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    return text.strip()


def extract_and_clean(pdf_path: str) -> str:
    return clean_text(extract_pdf_text(pdf_path))


Writing src/pdf_parser.py


In [11]:
import sys
from pathlib import Path

repo_root = Path.cwd()
sys.path.append(str(repo_root / "src"))

from pdf_parser import extract_and_clean



In [21]:
#defining paths

industry_pdf = repo_root / "data/industry/Job Description + SkillSet.pdf"
handbook_2022 = repo_root / "data/university/Handbook 2022-2026.pdf"
handbook_2023 = repo_root / "data/university/Handbook batch 2023-2027.pdf"


In [22]:
industry_text = extract_and_clean(str(industry_pdf))
handbook_2022_text = extract_and_clean(str(handbook_2022))
handbook_2023_text = extract_and_clean(str(handbook_2023))

print("Industry text length:", len(industry_text))
print("Handbook 2022–26 length:", len(handbook_2022_text))
print("Handbook 2023–27 length:", len(handbook_2023_text))


Industry text length: 43814
Handbook 2022–26 length: 223244
Handbook 2023–27 length: 59609


In [23]:
#sanity check
industry_text[:1000]


'JOB DESCRIPTIONS AND SKILL SETS Prof. Animesh Giri Associate Professor & Faculty Placement Coordinator PES University  Electronic City Campus animeshgiri@pes.edu Table of Contents Affinsys AI Pvt. Ltd. Internship Job Details ................................ ................................ ................................ .... 4 Amazon AISPL Cloud Support Associate Job Details ................................ ................................ .................. 5 Software Developer Intern - AVEVA ................................ ................................ ................................ ............ 6 Software Developer Graduate - AVEVA ................................ ................................ ................................ ...... 7 Engineer Trainee - Baxter ................................ ................................ ................................ ............................. 12 Digital Intern - BT Group ................................ .......................

In [24]:
'''takes the cleaned text extracted from PDFs and permanently saves it as a structured JSON file
so all future phases can reuse it without re-reading PDFs.'''

import json

output_dir = repo_root / "outputs/processed_data"
output_dir.mkdir(parents=True, exist_ok=True)

output_data = {
    "industry_text": industry_text,
    "handbook_2022_26": handbook_2022_text,
    "handbook_2023_27": handbook_2023_text
}

with open(output_dir / "phase1_cleaned_text.json", "w") as f:
    json.dump(output_data, f, indent=2)

print("Phase 1 output saved successfully.")


Phase 1 output saved successfully.
