In [1]:
"""
Professional Transcript Extraction System using GPT Vision
US Academic Transcript Data Extraction
"""

import os
import json
import logging
from dotenv import load_dotenv
from openai import OpenAI

# Load environment variables (python-dotenv)
load_dotenv()

# Console logging, one line per record: "<logger name> - <level> - <message>"
logging.basicConfig(
    format='%(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler()],
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# The API key is read from the environment only; a missing or empty key
# stops the notebook here, before any client is created.
api_key = os.environ.get('OPENAI_API_KEY')
if api_key:
    client = OpenAI(api_key=api_key)
else:
    raise ValueError("OPENAI_API_KEY not found in environment variables")

# Project-local building blocks used by the following cells
from controllers import (
    TRANSCRIPT_SCHEMA,
    PDFToImageConverter,
    GPTVisionExtractor,
    TranscriptProcessor,
)

# Confirmation that every setup stage above ran to completion
print(
    "✓ Dependencies loaded",
    "✓ OpenAI client initialized",
    "✓ Controllers imported",
    sep="\n",
)

✓ Dependencies loaded
✓ OpenAI client initialized
✓ Controllers imported


In [2]:
# Initialize System Components
#
# Each tunable setting is named once, so the status messages printed below
# always report the values that were actually passed to the constructors.
PDF_DPI = 300
PDF_IMAGE_FORMAT = 'PNG'
VISION_MODEL = "gpt-4o"
MAX_RETRIES = 3

# Wire the pipeline: the processor is given the converter and the extractor.
pdf_converter = PDFToImageConverter(dpi=PDF_DPI, format=PDF_IMAGE_FORMAT)
gpt_extractor = GPTVisionExtractor(client=client, model=VISION_MODEL, max_retries=MAX_RETRIES)
transcript_processor = TranscriptProcessor(pdf_converter=pdf_converter, gpt_extractor=gpt_extractor)

print(f"✓ PDF Converter ready ({PDF_DPI} DPI)")
print(f"✓ GPT Vision Extractor ready ({VISION_MODEL})")
print("✓ Transcript Processor ready")

controllers.pdf_converter - INFO - PDF converter initialized with DPI: 300, Format: PNG
controllers.gpt_extractor - INFO - GPT Vision extractor initialized with model: gpt-4o
controllers.data_processor - INFO - Transcript processor initialized


✓ PDF Converter ready (300 DPI)
✓ GPT Vision Extractor ready (gpt-4o)
✓ Transcript Processor ready


In [3]:
# Step 1: Verify Sample PDF
pdf_path = "input/1.pdf"

print("STEP 1: Verifying Sample PDF")
print("=" * 40)

# isfile() rather than exists(): a directory that happens to carry this name
# must not be reported as a usable PDF.
pdf_found = os.path.isfile(pdf_path)

if pdf_found:
    file_size = os.path.getsize(pdf_path) / 1024  # KB
    print(f"✓ PDF found: {pdf_path}")
    print(f"✓ File size: {file_size:.1f} KB")
    # Only announce readiness when there is actually something to process
    print("\nReady to process transcript...")
else:
    print(f"✗ PDF not found: {pdf_path}")
    print("Please ensure the PDF file exists in the input/ directory")

STEP 1: Verifying Sample PDF
✓ PDF found: input/1.pdf
✓ File size: 419.2 KB

Ready to process transcript...


In [4]:
# Step 2: Extract Transcript Data
print("STEP 2: Extracting Transcript Data")
print("=" * 40)

# Reset up front: if this cell is interrupted (KeyboardInterrupt is not an
# Exception) or fails, the later steps must not pick up a stale result left
# over from an earlier execution of this cell.
extraction_result = None

try:
    print("Processing transcript with GPT Vision...")
    print("This may take 30-60 seconds per page...")

    # Process the transcript
    result = transcript_processor.process_transcript(pdf_path)

    print("✓ Extraction completed successfully!")

    # Store result for next step
    extraction_result = result

except Exception as e:
    # Deliberately non-fatal so the remaining cells can report the failure,
    # but keep the full traceback in the log for diagnosis.
    print(f"✗ Extraction failed: {str(e)}")
    logging.getLogger(__name__).exception("Transcript extraction failed")
    extraction_result = None

controllers.data_processor - INFO - Starting transcript processing: input/1.pdf
controllers.data_processor - INFO - Step 1: Converting PDF to images
controllers.pdf_converter - INFO - Converting PDF to images: input/1.pdf


STEP 2: Extracting Transcript Data
Processing transcript with GPT Vision...
This may take 30-60 seconds per page...


controllers.pdf_converter - INFO - Successfully converted 3 pages to images
controllers.data_processor - INFO - Step 2: Extracting data from 3 pages
controllers.gpt_extractor - INFO - Processing page 1/3: input/1_images/page_001.png
httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
controllers.gpt_extractor - INFO - Successfully extracted data from page 1
controllers.schema - INFO - Schema validation passed
controllers.data_processor - INFO - Successfully processed page 1
controllers.gpt_extractor - INFO - Processing page 2/3: input/1_images/page_002.png
httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
controllers.gpt_extractor - INFO - Successfully extracted data from page 2
controllers.schema - INFO - Schema validation passed
controllers.data_processor - INFO - Successfully processed page 2
controllers.gpt_extractor - INFO - Processing page 3/3: input/1_images/page_003.png
httpx - INFO - HTTP R

✓ Extraction completed successfully!


In [5]:
# Step 3: Display Extracted Data
print("STEP 3: Transcript Data Results")
print("=" * 40)

if not extraction_result:
    # Step 2 left nothing usable behind
    print("No data to display - extraction failed in previous step")
else:
    # Only the transcript payload is shown here, not the processing metadata
    transcript_data = extraction_result.get('transcript_data', {})

    # Indented JSON; ensure_ascii=False keeps non-ASCII characters readable
    print("\nCOMPLETE EXTRACTED DATA:")
    print(json.dumps(transcript_data, ensure_ascii=False, indent=2))

STEP 3: Transcript Data Results

COMPLETE EXTRACTED DATA:
{
  "student_information": {
    "student_name": "Cheyenne Alexus Clark",
    "student_id": "10014755",
    "mailing_address": "807 N Van Buren Ave\nMount Pleasant, TX 75455-3249"
  },
  "institution_information": {
    "institution_name": "Northeast Texas Community College",
    "transcript_type": "Undergraduate Division",
    "zip_code": "75455-3249",
    "state": "TX",
    "city": "Mount Pleasant",
    "registrar_signature": "Betsy Gooding",
    "seal_present": false,
    "transcript_issue_date": "9/19/2022",
    "address": "PO Box 1307 • Mount Pleasant, Texas 75456-1307",
    "institution_code": "023154"
  },
  "gpa_summary_info": {
    "term_gpa_history": [],
    "unweighted_gpa": 3.03,
    "quality_points": 9.0
  },
  "degree_information": {
    "major": [],
    "minor": [],
    "concentration": [],
    "degree_awarded": "Associate of Science",
    "degree_date": "05/12/2022"
  },
  "honors_and_awards": [],
  "transfer_cre

In [6]:
# Step 4: Processing Summary & Verification
print("STEP 4: Processing Summary & Quality Metrics")
print("=" * 50)

if extraction_result:
    # `or {}` (instead of a .get default) also covers sections that are
    # present but null, which would otherwise break the chained lookups.
    metadata = extraction_result.get('processing_metadata') or {}
    metrics = metadata.get('verification_metrics') or {}
    integrity = metadata.get('integrity_validation') or {}  # read but not displayed in this cell

    # Append the percent sign only when a score exists (avoids "N/A%")
    completeness = metrics.get('data_completeness_score')
    completeness_text = 'N/A' if completeness is None else f"{completeness}%"

    # Three states: a missing check is reported as N/A, not as a failure
    gpa_check = metrics.get('gpa_consistency_check')
    if gpa_check is None:
        gpa_status = 'N/A'
    else:
        gpa_status = 'PASS' if gpa_check else 'FAIL'

    print(f"📄 Total Pages Processed: {metadata.get('total_pages', 'N/A')}")
    print(f"📊 Total Courses Found: {metrics.get('total_courses', 'N/A')}")
    print(f"📈 Credits Calculated: {metrics.get('total_credits_calculated', 'N/A')}")
    print(f"📝 Data Completeness: {completeness_text}")
    print(f"✅ GPA Consistency: {gpa_status}")
    print(f"🔍 Deduplication: {metadata.get('deduplication_method', 'N/A')}")

    # Show key extracted sections
    transcript_data = extraction_result.get('transcript_data') or {}
    student_info = transcript_data.get('student_information') or {}
    institution_info = transcript_data.get('institution_information') or {}
    gpa_summary = transcript_data.get('gpa_summary_info') or {}

    print(f"\n📋 KEY DATA EXTRACTED:")
    print(f"   Student Name: {student_info.get('student_name', 'Not found')}")
    print(f"   Institution: {institution_info.get('institution_name', 'Not found')}")
    print(f"   GPA (Weighted): {gpa_summary.get('weighted_gpa', 'Not found')}")
    print(f"   GPA (Unweighted): {gpa_summary.get('unweighted_gpa', 'Not found')}")

    print(f"\n🎯 EXTRACTION COMPLETED SUCCESSFULLY!")

    # The extraction ran to completion, but a failed check means the numbers
    # above should not be trusted without a manual look.
    if gpa_status == 'FAIL':
        print("⚠️ GPA consistency check failed - review the extracted GPA values")

else:
    print("❌ No processing summary available - extraction failed")

STEP 4: Processing Summary & Quality Metrics
📄 Total Pages Processed: 3
📊 Total Courses Found: 27
📈 Credits Calculated: 67.0
📝 Data Completeness: 100.0%
✅ GPA Consistency: FAIL
🔍 Deduplication: course_id_plus_term

📋 KEY DATA EXTRACTED:
   Student Name: Cheyenne Alexus Clark
   Institution: Northeast Texas Community College
   GPA (Weighted): Not found
   GPA (Unweighted): 3.03

🎯 EXTRACTION COMPLETED SUCCESSFULLY!


# US Academic Transcript Extraction

This notebook demonstrates step-by-step extraction of data from US academic transcripts using GPT Vision.

## Process Overview

1. **Setup & Verification** - Load components and verify input PDF
2. **Data Extraction** - Process transcript with GPT Vision API  
3. **Results Display** - Show complete extracted data in JSON format
4. **Quality Verification** - Display processing metrics and key data points

## Expected Output

The system extracts comprehensive transcript data including:
- Student information (name, ID, contact details)
- Institution information (school details, registrar info)
- GPA summary (weighted/unweighted, class rank)
- Academic records (all courses with grades, credits, terms)
- Degree information and honors/awards

Processing typically takes 30-60 seconds per page and costs approximately $0.30-0.60 per page.