# Resume Builder v1.0 - PDF Upload and Parser

This notebook allows you to upload a PDF resume and extract its content for further processing.

## Step 1: Install Required Dependencies

Run this cell to install the necessary libraries for PDF processing and file upload widgets.

In [None]:
# Install required packages: ipywidgets provides the upload control,
# pdfplumber and PyPDF2 are the two PDF text extractors used below.
# The -q flag keeps pip's output short.
!pip install ipywidgets pdfplumber PyPDF2 -q

# Enable widgets extension if not already enabled
# NOTE(review): this targets the classic-notebook extension mechanism; newer
# Jupyter/ipywidgets releases may not need this step or may not ship the
# `nbextension` subcommand -- confirm for your environment.
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

## Step 2: Import Required Libraries

In [None]:
import html
import io
import os
from typing import Optional, Dict, Any

import ipywidgets as widgets
import pdfplumber
import PyPDF2
from IPython.display import display, HTML, clear_output

## Step 3: Create PDF Upload Widget with Validation

In [None]:
class PDFResumeUploader:
    """Upload widget that accepts one PDF resume and extracts its text.

    The instance owns three widgets: a ``FileUpload`` control, an ``Output``
    area for status messages and an ``Output`` area for the extracted text.
    After a successful upload the raw bytes are kept in ``pdf_content`` and
    the extracted text in ``pdf_text``.
    """

    def __init__(self):
        """Create the output areas and the upload control."""
        self.upload_widget = None
        self.status_output = widgets.Output()
        self.content_output = widgets.Output()
        self.pdf_content = None
        self.pdf_text = None
        self.setup_widget()

    def setup_widget(self):
        """Set up the upload widget with PDF-only acceptance."""
        self.upload_widget = widgets.FileUpload(
            accept='.pdf',  # Only accept PDF files
            multiple=False,  # Only single file upload
            description='Upload Resume (PDF only)'
        )
        self.upload_widget.observe(self.on_upload, names='value')

    def validate_pdf(self, file_content: bytes, filename: str) -> bool:
        """Return True if the upload looks like a PDF.

        Two checks are made: the filename must end in ``.pdf`` (case
        insensitive) and the content must start with the ``%PDF`` signature.

        Args:
            file_content: Raw file data (``bytes`` or another buffer object
                such as ``memoryview``).
            filename: Name of the uploaded file.

        Returns:
            True when both checks pass, False otherwise (including when
            either argument is missing).
        """
        if not filename or not filename.lower().endswith('.pdf'):
            return False

        if file_content is None:
            return False

        # bytes() normalises buffer objects so the comparison is bytes-to-bytes.
        return bytes(file_content[:4]) == b'%PDF'

    @staticmethod
    def _join_pages(page_texts) -> str:
        """Join per-page texts with blank lines, skipping empty/None pages."""
        return "\n\n".join(text for text in page_texts if text).strip()

    @staticmethod
    def _format_success_message(filename: str, size: int) -> str:
        """Build the success banner HTML with the filename safely escaped."""
        return (
            '<div style="color: green; font-weight: bold;">'
            f'✅ Successfully uploaded: {html.escape(filename)} ({size:,} bytes)'
            '</div>'
        )

    def extract_text_with_pdfplumber(self, file_content: bytes) -> Optional[str]:
        """Extract text from PDF using pdfplumber.

        Returns:
            The text of all pages separated by blank lines (possibly an empty
            string), or None if pdfplumber raised while reading the file.
        """
        try:
            with pdfplumber.open(io.BytesIO(file_content)) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
        except Exception as e:
            # Best-effort: any parser failure is reported and signalled with
            # None so the caller can try the fallback extractor.
            print(f"Error with pdfplumber: {e}")
            return None
        return self._join_pages(page_texts)

    def extract_text_with_pypdf2(self, file_content: bytes) -> Optional[str]:
        """Extract text from PDF using PyPDF2 as fallback.

        Returns:
            The text of all pages separated by blank lines (possibly an empty
            string), or None if PyPDF2 raised while reading the file.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            page_texts = [page.extract_text() for page in pdf_reader.pages]
        except Exception as e:
            # Best-effort: report the failure and signal it with None.
            print(f"Error with PyPDF2: {e}")
            return None
        # Pages that yield no text are skipped rather than concatenated, so a
        # single empty page cannot fail the whole document.
        return self._join_pages(page_texts)

    def on_upload(self, change):
        """Handle a change of the upload widget's value.

        Expects ``change['new']`` to be a sequence whose first item is a
        mapping with ``'name'`` and ``'content'`` entries. Validates the file,
        reports the outcome in the status area and, on success, extracts and
        displays the text.
        """
        new_value = change['new']

        # Clearing the widget programmatically (after a rejected file or in
        # reset()) fires this observer again with an empty value. Return
        # before touching the status area so the message just shown survives.
        if not new_value:
            return

        uploaded_file = new_value[0]
        filename = uploaded_file['name']
        # Normalise to bytes once; the widget may hand over a buffer object.
        content = bytes(uploaded_file['content'])

        # Drop results from any earlier upload so a failed attempt cannot
        # leave stale text visible or retrievable.
        self.pdf_content = None
        self.pdf_text = None
        with self.content_output:
            clear_output()

        with self.status_output:
            clear_output()

            # Validate PDF
            if not self.validate_pdf(content, filename):
                display(HTML(
                    '<div style="color: red; font-weight: bold;">'
                    '❌ Error: Please upload a valid PDF file only. '
                    'Other file formats are not accepted.'
                    '</div>'
                ))
                # Clear the upload widget
                self.upload_widget.value = ()
                return

            # Success message (filename is user-controlled, hence escaped)
            display(HTML(self._format_success_message(filename, len(content))))

            # Store the PDF content
            self.pdf_content = content

            # Extract text from PDF
            print("\n📄 Extracting text from PDF...")

            # Try pdfplumber first, then PyPDF2 as fallback
            self.pdf_text = self.extract_text_with_pdfplumber(content)
            if not self.pdf_text:
                print("Trying alternative extraction method...")
                self.pdf_text = self.extract_text_with_pypdf2(content)

            if self.pdf_text:
                print("✅ Text extraction successful!")
                self.display_content()
            else:
                print("❌ Could not extract text from the PDF.")

    def display_content(self):
        """Show the extracted text and simple statistics in the content area."""
        with self.content_output:
            clear_output()

            if not self.pdf_text:
                return

            display(HTML('<h3>📋 Extracted Resume Content:</h3>'))

            # Read-only, fixed-height text area so long resumes scroll.
            text_widget = widgets.Textarea(
                value=self.pdf_text,
                placeholder='Extracted text will appear here',
                description='',
                disabled=True,
                layout=widgets.Layout(width='100%', height='400px')
            )
            display(text_widget)

            # Display statistics
            lines = self.pdf_text.split('\n')
            words = self.pdf_text.split()
            display(HTML(
                f'<div style="margin-top: 10px;">'
                f'<b>Statistics:</b><br>'
                f'• Lines: {len(lines)}<br>'
                f'• Words: {len(words)}<br>'
                f'• Characters: {len(self.pdf_text)}'
                f'</div>'
            ))

    def display(self):
        """Display the complete upload interface."""
        display(HTML('<h2>📄 Resume PDF Uploader</h2>'))
        display(HTML(
            '<p style="color: #666;">'
            'Upload your resume in PDF format. The system will validate the file type '
            'and extract the text content for further processing.'
            '</p>'
        ))
        display(self.upload_widget)
        display(self.status_output)
        display(self.content_output)

    def get_extracted_text(self) -> Optional[str]:
        """Return the text of the last successful upload, or None if absent."""
        return self.pdf_text

    def reset(self):
        """Clear the upload widget, the stored data and both output areas."""
        self.upload_widget.value = ()
        self.pdf_content = None
        self.pdf_text = None
        with self.status_output:
            clear_output()
        with self.content_output:
            clear_output()

## Step 4: Initialize and Display the PDF Uploader

Run this cell to create the upload interface. You can then upload your PDF resume.

In [None]:
# Create and display the PDF uploader.
# `uploader` is kept as a notebook-level variable because the later cells
# (text preview and reset) call methods on it.
uploader = PDFResumeUploader()
uploader.display()

## Step 5: Process Extracted Text (Optional)

Once you've uploaded a PDF, you can access the extracted text for further processing.

In [None]:
# Fetch whatever text the uploader currently holds (run this after uploading a PDF)
extracted_text = uploader.get_extracted_text()

if not extracted_text:
    print("⚠️ No text has been extracted yet. Please upload a PDF file first.")
else:
    # Show at most 500 characters; an ellipsis marks a truncated preview.
    preview = extracted_text
    if len(extracted_text) > 500:
        preview = extracted_text[:500] + "..."

    print("✅ Text successfully extracted and ready for processing!")
    print("\nFirst 500 characters of extracted text:")
    print("=" * 50)
    print(preview)

## Step 6: Reset Uploader (Optional)

Use this to clear the current upload and start fresh.

In [None]:
# Reset the uploader to upload a different file.
# reset() clears the upload widget, the stored PDF bytes and text, and both
# output areas.
uploader.reset()
print("🔄 Uploader has been reset. You can now upload a new PDF file.")

## Next Steps

In future versions, we can:
1. Parse the extracted text to identify sections (Education, Experience, Skills, etc.)
2. Structure the data into a standardized format
3. Allow editing of the extracted information
4. Generate new resumes with different templates
5. Support additional file formats (DOCX, TXT, etc.)