### **Import Libraries and Global Dependencies**

In [None]:
from __future__ import print_function
import os
import json
import time
import csv
import logging
import concurrent.futures
import ssl
import certifi
import requests
import pandas as pd
import random
import socket
import httplib2
from datetime import datetime
from threading import Lock
import base64

# Google API modules
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# Encryption for token security
from cryptography.fernet import Fernet
from typing import List, Dict, Set, Tuple, Optional

### **Configuration Loader**

In [None]:
# Load the notebook-wide configuration. Any problem with config.json is fatal,
# so every failure mode exits with a readable message instead of a traceback.
try:
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open("config.json", "r", encoding="utf-8") as f:
        CONFIG = json.load(f)
except FileNotFoundError:
    raise SystemExit("FATAL ERROR: config.json file not found. Please create config.json in this directory.")
except json.JSONDecodeError as e:
    raise SystemExit(f"FATAL ERROR: config.json is not valid JSON: {e}")

# Extract config variables (all five keys are mandatory; report every missing
# one at once rather than failing on the first KeyError).
_REQUIRED_CONFIG_KEYS = ("api_scopes", "output_csv", "token_file", "credentials_file", "filters")
_missing_config_keys = [key for key in _REQUIRED_CONFIG_KEYS if key not in CONFIG]
if _missing_config_keys:
    raise SystemExit(
        f"FATAL ERROR: config.json is missing required key(s): {', '.join(_missing_config_keys)}"
    )

SCOPES = CONFIG["api_scopes"]
OUTPUT_CSV = CONFIG["output_csv"]
TOKEN_FILE = CONFIG["token_file"]
CREDENTIALS_FILE = CONFIG["credentials_file"]
FILTERS = CONFIG["filters"]

# Shared logging setup for the whole notebook: each run gets its own
# timestamped file under logs/, and every record is also echoed to the console.
log_dir = "logs"
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file = os.path.join(log_dir, f"retrieval_{timestamp}.log")

# The directory must exist before the FileHandler tries to open the log file.
os.makedirs(log_dir, exist_ok=True)

logging.basicConfig(
    handlers=[
        logging.FileHandler(log_file, encoding="utf-8"),
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

logger = logging.getLogger(__name__)
logger.info("Configuration and logging initialized successfully.")


### **Gmail Authentication**

In [None]:
class GmailAuthError(Exception):
    """Signals that authenticating with Gmail or building the service object failed."""

def _load_encryption_key():
    """Load the token-encryption key, creating it on first use.

    The key lives at ``~/.gmail_token_key``. If the file does not exist, a
    fresh Fernet key is generated and written there.

    Returns:
        A ``Fernet`` instance built from the stored (or newly created) key.
    """
    key_path = os.path.expanduser("~/.gmail_token_key")
    try:
        # EAFP: read directly instead of exists()-then-open, which is racy.
        with open(key_path, "rb") as f:
            key = f.read()
    except FileNotFoundError:
        key = Fernet.generate_key()
        # Create the file with owner-only permissions from the start, so the
        # secret is never on disk with the default (possibly world-readable) mode.
        fd = os.open(key_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "wb") as f:
            f.write(key)
        # The mode passed to os.open() only applies when the file is created;
        # re-assert it in case the path appeared in the meantime.
        os.chmod(key_path, 0o600)
        logger.info(f"🔑 New encryption key created at {key_path}")
    return Fernet(key)

def _load_saved_credentials(fernet):
    """Return credentials decrypted from TOKEN_FILE, or None if absent or unreadable."""
    if not os.path.exists(TOKEN_FILE):
        logger.warning("Token file not found — initiating new authentication flow.")
        return None

    logger.info(f"Loading encrypted token from {TOKEN_FILE}")
    try:
        with open(TOKEN_FILE, "rb") as f:
            encrypted = f.read()
        decrypted = fernet.decrypt(encrypted).decode()
        return Credentials.from_authorized_user_info(json.loads(decrypted), SCOPES)
    except Exception as e:
        # Best effort: a corrupt or foreign token just means we re-authenticate.
        logger.warning(f"Token decryption failed: {e}. Proceeding with new authentication.")
        return None


def _run_oauth_flow(max_retries=3, retry_delay=10):
    """Run the interactive OAuth flow, retrying on failure, and return the credentials."""
    for attempt in range(1, max_retries + 1):
        try:
            logger.info(f"Starting OAuth flow (attempt {attempt}/{max_retries})...")
            flow = InstalledAppFlow.from_client_secrets_file(CREDENTIALS_FILE, SCOPES)
            creds = flow.run_local_server(
                port=0,
                authorization_prompt_message="Please authorize this app to access Gmail.",
                success_message="Authentication successful! You may close this tab.",
                open_browser=True
            )
            logger.info("OAuth authentication completed successfully.")
            return creds
        except Exception as e:
            logger.warning(f"OAuth flow attempt {attempt} failed: {e}")
            if attempt == max_retries:
                raise GmailAuthError("OAuth flow failed after multiple retries.") from e
            logger.info(f"Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
    raise GmailAuthError("OAuth flow was not attempted (max_retries < 1).")


def _save_credentials(creds, fernet):
    """Encrypt and persist credentials to TOKEN_FILE; failures are logged, not raised."""
    try:
        os.makedirs(os.path.dirname(TOKEN_FILE) or ".", exist_ok=True)
        encrypted = fernet.encrypt(creds.to_json().encode())
        # Create with owner-only permissions up front rather than chmod-ing
        # after the token has already been written with the default mode.
        fd = os.open(TOKEN_FILE, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "wb") as token:
            token.write(encrypted)
        # Covers a pre-existing file, whose mode os.open() does not change.
        os.chmod(TOKEN_FILE, 0o600)
        logger.info(f"Encrypted token saved to {TOKEN_FILE}")
    except Exception as e:
        logger.warning(f"Failed to save encrypted token: {e}")


def get_gmail_service():
    """Authenticate the user and return a Gmail API service object.

    Credentials are taken, in order of preference, from the encrypted token
    file, from refreshing an expired token, and finally from the interactive
    OAuth flow. Newly obtained or refreshed credentials are re-encrypted and
    saved to TOKEN_FILE.

    Returns:
        The service object produced by ``build("gmail", "v1", ...)``.

    Raises:
        GmailAuthError: if no valid credentials can be obtained or the service
            cannot be built.
    """
    try:
        logger.info("Initializing Gmail authentication process...")
        fernet = _load_encryption_key()
        creds = _load_saved_credentials(fernet)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                try:
                    logger.info("Refreshing expired Gmail token...")
                    creds.refresh(Request())
                    logger.info("Token refreshed successfully.")
                except Exception as e:
                    logger.error(f"Token refresh failed: {e}")
                    creds = None

            # Also reached when the refresh above failed: fall back to the
            # interactive flow instead of building a service with no credentials.
            if not creds or not creds.valid:
                creds = _run_oauth_flow()

            if creds and creds.valid:
                _save_credentials(creds, fernet)

        if not creds or not creds.valid:
            raise GmailAuthError("No valid Gmail credentials could be obtained.")

        service = build("gmail", "v1", credentials=creds)
        logger.info("Gmail service authenticated and ready.")
        return service

    except GmailAuthError:
        # Already the right exception type; log and propagate without re-wrapping.
        logger.exception("Gmail authentication failed.")
        raise
    except Exception as e:
        logger.exception(f"Gmail authentication failed: {e}")
        raise GmailAuthError("Gmail service initialization failed.") from e


### **Relevance Check**

In [None]:
def is_relevant_email(sender: str, subject: str, snippet: str, body: str = "") -> bool:
    """Decide whether an email passes the rule-based relevance filters.

    All matching is case-insensitive substring matching. An email is relevant
    only when all three rules hold:

    1. the sender contains one of ``FILTERS["allowed_sender_domains"]``;
    2. none of ``FILTERS["spam_subject_keywords"]`` occurs in the subject,
       snippet or body;
    3. at least one of ``FILTERS["required_placement_terms"]`` occurs there.

    Args:
        sender: Raw "From" header value; empty or None is never relevant.
        subject: Subject line (None is treated as empty).
        snippet: Gmail preview snippet (None is treated as empty).
        body: Extracted message text (None is treated as empty).

    Returns:
        True if the email should be kept, False otherwise.
    """
    if not sender:
        return False

    sender = sender.lower()
    subject = (subject or "").lower()
    snippet = (snippet or "").lower()
    body = (body or "").lower()

    # The message text is lower-cased above, so the configured terms must be
    # lower-cased too; otherwise a mixed-case entry in config.json never matches.
    allowed_domains = [domain.lower() for domain in FILTERS["allowed_sender_domains"]]
    spam_keywords = [word.lower() for word in FILTERS["spam_subject_keywords"]]
    placement_terms = [word.lower() for word in FILTERS["required_placement_terms"]]

    if not any(domain in sender for domain in allowed_domains):
        return False

    # Spam and placement checks scan the same text, so build it once.
    all_text = f"{subject} {snippet} {body}"
    if any(word in all_text for word in spam_keywords):
        return False

    return any(word in all_text for word in placement_terms)


### **State Management**

In [None]:
# Optional settings read from CONFIG; each falls back to the default shown
# when the key is absent from config.json.
# Path of the text file used to remember already-processed message IDs.
PROCESSED_IDS_FILE = CONFIG.get("processed_ids_file", "state/processed_message_ids.txt")
# Root directory under which downloaded PDF attachments are stored.
ATTACHMENTS_DIR = CONFIG.get("attachments_dir", "attachments")
# Search query passed to the Gmail messages.list call.
GMAIL_QUERY = CONFIG.get("gmail_query", "placement OR internship OR job OR hiring")

def load_processed_ids(path: str) -> set:
    """Read the set of previously handled Gmail message IDs from ``path``.

    The file holds one ID per line; surrounding whitespace is trimmed and
    blank lines are ignored. A missing file, or any error while reading it,
    yields an empty set (read errors are logged as warnings).
    """
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as handle:
                trimmed = (raw.strip() for raw in handle)
                return {message_id for message_id in trimmed if message_id}
        except Exception as e:
            logger.warning(f"Failed to load processed IDs from {path}: {e}")
    return set()

def save_processed_ids(path: str, new_ids: set) -> None:
    """Merge ``new_ids`` into the processed-ID file at ``path``.

    The IDs already on disk are loaded, unioned with ``new_ids``, and the
    whole set is written back sorted, one ID per line. The write goes to a
    temporary sibling file that is then moved over ``path`` with
    ``os.replace``, so an interruption mid-write cannot truncate the existing
    state file. Write failures are logged and not raised.

    Args:
        path: Location of the processed-ID file; its directory is created if needed.
        new_ids: Message IDs to record in addition to those already stored.
    """
    existing = load_processed_ids(path)
    merged = existing.union(new_ids)

    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    # Same directory as the target so os.replace() stays on one filesystem.
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(f"{mid}\n" for mid in sorted(merged))
        os.replace(tmp_path, path)
        logger.info(f"Saved {len(merged)} total processed IDs to {path}")
    except Exception as e:
        logger.error(f"Failed to save processed IDs to {path}: {e}")
        # Best-effort cleanup of the partial temp file; the original is untouched.
        try:
            os.remove(tmp_path)
        except OSError:
            pass


### **MIME Parsing Helper**

In [None]:
def _decode_base64_urlsafe(data: str) -> str:
    """Turn a base64url string (padding optional) into UTF-8 text.

    Bytes that are not valid UTF-8 become U+FFFD. Empty input, or input that
    cannot be decoded at all, gives an empty string.
    """
    if not data:
        return ""
    try:
        # Restore the "=" padding so the length is a multiple of four.
        padding = "=" * (-len(data) % 4)
        raw = base64.urlsafe_b64decode((data + padding).encode("ASCII"))
        return raw.decode("utf-8", errors="replace")
    except Exception:
        return ""

def extract_text_from_payload(payload: dict) -> str:
    """Gather the decoded content of every ``text/*`` part in a Gmail payload.

    The payload tree is visited depth-first, parent before children. Each part
    whose MIME type starts with ``text/`` and whose body carries inline data
    contributes its decoded text. Pieces are joined with blank lines and the
    result is stripped of leading/trailing whitespace.
    """

    def iter_text(node: dict):
        kind = node.get("mimeType", "")
        encoded = node.get("body", {}).get("data")
        if encoded and kind.startswith("text/"):
            yield _decode_base64_urlsafe(encoded)
        for child in node.get("parts", []):
            yield from iter_text(child)

    return "\n\n".join(iter_text(payload)).strip()

def collect_pdf_parts(payload: dict) -> list:
    """List the PDF attachments found anywhere in a Gmail payload tree.

    A part counts as a PDF attachment when its whitespace-trimmed filename
    ends in ``.pdf`` (case-insensitive) and its body has an ``attachmentId``.
    Returns ``(filename, attachmentId)`` tuples in depth-first order, parent
    before children.
    """
    found = []
    pending = [payload]  # explicit stack instead of recursion

    while pending:
        node = pending.pop()
        name = node.get("filename", "").strip()
        attachment_ref = node.get("body", {}).get("attachmentId")

        if attachment_ref and name.lower().endswith(".pdf"):
            found.append((name, attachment_ref))

        # Push children reversed so they pop in their original order.
        pending.extend(reversed(node.get("parts", [])))

    return found

def _safe_attachment_name(filename: str, taken: set) -> str:
    """Return a directory-free file name for an attachment, unique within ``taken``."""
    # Attachment names come from the email itself, so strip any directory
    # components ("../", absolute paths, Windows separators) before use.
    name = os.path.basename(filename.replace("\\", "/")).strip()
    if name in ("", ".", ".."):
        name = "attachment.pdf"

    stem, ext = os.path.splitext(name)
    candidate = name
    counter = 1
    # Compare case-insensitively so names cannot collide on such filesystems.
    while candidate.lower() in taken:
        candidate = f"{stem}_{counter}{ext}"
        counter += 1
    taken.add(candidate.lower())
    return candidate


def download_pdfs_for_message(service, user_id: str, message_id: str, payload: dict, attachments_dir: str) -> list:
    """Download every PDF attachment of one message into its own directory.

    Files are written to ``<attachments_dir>/<message_id>/``. Attachment names
    are reduced to their base name so a crafted filename cannot write outside
    that directory, and repeated names within the message get a numeric suffix
    instead of overwriting each other.

    Args:
        service: Gmail API service object used to fetch attachment data.
        user_id: Gmail user ID passed to the API.
        message_id: ID of the message that owns the attachments.
        payload: The message payload to scan for PDF parts.
        attachments_dir: Root directory for downloaded attachments.

    Returns:
        Paths of the files that were saved. Attachments that fail to download
        or have no data are skipped (failures are logged).
    """
    pdf_parts = collect_pdf_parts(payload)
    if not pdf_parts:
        return []

    message_dir = os.path.join(attachments_dir, message_id)
    os.makedirs(message_dir, exist_ok=True)
    saved_files = []
    used_names = set()

    for filename, attachment_id in pdf_parts:
        try:
            att = service.users().messages().attachments().get(
                userId=user_id, messageId=message_id, id=attachment_id
            ).execute()

            data = att.get("data")
            if not data:
                continue

            # Restore the "=" padding before decoding the base64url data.
            file_bytes = base64.urlsafe_b64decode(data + "=" * (-len(data) % 4))
            file_path = os.path.join(message_dir, _safe_attachment_name(filename, used_names))

            with open(file_path, "wb") as f:
                f.write(file_bytes)

            saved_files.append(file_path)
            logger.debug(f"Saved PDF: {filename}")

        except HttpError as e:
            logger.warning(f"Failed PDF {filename} for {message_id}: {e}")
        except Exception as e:
            logger.error(f"PDF download error {filename}: {e}")

    return saved_files


### **Fetcher Function**

In [None]:
def _list_message_ids(service, query: str, max_retries: int = 5) -> list:
    """Page through messages.list for ``query`` and return all message stubs found."""
    messages = []
    next_page_token = None
    consecutive_failures = 0

    while True:
        try:
            results = service.users().messages().list(
                userId="me", q=query, maxResults=500, pageToken=next_page_token
            ).execute()
        except (HttpError, ssl.SSLError) as e:
            consecutive_failures += 1
            label = "SSL error" if isinstance(e, ssl.SSLError) else "List error"
            logger.warning(f"{label}: {e}")
            # Bounded retries: a persistent failure (e.g. revoked access) must
            # not spin forever on the same page.
            if consecutive_failures >= max_retries:
                logger.error(f"Giving up listing messages after {max_retries} consecutive failures.")
                raise
            time.sleep(3 * consecutive_failures)
            continue

        consecutive_failures = 0
        batch = results.get("messages", [])
        if not batch:
            break
        messages.extend(batch)
        next_page_token = results.get("nextPageToken")
        if not next_page_token:
            break

    return messages


def _append_results_to_csv(rows: list, csv_path: str) -> None:
    """Append result rows to the CSV at ``csv_path``, writing the header if the file is new or empty."""
    fieldnames = ["MessageId", "Sender", "Subject", "Date", "Preview", "Body", "PDFs", "WordCount"]
    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)
    # An existing zero-byte file still needs a header row.
    needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

    with open(csv_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerows(rows)


def fetch_emails():
    """Fetch new Gmail messages matching GMAIL_QUERY and record the relevant ones.

    Phases:
      1. List all message IDs for the query and drop those already recorded
         in PROCESSED_IDS_FILE.
      2. Fetch the remaining messages in batches; for each one extract the
         text, download PDF attachments, and keep it if ``is_relevant_email``
         accepts it. Processing stops early once more than 25% of attempts
         have failed.
      3. Append relevant emails to OUTPUT_CSV and persist the processed IDs.

    Raises:
        GmailAuthError: if the Gmail service cannot be created.
        HttpError, ssl.SSLError: if listing message IDs keeps failing after
            the bounded retries.
    """
    service = get_gmail_service()
    processed_ids = load_processed_ids(PROCESSED_IDS_FILE)
    newly_processed_ids = set()

    # Phase 1: List message IDs
    logger.info(f"Fetching message IDs for query: {GMAIL_QUERY}")
    messages = _list_message_ids(service, GMAIL_QUERY)
    logger.info(f"Raw messages found: {len(messages)}")

    messages = [m for m in messages if m["id"] not in processed_ids]
    total_to_process = len(messages)
    logger.info(f"New messages to process: {total_to_process}")

    if total_to_process == 0:
        logger.info("No new emails to process.")
        return

    # Phase 2: Batch fetch
    BATCH_SIZE = 15
    MAX_RETRIES = 3
    all_results = []
    error_count = total_attempts = 0

    def batch_callback(request_id: str, response: dict, exception):
        """Handle one message from a batch: extract, classify and record it."""
        # Only the counters are rebound; the list and set are mutated in place.
        nonlocal error_count, total_attempts

        total_attempts += 1
        if exception:
            error_count += 1
            logger.warning(f"Callback error {request_id}: {exception}")
            return

        try:
            payload = response.get("payload", {})
            snippet = response.get("snippet", "")
            message_id = response.get("id", request_id)

            headers = payload.get("headers", [])
            subject = next((h["value"] for h in headers if h["name"].lower() == "subject"), "No Subject")
            sender = next((h["value"] for h in headers if h["name"].lower() == "from"), "Unknown")
            date = next((h["value"] for h in headers if h["name"].lower() == "date"), "Unknown")

            body_text = extract_text_from_payload(payload)
            # NOTE(review): PDFs are downloaded before the relevance check, so
            # attachments of skipped emails are saved too — confirm this is intended.
            pdf_files = download_pdfs_for_message(service, "me", message_id, payload, ATTACHMENTS_DIR)

            if is_relevant_email(sender, subject, snippet, body_text):
                all_results.append({
                    "MessageId": message_id, "Sender": sender, "Subject": subject,
                    "Date": date, "Preview": snippet[:200], "Body": body_text,
                    "PDFs": ";".join(pdf_files), "WordCount": len(body_text.split())
                })
                logger.info(f"RELEVANT: {subject[:60]}...")
            else:
                logger.debug(f"📎 Skipped: {subject[:60]}...")

            # Marked processed only on success, so failures are retried next run.
            newly_processed_ids.add(message_id)

        except Exception as e:
            logger.error(f"Callback processing error {request_id}: {e}")
            error_count += 1

    # Execute batches
    start_time = time.time()
    logger.info(f"Processing {total_to_process} messages in batches of {BATCH_SIZE}...")

    # Ceiling division: an exact multiple of BATCH_SIZE must not count an extra batch.
    total_batches = (total_to_process + BATCH_SIZE - 1) // BATCH_SIZE

    for batch_number, i in enumerate(range(0, total_to_process, BATCH_SIZE), start=1):
        if error_count / max(total_attempts, 1) > 0.25:
            logger.error("Aborting: Error rate >25%")
            break

        batch_msgs = messages[i:i + BATCH_SIZE]
        batch = service.new_batch_http_request(callback=batch_callback)

        for msg in batch_msgs:
            req = service.users().messages().get(userId="me", id=msg["id"], format="full")
            batch.add(req, request_id=msg["id"])

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                batch.execute()
                logger.info(f"Batch {batch_number}/{total_batches}")
                # Small randomized pause between batches.
                time.sleep(2 + random.uniform(0.2, 0.8))
                break
            except Exception as e:
                logger.warning(f"Batch {batch_number} attempt {attempt}: {e}")
                if attempt == MAX_RETRIES:
                    # Count the whole batch as attempted-and-failed so the
                    # error ratio stays meaningful; no point sleeping again.
                    error_count += len(batch_msgs)
                    total_attempts += len(batch_msgs)
                else:
                    time.sleep(5 * attempt)

    # Phase 3: Save results
    if all_results:
        _append_results_to_csv(all_results, OUTPUT_CSV)
        logger.info(f"Saved {len(all_results)} relevant emails to {OUTPUT_CSV}")

    if newly_processed_ids:
        save_processed_ids(PROCESSED_IDS_FILE, newly_processed_ids)

    elapsed = time.time() - start_time
    logger.info(f"COMPLETE: {elapsed:.1f}s | {len(all_results)} relevant | {len(newly_processed_ids)} processed | {error_count}/{total_attempts} errors")


In [None]:
# Run the pipeline: authenticate, fetch new messages, and persist the results.
fetch_emails()