<a href="https://colab.research.google.com/github/trakesh15/gmail-automation/blob/main/Gmail_PDF_Attachment_Reader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import base64
import os
import io
# Using the standard library for email parsing is essential when handling raw MIME data
from email.mime.multipart import MIMEMultipart
from email import message_from_bytes

# --- Mock Imports for Google API and PDF Library ---
# In a real setup, you would install these:
# pip install google-api-python-client google-auth-oauthlib PyPDF2
# from google.oauth2.credentials import Credentials
# from googleapiclient.discovery import build
# from PyPDF2 import PdfReader

# --- CONFIGURATION ---
# Sender address used to build the Gmail search query ("from:ADDRESS").
SENDER_FILTER = "report_sender@example.com"
# Reads the core subject filter from the EMAIL_SUBJECT_CORE environment variable so it
# can be changed without editing the code. Falls back to "WTR Report" if the variable
# is not set. It is matched as a case-insensitive substring of each email's subject.
SUBJECT_FILTER_CORE = os.environ.get("EMAIL_SUBJECT_CORE", "WTR Report")
PDF_FILE_NAME_PATTERN = "Akira Reddy Tekulapally" # The attachment filename must contain this string (compared case-insensitively)

# OAuth scope for the Gmail API. gmail.readonly only permits reading messages;
# modifying them (e.g. removing the UNREAD label) would need a broader scope
# such as gmail.modify.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
API_KEY = "" # Leave empty, Canvas will inject the key. NOTE(review): not referenced anywhere in the visible code.

def authenticate_gmail():
    """
    Return a Gmail API service object (currently an in-memory mock).

    NOTE: In a real-world scenario, you must replace this with the full OAuth 2.0 flow
    using client_secrets.json and stored tokens. A real implementation would:
      1. Load credentials (token.json).
      2. Create the service object: service = build('gmail', 'v1', credentials=creds)
      3. Return the 'service' object.

    The mock supports the two call chains used in this file:
      service.users().messages().list(userId=..., q=...).execute()
      service.users().messages().get(userId=..., id=..., format=...).execute()

    Returns:
        A MockService instance. ``list(...).execute()`` yields one message id;
        ``get(..., format='raw').execute()`` yields that message with a
        base64url-encoded ``raw`` field containing a sample email with one
        PDF attachment.
    """
    # Imported locally: only the mock needs these MIME builders.
    from email.mime.application import MIMEApplication
    from email.mime.text import MIMEText

    print("--- 1. Authenticating Gmail API ---")

    def _sample_raw_message():
        """Build the sample email and return it base64url-encoded, like Gmail's 'raw' field."""
        # A fixed boundary keeps the generated bytes deterministic between calls.
        message = MIMEMultipart(boundary="mock-gmail-boundary")
        # Sample values mirror this file's default filter configuration.
        message['From'] = "report_sender@example.com"
        message['To'] = "me@example.com"
        message['Subject'] = "Weekly WTR Report"
        message.attach(MIMEText("Please find the report attached.", "plain"))

        pdf_part = MIMEApplication(b"%PDF-1.4\n% mock PDF payload\n%%EOF\n", _subtype="pdf")
        pdf_part.add_header(
            "Content-Disposition",
            "attachment",
            filename="Akira Reddy Tekulapally - WTR.pdf",
        )
        message.attach(pdf_part)
        return base64.urlsafe_b64encode(message.as_bytes()).decode("ascii")

    class MockService:
        """Mock class to simulate the Google API service structure."""

        _MESSAGE_ID = '17f3a2b1c4d5e6f7'

        def __init__(self):
            # Response handed out by the next execute() call. Defaults to the
            # list response so a bare execute() behaves as it did before.
            self._response = {'messages': [{'id': self._MESSAGE_ID}]}

        def users(self):
            return self

        def messages(self):
            return self

        # Parameter names below (userId, id, format) mirror the Gmail API keyword
        # arguments that callers pass by name, so they must not be renamed.
        def list(self, userId, q):
            print(f"   [API Mock] Querying: User='{userId}', Search='{q}'")
            # Simulate finding one message ID matching the criteria
            self._response = {'messages': [{'id': self._MESSAGE_ID}]}
            return self

        def get(self, userId, id, format):
            print(f"   [API Mock] Fetching message ID: {id} (Format: {format})")
            response = {'id': id}
            if format == 'raw':
                # Callers decode this field with base64.urlsafe_b64decode.
                response['raw'] = _sample_raw_message()
            self._response = response
            return self

        def execute(self):
            # Return the response staged by the most recent list()/get() call,
            # so a get() no longer receives the list payload (missing 'raw').
            return self._response

    return MockService()

def search_and_fetch_attachments(service, user_id='me'):
    """
    Find matching emails and return the text extracted from their PDF attachments.

    The Gmail search is restricted to SENDER_FILTER only; the subject is then
    checked in Python so SUBJECT_FILTER_CORE can match anywhere in the subject
    (case-insensitively). For each matching email, every 'application/pdf'
    part whose filename contains PDF_FILE_NAME_PATTERN (case-insensitively)
    is handed to read_pdf_content().

    Messages are parsed with ``email.policy.default`` so that RFC 2047
    encoded-word headers (for example ``=?UTF-8?B?...?=`` subjects or
    attachment filenames) are decoded before the filters are applied.

    Args:
        service: Gmail API service object (or compatible mock) exposing
            users().messages().list(...) and users().messages().get(...).
        user_id: Gmail user id to query; 'me' is the Gmail API alias for the
            authenticated user.

    Returns:
        A list with one extracted-text string per matching PDF attachment.
        An empty list is returned when nothing matches, and also when any
        exception is raised (the error is printed, not re-raised).

    NOTE(review): list() is called once, so only the first page of results is
    processed; a nextPageToken in the response is ignored.
    """
    # Imported locally so this function does not rely on a new file-level import.
    from email import policy

    # Use a broader search query (Sender only) and filter the subject in Python.
    query = f"from:{SENDER_FILTER}"

    try:
        # 1. Search for messages matching the criteria (Sender only)
        response = service.users().messages().list(userId=user_id, q=query).execute()
        messages = response.get('messages', [])

        if not messages:
            print(f"   No new emails found matching the sender criteria: '{SENDER_FILTER}'")
            return []

        print(f"   Found {len(messages)} email(s) from the sender. Now checking subject line...")
        pdf_texts = []
        # Lower-cased once; both filters are case-insensitive substring checks.
        subject_needle = SUBJECT_FILTER_CORE.lower()
        filename_needle = PDF_FILE_NAME_PATTERN.lower()

        for msg in messages:
            msg_id = msg['id']
            # 2. Fetch the raw email content (base64url-encoded RFC 2822 message)
            message_data = service.users().messages().get(userId=user_id, id=msg_id, format='raw').execute()
            raw_email_bytes = base64.urlsafe_b64decode(message_data['raw'])

            # 3. Parse the raw email content. policy.default decodes encoded-word
            # headers; the legacy default policy would leave them in encoded form
            # and the substring filters below would never match them.
            email_message = message_from_bytes(raw_email_bytes, policy=policy.default)

            # --- Check for Partial Subject Match Programmatically ---
            subject = str(email_message.get('Subject', ''))

            # Check if the core subject filter is contained anywhere in the email's subject
            if subject_needle not in subject.lower():
                print(f"   Skipping message {msg_id}: Subject '{subject}' does not contain core filter '{SUBJECT_FILTER_CORE}'")
                continue  # Skip to the next message

            print(f"   Subject '{subject}' matches core filter. Processing attachments...")

            # 4. Iterate through parts to find attachments
            for part in email_message.walk():
                file_name = part.get_filename()

                # Only named PDF parts whose filename matches the pattern qualify.
                if not file_name:
                    continue
                if part.get_content_type() != 'application/pdf':
                    continue
                if filename_needle not in file_name.lower():
                    continue

                print(f"   Found PDF attachment: {file_name}")

                # Extract the raw data (transfer encoding such as base64 is undone)
                attachment_data = part.get_payload(decode=True)

                # 5. Read the PDF content
                pdf_texts.append(read_pdf_content(file_name, attachment_data))

                # Marking the email as processed is optional. It would need a scope
                # that allows modification (SCOPES in this file is gmail.readonly), e.g.:
                # service.users().messages().modify(userId='me', id=msg_id, body={'removeLabelIds': ['UNREAD']}).execute()

        return pdf_texts

    except Exception as e:
        # Top-level boundary: report the problem and fall back to "nothing extracted".
        print(f"An error occurred during search or fetch: {e}")
        return []

def read_pdf_content(file_name, data):
    """
    Return the text of a PDF attachment (mocked: the bytes are not parsed).

    Args:
        file_name: Attachment filename; echoed in the log line and in the
            returned mock text.
        data: Binary PDF content. Unused by the mock.

    Returns:
        A fixed three-line placeholder string that embeds ``file_name``.
    """
    print(f"   [PDF Processor] Attempting to read content from {file_name}...")

    # A real implementation would parse `data` with PyPDF2, roughly:
    #   reader = PdfReader(io.BytesIO(data))
    #   return "".join(page.extract_text() + "\n" for page in reader.pages)

    # --- Mocking the PDF reading result ---
    mock_lines = [
        "**MOCK PDF CONTENT**",
        f"This text was successfully extracted from the attachment '{file_name}'.",
        "Key data points: Total Revenue $1,500,000. Q3 Expenses $450,000.",
    ]
    return "\n".join(mock_lines)

def main():
    """Run the automation end to end and print any extracted PDF text."""
    service = authenticate_gmail()
    if not service:
        # No service object available: nothing to do.
        return

    pdf_texts = search_and_fetch_attachments(service)
    if not pdf_texts:
        print(f"\nAutomation finished. No PDF content was extracted based on the SENDER filter '{SENDER_FILTER}', core SUBJECT filter '{SUBJECT_FILTER_CORE}', and PDF file name pattern '{PDF_FILE_NAME_PATTERN}'.")
        return

    print("\n--- Extracted Content Summary ---")
    # Numbering is 1-based for display.
    for position, text in enumerate(pdf_texts, start=1):
        print(f"\n--- PDF Content from Email {position} ---")
        print(text)
        print("---------------------------------")

# Retry logic for API calls (using exponential backoff)
import time
def execute_with_backoff(func, *args, max_retries=5, initial_delay=1, **kwargs):
    """
    Call ``func(*args, **kwargs)``, retrying with exponential backoff on failure.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        max_retries: Maximum total number of attempts, including the first
            one. Must be at least 1.
        initial_delay: Seconds to sleep after the first failed attempt; the
            delay doubles after every further failure.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on its first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1. Previously this case
            silently returned None without ever calling ``func``.
        Exception: The exception from the last attempt is re-raised once all
            attempts have failed.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    delay = initial_delay
    for attempt in range(1, max_retries + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Every exception type is treated as retryable; intermediate
            # failures are retried silently and only the final one is reported.
            if attempt == max_retries:
                print(f"Final attempt failed. Giving up. Error: {e}")
                raise
            time.sleep(delay)
            delay *= 2

# When using the real API, route each call through the retry helper above
# (execute_with_backoff) instead of calling it directly, and guard the entry
# point so that importing this file does not run the automation:
# if __name__ == '__main__':
#     main()

# For the mock environment, we call main directly (this runs on import/cell execution):
main()