### get documents attached with emails 


In [1]:
from __future__ import print_function
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
import os

In [5]:
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

def gmail_connect():
    
    creds = None

    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
        return build('gmail','v1',credentials = creds)

    else:
        print("file 'token.json' not found in current directory.")

    
service = gmail_connect()
    

In [14]:
def recent_emails(service, maxEmails=5):

    results = service.users().messages().list( userId = 'me' , maxResults = maxEmails).execute()

    messages = results.get('messages',[])

    for message in messages:
        msg = service.users().messages().get( userId = 'me', id=message['id']).execute()

        headers = msg['payload']['headers']
        subject = next(h['value'] for h in headers if h['name'] == 'Subject')
        sender = next(h['value'] for h in headers if h['name'] == 'From')

        print(f'subject:{subject}')
        print(f"from:{sender}")
        print(f"content:{msg.get('snippet','No snippet')}")
        print("-"*60)

    return messages,msg,headers,subject,sender

messages,msg,headers,subject,sender = recent_emails(service)

msg

subject:15th Jan - Trading Holiday on account of Maharashtra Municipal Elections
from:Angel One <campaign@angelbroking.in>
content:Dear Client, This is to inform you that on account of the Maharashtra Municipal Corporation Election scheduled to be held on Thursday, January 15, 2026, the following trading segments will remain
------------------------------------------------------------
subject:Screener.in - Watchlist updates
from:"Screener.in" <no-reply@screener.in>
content:Latest updates Coal India Rumour verification - Regulation 30(11) Updates in your watchlist are emailed every morning. Unsubscribe from these updates.
------------------------------------------------------------
subject:Register of Securities & Funds for week ended Jan 10 2026
from:Angel One <contract.notes@angeltrade.in>
content:Dear Padsala Tirth Jaysukhbhai , With reference to NSE circular no NSE/INSP/47227 dated February 03, 2021, please find attached the Statement of Securities &amp; Funds for the period from Ja

{'id': '19bb40815aa448ee',
 'threadId': '19bb40815aa448ee',
 'labelIds': ['UNREAD', 'CATEGORY_UPDATES', 'INBOX'],
 'snippet': 'Plus: 100 Prompts to Stunning Presentations \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c \u200c',
 'payload': {'partId': '',
  'mimeType': 'multipart/alternative',
  'filename': '',
  'headers': [{'name': 'Delivered-To', 'value': 'tirthpadsala1@gmail.com'},
   {'name': 'Received',
    'value': 'by 2002:a05:6919:6b0e:b0:414:876:1610 with SMTP id dt14csp435843ysc;        

### filter emails with attachments

In [17]:
def attachment_emails(service , maxEmails=10):

    results = service.users().messages().list( userId = 'me' , q="has:attachment" , maxResults = maxEmails).execute()
    print(f"found {results.get('resultSizeEstimate' , 0)} emails with attachment..")

    messages = results.get('messages',[])

    for message in messages:
        msg = service.users().messages().get( userId = 'me', id=message['id'] , format='full').execute()

        headers = msg['payload']['headers']
        subject = next(h['value'] for h in headers if h['name'] == 'Subject')
        sender = next(h['value'] for h in headers if h['name'] == 'From')

        docAttaches = []

        def check_parts(parts):
            attachments=[]

            for part in parts:
                if 'filename' in part['body'] and part['body']['filename']:
                    filename = part['body']['filename']
                    if filename.lower().endswith(('.pdf', '.docx', '.doc')):
                        attachments.append(filename)
                
                if 'parts' in part:
                    attachments.extend(check_parts(part['parts']))
            
            return attachments

        if 'parts' in msg['payload']:
            docAttaches = check_parts(msg['payload']['parts'])

        if docAttaches:

            print("-"*60)
            print(f"from:{sender}")
            print(f"subject:{subject}")
            print(f"Documnets:{','.join(docAttaches)}")
        
    return docAttaches

docAttaches = attachment_emails(service)
docAttaches


found 201 emails with attachment..


[]

In [18]:
import os
import base64

def download_attachments(service, maxEmails=10, download_folder='email_attachments'):
    # Create download folder if it doesn't exist
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    
    results = service.users().messages().list(userId='me', q="has:attachment", maxResults=maxEmails).execute()
    print(f"Found {results.get('resultSizeEstimate', 0)} emails with attachments..")

    messages = results.get('messages', [])
    all_downloaded = []

    for message in messages:
        msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()

        headers = msg['payload']['headers']
        subject = next((h['value'] for h in headers if h['name'] == 'Subject'), 'No Subject')
        sender = next((h['value'] for h in headers if h['name'] == 'From'), 'Unknown')

        def check_and_download_parts(parts, msg_id):
            downloaded = []

            for part in parts:
                if part.get('filename'):
                    filename = part['filename']
                    
                    # Check if it's a document type you want
                    if filename.lower().endswith(('.pdf', '.docx', '.doc')):
                        # Get attachment data
                        if 'attachmentId' in part['body']:
                            attachment = service.users().messages().attachments().get(
                                userId='me',
                                messageId=msg_id,
                                id=part['body']['attachmentId']
                            ).execute()
                            
                            # Decode the attachment data
                            file_data = base64.urlsafe_b64decode(attachment['data'].encode('UTF-8'))
                            
                            # Save the file
                            filepath = os.path.join(download_folder, filename)
                            
                            # Handle duplicate filenames
                            counter = 1
                            base_name, extension = os.path.splitext(filename)
                            while os.path.exists(filepath):
                                filepath = os.path.join(download_folder, f"{base_name}_{counter}{extension}")
                                counter += 1
                            
                            with open(filepath, 'wb') as f:
                                f.write(file_data)
                            
                            downloaded.append({'filename': filename, 'path': filepath})
                            print(f"  ‚úì Downloaded: {filename}")
                
                # Recursively check nested parts
                if 'parts' in part:
                    downloaded.extend(check_and_download_parts(part['parts'], msg_id))
            
            return downloaded

        # Process email parts
        email_attachments = []
        if 'parts' in msg['payload']:
            email_attachments = check_and_download_parts(msg['payload']['parts'], message['id'])
        
        if email_attachments:
            print("-" * 60)
            print(f"From: {sender}")
            print(f"Subject: {subject}")
            all_downloaded.extend(email_attachments)
    
    print("\n" + "=" * 60)
    print(f"Total files downloaded: {len(all_downloaded)}")
    return all_downloaded


downloaded_files = download_attachments(service, maxEmails=10)

# Access the files
for file_info in downloaded_files:
    print(f"File: {file_info['filename']} -> Saved at: {file_info['path']}")



Found 201 emails with attachments..
  ‚úì Downloaded: cloud workshop.pdf
------------------------------------------------------------
From: Vivaan Benrjee <vivaanbenrjee@gmail.com>
Subject: Hi
  ‚úì Downloaded: ROS_ROF_AABZ656274.pdf
------------------------------------------------------------
From: Angel One <contract.notes@angeltrade.in>
Subject: Register of Securities & Funds for week ended Jan 10 2026
  ‚úì Downloaded: AABZ656274.pdf
------------------------------------------------------------
From: BSE ALERTS <info@bseindia.in>
Subject: Funds / Securities Balance
  ‚úì Downloaded: DEC2025_AA90847562_TXN.pdf
------------------------------------------------------------
From: eCAS@cdslstatement.com
Subject: CDSL Consolidated Account Statement (CAS) across Mutual Funds and Depositories for-DEC2025-798624773-1-1
  ‚úì Downloaded: AABZ656274.pdf
------------------------------------------------------------
From: BSE ALERTS <info@bseindia.in>
Subject: Funds / Securities Balance
  ‚úì Down

In [None]:
import os
import base64
from datetime import datetime

def process_emails_with_attachments(service, max_results=5):
    """Process emails and handle attachments"""
    
    # List messages
    results = service.users().messages().list(
        userId='me',
        maxResults=max_results
    ).execute()
    
    messages = results.get('messages', [])
    
    for i, msg_info in enumerate(messages, 1):
        print(f"\n{'='*60}")
        print(f"üìß Processing Email {i}/{len(messages)}")
        print(f"{'='*60}")
        
        # Get full message
        msg = service.users().messages().get(
            userId='me',
            id=msg_info['id'],
            format='full'
        ).execute()
        
        # Extract basic info
        headers = msg['payload']['headers']
        subject = next((h['value'] for h in headers if h['name'] == 'Subject'), 'No Subject')
        sender = next((h['value'] for h in headers if h['name'] == 'From'), 'Unknown')
        
        print(f"Subject: {subject}")
        print(f"From: {sender}")
        
        # Extract body
        body = extract_email_body(msg['payload'])
        print(f"Body preview: {body[:100]}..." if body else "No body")
        
        # Check for attachments
        attachments = []
        
        def find_attachments(parts):
            for part in parts:
                if 'filename' in part['body'] and part['body']['filename']:
                    filename = part['body']['filename']
                    # Check if it's a document file
                    if filename.lower().endswith(('.pdf', '.docx', '.doc', '.xlsx', '.xls', '.pptx', '.ppt')):
                        attachments.append({
                            'filename': filename,
                            'mimeType': part['mimeType'],
                            'size': part['body'].get('size', 0),
                            'attachmentId': part['body'].get('attachmentId')
                        })
                
                if 'parts' in part:
                    find_attachments(part['parts'])
        
        if 'parts' in msg['payload']:
            find_attachments(msg['payload']['parts'])
        
        if attachments:
            print(f"\nüìé Found {len(attachments)} document attachments:")
            for att in attachments:
                print(f"  - {att['filename']} ({att['size']} bytes)")
            
            # Ask if user wants to download
            response = input(f"\nDownload {len(attachments)} attachments? (y/n): ")
            if response.lower() == 'y':
                # Create folder for this email's attachments
                folder_name = f"attachments_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{msg_info['id']}"
                os.makedirs(folder_name, exist_ok=True)
                
                for att in attachments:
                    if att['attachmentId']:
                        download_attachment(service, msg_info['id'], att, folder_name)
        
        else:
            print("No document attachments found")

def extract_email_body(payload):
    """Extract email body text"""
    body = ""
    
    def extract_from_part(part):
        content = ""
        if 'body' in part and 'data' in part['body']:
            try:
                content = base64.urlsafe_b64decode(part['body']['data']).decode('utf-8')
            except:
                pass
        
        if 'parts' in part:
            for p in part['parts']:
                content += extract_from_part(p)
        
        return content
    
    return extract_from_part(payload)

def download_attachment(service, message_id, attachment_info, save_folder):
    """Download a single attachment"""
    try:
        # Get attachment data
        attachment = service.users().messages().attachments().get(
            userId='me',
            messageId=message_id,
            id=attachment_info['attachmentId']
        ).execute()
        
        # Decode and save
        file_data = base64.urlsafe_b64decode(attachment['data'])
        filepath = os.path.join(save_folder, attachment_info['filename'])
        
        with open(filepath, 'wb') as f:
            f.write(file_data)
        
        print(f"‚úÖ Saved: {attachment_info['filename']}")
        return filepath
        
    except Exception as e:
        print(f"‚ùå Failed to download {attachment_info['filename']}: {e}")
        return None

# Run it
process_emails_with_attachments(service, max_results=5)

[{'id': '19bb56469c635fa8', 'threadId': '19bb56469c635fa8'},
 {'id': '19bb54baad98d91b', 'threadId': '19bb54baad98d91b'},
 {'id': '19bb4ae3f5156206', 'threadId': '19bb4ae3f5156206'}]

In [16]:
import base64
from googleapiclient.discovery import build

def get_full_messages(service, max_results=5):
    """Get full content of emails"""
    
    # First, list message IDs
    results = service.users().messages().list(
        userId='me', 
        maxResults=max_results
    ).execute()

    '''results: It returns only: [{ "id": "...", "threadId": "..." }] Not the content yet.'''
    
    messages = results.get('messages', []) # Pulls out the list of message IDs.
    
    full_messages = []
    
    for msg_info in messages: # Each msg_info contains: { "id": "XYZ123" }

        # Get FULL message content

        '''It downloads: Subject Sender Body Attachments MIME structure'''
        msg = service.users().messages().get(
            userId='me', 
            id=msg_info['id'],
            format='full'  
        ).execute()
        
        # Process the full message
        processed_msg = extract_complete_message(msg)
        full_messages.append(processed_msg)
        
        print(f"Email {len(full_messages)}: {processed_msg['subject']}...") # shows proress 
    
    return full_messages

def extract_complete_message(msg):
    """Extract all information from a full message"""
    
    # Extract headers
    '''This contains: From Subject Date To'''
    headers = msg['payload']['headers']

    subject = next((h['value'] for h in headers if h['name'] == 'Subject'), 'No Subject') # get subject
    sender = next((h['value'] for h in headers if h['name'] == 'From'), 'Unknown') # get sender email
    date = next((h['value'] for h in headers if h['name'] == 'Date'), 'No Date') # get date 
    
    # Extract body content
    body_text = ""
    body_html = ""
    
    def extract_from_payload(payload):
        """Recursively extract from payload"""
        text = ""
        html = ""
        
        # Check if this part has a body
        if 'body' in payload and 'data' in payload['body']:
            try:
                decoded = base64.urlsafe_b64decode(payload['body']['data']).decode('utf-8') # Gmail gives body in base64 ‚Üí you convert it back to text.
                if payload.get('mimeType') == 'text/plain':
                    text = decoded
                elif payload.get('mimeType') == 'text/html':
                    html = decoded
                '''Gmail can have: plain text HTML both capture both.'''
            except:
                pass
        
        # Recursively check parts, find the body even if it is nested.
        if 'parts' in payload:
            for part in payload['parts']:
                t, h = extract_from_payload(part)
                text += t
                html += h
        
        return text, html
    
    body_text, body_html = extract_from_payload(msg['payload'])
    
    # If no text body but we have HTML, use HTML
    if not body_text and body_html:
        # Simple HTML to text conversion (remove tags)
        import re
        body_text = re.sub('<[^<]+?>', '', body_html)

    # retrive whole metadata
    return {
        'id': msg['id'],
        'subject': subject,
        'sender': sender,
        'date': date,
        'body_text': body_text,
        'body_html': body_html,
        'snippet': msg.get('snippet', ''),
        'full_message': msg  # Keep the full raw message if needed
    }


def print_full_emails(messages):
    """Print full email content"""
    for i, msg in enumerate(messages, 1):
        print(f"\n{'='*80}")
        print(f" EMAIL {i}/{len(messages)}")
        print(f"{'='*80}")
        print(f"Subject: {msg['subject']}")
        print(f"From: {msg['sender']}")
        print(f"Date: {msg['date']}")
        print(f"\n{'‚îÄ'*80}")
        print("BODY:")
        print(f"{'‚îÄ'*80}")
        
        # Show body text (or snippet if body is empty)
        if msg['body_text']:
            # Truncate if too long
            body_preview = msg['body_text'] + "..." if len(msg['body_text']) > 1000 else msg['body_text']
            print(body_preview)
        else:
            print(msg['snippet'])
        
        # Show character count
        print(f"\nBody length: {len(msg['body_text'])} characters")
        print(f"{'='*80}\n")


messages = get_full_messages(service, max_results=5)
print_full_emails(messages)

Email 1: Hi...
Email 2: 15th Jan - Trading Holiday on account of Maharashtra Municipal Elections...
Email 3: Screener.in - Watchlist updates...
Email 4: Register of Securities & Funds for week ended Jan 10 2026...
Email 5: Funds / Securities Balance...

 EMAIL 1/5
Subject: Hi
From: Vivaan Benrjee <vivaanbenrjee@gmail.com>
Date: Tue, 13 Jan 2026 11:31:39 +0530

‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
BODY:
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
Bojb


Body length: 6 characters


 EMAIL 2/5
Subject: 15th Jan - Trading Holiday on account of Maharashtra Municipal Elections
From: Angel One <camp