# Gmail Query Lab 🧪
Experiment with Gmail search queries to find invoice emails

In [1]:
# Setup imports and authentication
import os
import yaml
from gmail_server import GmailServer
from datetime import datetime, timedelta
import pandas as pd

# Load config
with open("config/config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Initialize Gmail server
gmail_server = GmailServer(
    credentials_file=config["gmail"]["credentials_file"],
    token_file=config["gmail"]["token_file"],
    scopes=config["gmail"]["scopes"],
    config=config,
)

print("✅ Gmail server initialized!")

✅ Gmail server initialized!


## 1. Test Basic Query Statistics

In [2]:
# Check how many emails you have in different time periods
def test_email_counts(days_back_list=[7, 30, 90, 365]):
    results = []

    for days in days_back_list:
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days)

        # Simple query - all emails
        query = f'after:{start_date.strftime("%Y/%m/%d")}'

        try:
            result = (
                gmail_server.service.users()  # type: ignore
                .messages()
                .list(userId="me", q=query, maxResults=1000)
                .execute()
            )

            count = len(result.get("messages", []))
            results.append(
                {
                    "days_back": days,
                    "total_emails": count,
                    "date_from": start_date.strftime("%Y-%m-%d"),
                }
            )

        except Exception as e:
            results.append(
                {
                    "days_back": days,
                    "total_emails": f"Error: {e}",
                    "date_from": start_date.strftime("%Y-%m-%d"),
                }
            )

    return pd.DataFrame(results)


# Run the test
email_counts = test_email_counts()
print("📊 Email counts by time period:")
print(email_counts)

📊 Email counts by time period:
   days_back  total_emails   date_from
0          7           185  2025-08-22
1         30           500  2025-07-30
2         90           500  2025-05-31
3        365           500  2024-08-29


## 2. Test Different Search Queries

In [3]:
# Test various search queries to see what matches
def test_search_queries(days_back=30):
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    date_filter = f'after:{start_date.strftime("%Y/%m/%d")}'

    # Different query variations
    queries = [
        # Original query
        f"{date_filter} (subject:DUNS numbers, and stuff OR subject:räkning OR subject:invoice OR subject:bill OR has:attachment filetype:pdf)",
        # Just Swedish terms
        f"{date_filter} (subject:faktura OR subject:räkning)",
        # Just English terms
        f"{date_filter} (subject:invoice OR subject:bill)",
        # Just PDF attachments
        f"{date_filter} has:attachment filetype:pdf",
        # Broader Swedish terms
        f'{date_filter} (faktura OR räkning OR förfallodag OR "att betala")',
        # Broader English terms
        f'{date_filter} (invoice OR bill OR "payment due" OR "due date")',
        # Common vendors
        f"{date_filter} (from:vattenfall OR from:telia OR from:ica OR from:spotify)",
        # No-reply emails (often automated billing)
        f"{date_filter} from:noreply",
        # Very broad - any email with attachments
        f"{date_filter} has:attachment",
    ]

    query_names = [
        "Original Query",
        "Swedish Only",
        "English Only",
        "PDF Attachments Only",
        "Broad Swedish",
        "Broad English",
        "Common Vendors",
        "No-Reply Emails",
        "Any Attachments",
    ]

    results = []

    for name, query in zip(query_names, queries):
        try:
            result = (
                gmail_server.service.users()  # type: ignore
                .messages()
                .list(userId="me", q=query, maxResults=100)
                .execute()
            )

            count = len(result.get("messages", []))
            results.append(
                {
                    "Query Type": name,
                    "Count": count,
                    "Query": query[:80] + "..." if len(query) > 80 else query,
                }
            )

        except Exception as e:
            results.append(
                {
                    "Query Type": name,
                    "Count": f"Error: {e}",
                    "Query": query[:80] + "..." if len(query) > 80 else query,
                }
            )

    return pd.DataFrame(results)


# Test queries
query_results = test_search_queries(days_back=30)
print("🔍 Search query results (last 30 days):")
query_results

🔍 Search query results (last 30 days):


Unnamed: 0,Query Type,Count,Query
0,Original Query,0,"after:2025/07/30 (subject:DUNS numbers, and st..."
1,Swedish Only,14,after:2025/07/30 (subject:faktura OR subject:r...
2,English Only,0,after:2025/07/30 (subject:invoice OR subject:b...
3,PDF Attachments Only,0,after:2025/07/30 has:attachment filetype:pdf
4,Broad Swedish,27,after:2025/07/30 (faktura OR räkning OR förfal...
5,Broad English,18,"after:2025/07/30 (invoice OR bill OR ""payment ..."
6,Common Vendors,3,after:2025/07/30 (from:vattenfall OR from:teli...
7,No-Reply Emails,100,after:2025/07/30 from:noreply
8,Any Attachments,69,after:2025/07/30 has:attachment


## 3. Examine Sample Emails

In [12]:
# Get sample emails to see what they look like
def get_sample_emails(query, max_samples=5):
    try:
        result = (
            gmail_server.service.users()  # type: ignore
            .messages()
            .list(userId="me", q=query, maxResults=max_samples)
            .execute()
        )

        messages = result.get("messages", [])

        samples = []
        for msg in messages[:max_samples]:
            email_data = gmail_server._get_email_details(msg["id"])
            if email_data:
                samples.append(
                    {
                        "Subject": email_data["subject"][:100],
                        "Sender": email_data["sender"][:50],
                        "Date": email_data["date"],
                        "Body Preview": (
                            email_data["body"][:200] + "..."
                            if email_data["body"]
                            else "No body"
                        ),
                        "Attachments": [
                            att["filename"] for att in email_data.get("attachments", [])
                        ],
                    }
                )

        return pd.DataFrame(samples)

    except Exception as e:
        print(f"Error getting samples: {e}")
        return pd.DataFrame()


# Try different queries to see what emails you have
print("📧 Sample emails with attachments:")
attachment_query = (
    f'after:{(datetime.now() - timedelta(days=30)).strftime("%Y/%m/%d")} has:attachment'
)
attachment_samples = get_sample_emails(attachment_query, max_samples=3)
display(attachment_samples)

📧 Sample emails with attachments:


Unnamed: 0,Subject,Sender,Date,Body Preview,Attachments
0,Sv: CV,Sandra Thelander Svensson <Sandra.Thelander.Sv...,2025-08-27 14:05:40,No body,"[image001.png, image001.png, image001.png]"
1,Christian x HiQ Intervju ☕️,Sandra Thelander Svensson <Sandra.Thelander.Sv...,2025-08-27 13:55:02,"Välkommen till HiQ på intervju, Christian. Du ...",[invite.ics]
2,Christian x HiQ Intervju ☕️,Sandra Thelander Svensson <Sandra.Thelander.Sv...,2025-08-27 13:46:47,"Välkommen till HiQ på intervju, Christian. Du ...",[invite.ics]


In [13]:
# Sample no-reply emails (often automated billing)
print("\n📧 Sample no-reply emails:")
noreply_query = (
    f'after:{(datetime.now() - timedelta(days=30)).strftime("%Y/%m/%d")} from:noreply'
)
noreply_samples = get_sample_emails(noreply_query, max_samples=3)
display(noreply_samples)


📧 Sample no-reply emails:


Unnamed: 0,Subject,Sender,Date,Body Preview,Attachments
0,Nytt ränteråd för brf:er,SEB <noreply@newsletter.seb.se>,2025-08-29 07:43:38,SEB\r\n\r\nFå koll på den senaste utvecklingen...,[]
1,"Publication editors, welcome to your new submi...",Medium Daily Digest <noreply@medium.com>,2025-08-29 05:10:00,Stories for Christian Wahlström\r\n@christian....,[]
2,Påminnelse: Ditt HogiaID med epost christian.w...,Hogia Notification Service <noreply@hogia.se>,2025-08-28 18:00:04,Påminnelse: Vill du fortsätta använda Hogia me...,[]


## 4. Custom Query Testing

In [14]:
# Test your own custom queries here
def test_custom_query(query_text, days_back=30, max_results=10):
    """
    Test a custom Gmail search query

    Examples:
    - 'subject:spotify'
    - 'from:vattenfall.se'
    - 'has:attachment filename:pdf'
    - 'faktura OR räkning'
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)

    # Add date filter
    full_query = f'after:{start_date.strftime("%Y/%m/%d")} {query_text}'

    print(f"🔍 Testing query: {full_query}")

    try:
        result = (
            gmail_server.service.users()  # type: ignore
            .messages()
            .list(userId="me", q=full_query, maxResults=max_results)
            .execute()
        )

        messages = result.get("messages", [])
        print(f"📊 Found {len(messages)} emails")

        if messages:
            # Get details for first few
            samples = []
            for msg in messages[:3]:  # Just first 3
                email_data = gmail_server._get_email_details(msg["id"])
                if email_data:
                    samples.append(
                        {
                            "Subject": email_data["subject"],
                            "Sender": email_data["sender"],
                            "Date": email_data["date"],
                        }
                    )

            return pd.DataFrame(samples)
        else:
            print("No emails found with this query")
            return pd.DataFrame()

    except Exception as e:
        print(f"❌ Error: {e}")
        return pd.DataFrame()


# Try some custom queries
print("Testing custom queries:\n")

# Test 1: Look for specific vendor
print("1. Spotify emails:")
spotify_results = test_custom_query("spotify", days_back=90)
display(spotify_results)
print()

Testing custom queries:

1. Spotify emails:
🔍 Testing query: after:2025/05/31 spotify
📊 Found 10 emails


Unnamed: 0,Subject,Sender,Date
0,"ByteDance passes Meta 📈, Google's manager purg...",TLDR <dan@tldrnewsletter.com>,2025-08-28 10:21:51
1,"New from Tim — ""Dr. Jeffrey Goldberg — Creatin...",Tim Ferriss <tim@fourhourbody.com>,2025-08-26 20:45:17
2,"5-Bullet Friday — Self-Publishing 101, Devices...",Tim Ferriss <tim@fourhourbody.com>,2025-08-23 00:42:15





In [7]:
# Test 2: Look for Swedish keywords in body/subject
print("2. Swedish 'faktura' emails:")
faktura_results = test_custom_query("faktura", days_back=90)
# print(faktura_results)
# print()
faktura_results

2. Swedish 'faktura' emails:
🔍 Testing query: after:2025/05/31 faktura
📊 Found 10 emails


Unnamed: 0,Subject,Sender,Date
0,Du har fått en ny faktura från Bangerhead.se,Qliro <noreply@e.qliro.com>,2025-08-28 12:13:57
1,Din faktura från Apple,Apple <no_reply@email.apple.com>,2025-08-23 11:50:30
2,Ditt kvitto från Glimra,no-reply@glimra.com,2025-08-21 13:29:28


In [8]:
# Test 3: Your own custom query
print("3. Custom query - edit this cell:")
# Edit the query below to test what you want:
custom_query = "subject:bill OR subject:payment"
custom_results = test_custom_query(custom_query, days_back=60)
print(custom_results)

3. Custom query - edit this cell:
🔍 Testing query: after:2025/06/30 subject:bill OR subject:payment
📊 Found 7 emails
                                      Subject                       Sender  \
0        Receipt for Your Payment to Temu.com   PayPal <service@paypal.se>   
1  [GitHub] Payment Receipt for userchallenge  GitHub <noreply@github.com>   
2      Receipt for Your Payment to DisneyPlus   PayPal <service@paypal.se>   

                  Date  
0  2025-08-24 06:02:53  
1  2025-08-03 11:26:02  
2  2025-08-03 11:06:51  


## 5. Build Better Query

In [16]:
# Based on your findings above, build a better query
def build_optimized_query(days_back=30):
    """
    Build an optimized query based on what we learned
    Edit this function based on your findings above
    """

    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    date_filter = f'after:{start_date.strftime("%Y/%m/%d")}'

    # Build query components based on what you found
    query_parts = [
        # Add the most effective search terms you found above
        # "subject:faktura",
        # "subject:räkning",
        # "subject:invoice",
        # "subject:bill",
        "faktura",
        "räkning",
        "invoice",
        "bill",
        # Add specific vendors if you found them
        # 'from:spotify',
        # 'from:vattenfall',
        # Add other patterns you discovered
    ]

    # Combine with OR
    search_terms = " OR ".join(query_parts)
    optimized_query = f"{date_filter} ({search_terms})"
    # optimized_query = "after:2025/06/25 (subject:madinter OR räkning OR invoice OR bill OR has:attachment filename:pdf)"
    # optimized_query = "after:2025/05/26 (faktura OR räkning OR invoice OR bill)"
    optimized_query = "after:2025/07/24 ((subject:konsert OR konsert) OR (subject:spelning OR spelning) OR (subject:live OR live) OR (subject:scen OR scen) OR (subject:biljett OR biljett) OR (subject:köp din biljett OR köp din biljett) OR (subject:säkra din biljett OR säkra din biljett) OR (subject:intar scenen OR intar scenen) OR (subject:återvänder OR återvänder) OR (subject:kommer till OR kommer till) OR (subject:concert OR concert) OR (subject:show OR show) OR (subject:performance OR performance) OR (subject:tickets OR tickets) OR (subject:buy tickets OR buy tickets) OR (subject:on stage OR on stage) OR (subject:live at OR live at) OR filename:pdf)"

    print(f"🎯 Optimized query: {optimized_query}")

    # Test it
    try:
        result = (
            gmail_server.service.users()  # type: ignore
            .messages()
            .list(userId="me", q=optimized_query, maxResults=20)
            .execute()
        )

        messages = result.get("messages", [])
        print(f"📊 Found {len(messages)} emails with optimized query")

        return optimized_query, messages

    except Exception as e:
        print(f"❌ Error with optimized query: {e}")
        return optimized_query, []


# Test the optimized query
optimized_query, found_messages = build_optimized_query(days_back=60)

if found_messages:
    print("\n📧 Sample results from optimized query:")
    sample_results = get_sample_emails(optimized_query, max_samples=30)
    print(sample_results)
else:
    print("\n❌ No emails found with optimized query - try adjusting the search terms")

display(sample_results)

🎯 Optimized query: after:2025/07/24 ((subject:konsert OR konsert) OR (subject:spelning OR spelning) OR (subject:live OR live) OR (subject:scen OR scen) OR (subject:biljett OR biljett) OR (subject:köp din biljett OR köp din biljett) OR (subject:säkra din biljett OR säkra din biljett) OR (subject:intar scenen OR intar scenen) OR (subject:återvänder OR återvänder) OR (subject:kommer till OR kommer till) OR (subject:concert OR concert) OR (subject:show OR show) OR (subject:performance OR performance) OR (subject:tickets OR tickets) OR (subject:buy tickets OR buy tickets) OR (subject:on stage OR on stage) OR (subject:live at OR live at) OR filename:pdf)
📊 Found 20 emails with optimized query

📧 Sample results from optimized query:
                                              Subject  \
0   ~ 4 unique approaches to using diptychs in pho...   
1   Påminnelse: Ditt HogiaID med epost christian.w...   
2            Taylor Swift förlovad – så dyr är ringen   
3     BMW Neue Klasse 2025 – en ny e

Unnamed: 0,Subject,Sender,Date,Body Preview,Attachments
0,~ 4 unique approaches to using diptychs in pho...,LensCulture Weekly <info@lensculture.com>,2025-08-29 03:23:18,3 new bodies of work plus one from the archive...,[]
1,Påminnelse: Ditt HogiaID med epost christian.w...,Hogia Notification Service <noreply@hogia.se>,2025-08-28 18:00:04,Påminnelse: Vill du fortsätta använda Hogia me...,[]
2,Taylor Swift förlovad – så dyr är ringen,GAFFA Sweden <noreply@gaffa.se>,2025-08-28 15:12:50,"Hej,\r\n\r\nDu har fått ett nyhetsbrev från GA...",[]
3,BMW Neue Klasse 2025 – en ny epok av mobilitet.,BMW Sverige <noreply@nordics.bmwgroup.com>,2025-08-28 16:10:23,Elektrisk. Digital. Innovativ. \t\r\n\t\r\n\r\...,[]
4,"ByteDance passes Meta 📈, Google's manager purg...",TLDR <dan@tldrnewsletter.com>,2025-08-28 10:21:51,ByteDance is set to launch a new employee shar...,[]
5,A better way to capture and organize information,Notion Team <team@mail.notion.so>,2025-08-27 15:03:50,( https://www.notion.so/?utm_campaign=%5BUpdat...,[]
6,"Starship 10 🚀, Google's nano banana 🍌, uv for ...",TLDR <dan@tldrnewsletter.com>,2025-08-27 10:18:57,SpaceX's Starship made it all the way up to sp...,[]
7,Companies like Softronic and others in your ne...,LinkedIn <jobs-listings@linkedin.com>,2025-08-27 10:04:31,Hiring in your network\r\n\r\n\r\nExplore rele...,[]
8,"New from Tim — ""Dr. Jeffrey Goldberg — Creatin...",Tim Ferriss <tim@fourhourbody.com>,2025-08-26 20:45:17,a {text-decoration: none;}\r\n\r\n​\r\n\r\n***...,[]
9,Re: CV,"""Christian Wahlström"" <christian.wahlstrom@gma...",2025-08-26 20:42:25,No body,"[image001.png, Christian Wahlström - CV.pdf]"


In [10]:
sample_results

Unnamed: 0,Subject,Sender,Date,Body Preview,Attachments
0,~ 4 unique approaches to using diptychs in pho...,LensCulture Weekly <info@lensculture.com>,2025-08-29 03:23:18,3 new bodies of work plus one from the archive...,[]
1,Påminnelse: Ditt HogiaID med epost christian.w...,Hogia Notification Service <noreply@hogia.se>,2025-08-28 18:00:04,Påminnelse: Vill du fortsätta använda Hogia me...,[]
2,Taylor Swift förlovad – så dyr är ringen,GAFFA Sweden <noreply@gaffa.se>,2025-08-28 15:12:50,"Hej,\r\n\r\nDu har fått ett nyhetsbrev från GA...",[]
3,BMW Neue Klasse 2025 – en ny epok av mobilitet.,BMW Sverige <noreply@nordics.bmwgroup.com>,2025-08-28 16:10:23,Elektrisk. Digital. Innovativ. \t\r\n\t\r\n\r\...,[]
4,"ByteDance passes Meta 📈, Google's manager purg...",TLDR <dan@tldrnewsletter.com>,2025-08-28 10:21:51,ByteDance is set to launch a new employee shar...,[]
5,A better way to capture and organize information,Notion Team <team@mail.notion.so>,2025-08-27 15:03:50,( https://www.notion.so/?utm_campaign=%5BUpdat...,[]
6,"Starship 10 🚀, Google's nano banana 🍌, uv for ...",TLDR <dan@tldrnewsletter.com>,2025-08-27 10:18:57,SpaceX's Starship made it all the way up to sp...,[]
7,Companies like Softronic and others in your ne...,LinkedIn <jobs-listings@linkedin.com>,2025-08-27 10:04:31,Hiring in your network\r\n\r\n\r\nExplore rele...,[]
8,"New from Tim — ""Dr. Jeffrey Goldberg — Creatin...",Tim Ferriss <tim@fourhourbody.com>,2025-08-26 20:45:17,a {text-decoration: none;}\r\n\r\n​\r\n\r\n***...,[]
9,Re: CV,"""Christian Wahlström"" <christian.wahlstrom@gma...",2025-08-26 20:42:25,No body,"[image001.png, Christian Wahlström - CV.pdf]"


## 6. Update Your Config

In [11]:
# Once you find a good query, you can update the Gmail search in your demo.py
print("💡 To update your demo.py with a better query:")
print("")
print("1. Open gmail_server.py")
print("2. Find the 'fetch_emails' method")
print("3. Update the 'query' variable around line 45")
print("")
print("Current query:")
current_query = "f'after:{start_date.strftime(\"%Y/%m/%d\")} (subject:faktura OR subject:räkning OR subject:invoice OR subject:bill OR has:attachment filetype:pdf)'"
print(current_query)
print("")
print("Suggested replacement (edit based on your findings):")
if "optimized_query" in locals():
    # Extract just the search part (without the date)
    search_part = (
        optimized_query.split(" ", 1)[1] if " " in optimized_query else optimized_query
    )
    suggested = f"f'after:{{start_date.strftime(\"%Y/%m/%d\")}} {search_part}'"
    print(suggested)
else:
    print(
        "f'after:{start_date.strftime(\"%Y/%m/%d\")} (YOUR_OPTIMIZED_SEARCH_TERMS_HERE)'"
    )

💡 To update your demo.py with a better query:

1. Open gmail_server.py
2. Find the 'fetch_emails' method
3. Update the 'query' variable around line 45

Current query:
f'after:{start_date.strftime("%Y/%m/%d")} (subject:faktura OR subject:räkning OR subject:invoice OR subject:bill OR has:attachment filetype:pdf)'

Suggested replacement (edit based on your findings):
f'after:{start_date.strftime("%Y/%m/%d")} ((subject:konsert OR konsert) OR (subject:spelning OR spelning) OR (subject:live OR live) OR (subject:scen OR scen) OR (subject:biljett OR biljett) OR (subject:köp din biljett OR köp din biljett) OR (subject:säkra din biljett OR säkra din biljett) OR (subject:intar scenen OR intar scenen) OR (subject:återvänder OR återvänder) OR (subject:kommer till OR kommer till) OR (subject:concert OR concert) OR (subject:show OR show) OR (subject:performance OR performance) OR (subject:tickets OR tickets) OR (subject:buy tickets OR buy tickets) OR (subject:on stage OR on stage) OR (subject:live at

## 7. Next Steps

Based on your experiments:

1. **If you found emails**: Update the query in `gmail_server.py` and run `python demo.py`
2. **If no invoice emails found**: Try broader time ranges (`--days-back 90` or `--days-back 365`)
3. **If you have invoices but different format**: Update the keywords in `config/config.yaml`
4. **Test with your best query**: Run `python demo.py --days-back 90 --verbose`