# Gmail Query Lab 🧪
Experiment with Gmail search queries to find invoice emails

In [None]:
# Setup imports and authentication
import os
import yaml
from gmail_server import GmailServer
from datetime import datetime, timedelta
import pandas as pd

# Load config
with open("config/config.yaml", "r") as f:
    config = yaml.safe_load(f)

# Initialize Gmail server
gmail_server = GmailServer(
    credentials_file=config["gmail"]["credentials_file"],
    token_file=config["gmail"]["token_file"],
    scopes=config["gmail"]["scopes"],
    config=config,
)

print("✅ Gmail server initialized!")

## 1. Test Basic Query Statistics

In [None]:
# Check how many emails you have in different time periods
def test_email_counts(days_back_list=[7, 30, 90, 365]):
    results = []

    for days in days_back_list:
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days)

        # Simple query - all emails
        query = f'after:{start_date.strftime("%Y/%m/%d")}'

        try:
            result = (
                gmail_server.service.users()  # type: ignore
                .messages()
                .list(userId="me", q=query, maxResults=1000)
                .execute()
            )

            count = len(result.get("messages", []))
            results.append(
                {
                    "days_back": days,
                    "total_emails": count,
                    "date_from": start_date.strftime("%Y-%m-%d"),
                }
            )

        except Exception as e:
            results.append(
                {
                    "days_back": days,
                    "total_emails": f"Error: {e}",
                    "date_from": start_date.strftime("%Y-%m-%d"),
                }
            )

    return pd.DataFrame(results)


# Run the test
email_counts = test_email_counts()
print("📊 Email counts by time period:")
print(email_counts)

📊 Email counts by time period:
   days_back  total_emails   date_from
0          7           164  2025-07-18
1         30           500  2025-06-25
2         90           500  2025-04-26
3        365           500  2024-07-25


## 2. Test Different Search Queries

In [None]:
# Test various search queries to see what matches
def test_search_queries(days_back=30):
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    date_filter = f'after:{start_date.strftime("%Y/%m/%d")}'

    # Different query variations
    queries = [
        # Original query
        f"{date_filter} (subject:DUNS numbers, and stuff OR subject:räkning OR subject:invoice OR subject:bill OR has:attachment filetype:pdf)",
        # Just Swedish terms
        f"{date_filter} (subject:faktura OR subject:räkning)",
        # Just English terms
        f"{date_filter} (subject:invoice OR subject:bill)",
        # Just PDF attachments
        f"{date_filter} has:attachment filetype:pdf",
        # Broader Swedish terms
        f'{date_filter} (faktura OR räkning OR förfallodag OR "att betala")',
        # Broader English terms
        f'{date_filter} (invoice OR bill OR "payment due" OR "due date")',
        # Common vendors
        f"{date_filter} (from:vattenfall OR from:telia OR from:ica OR from:spotify)",
        # No-reply emails (often automated billing)
        f"{date_filter} from:noreply",
        # Very broad - any email with attachments
        f"{date_filter} has:attachment",
    ]

    query_names = [
        "Original Query",
        "Swedish Only",
        "English Only",
        "PDF Attachments Only",
        "Broad Swedish",
        "Broad English",
        "Common Vendors",
        "No-Reply Emails",
        "Any Attachments",
    ]

    results = []

    for name, query in zip(query_names, queries):
        try:
            result = (
                gmail_server.service.users()  # type: ignore
                .messages()
                .list(userId="me", q=query, maxResults=100)
                .execute()
            )

            count = len(result.get("messages", []))
            results.append(
                {
                    "Query Type": name,
                    "Count": count,
                    "Query": query[:80] + "..." if len(query) > 80 else query,
                }
            )

        except Exception as e:
            results.append(
                {
                    "Query Type": name,
                    "Count": f"Error: {e}",
                    "Query": query[:80] + "..." if len(query) > 80 else query,
                }
            )

    return pd.DataFrame(results)


# Test queries
query_results = test_search_queries(days_back=30)
print("🔍 Search query results (last 30 days):")
query_results

🔍 Search query results (last 30 days):


Unnamed: 0,Query Type,Count,Query
0,Original Query,0,"after:2025/06/25 (subject:DUNS numbers, and st..."
1,Swedish Only,13,after:2025/06/25 (subject:faktura OR subject:r...
2,English Only,3,after:2025/06/25 (subject:invoice OR subject:b...
3,PDF Attachments Only,0,after:2025/06/25 has:attachment filetype:pdf
4,Broad Swedish,27,after:2025/06/25 (faktura OR räkning OR förfal...
5,Broad English,22,"after:2025/06/25 (invoice OR bill OR ""payment ..."
6,Common Vendors,8,after:2025/06/25 (from:vattenfall OR from:teli...
7,No-Reply Emails,100,after:2025/06/25 from:noreply
8,Any Attachments,36,after:2025/06/25 has:attachment


## 3. Examine Sample Emails

In [None]:
# Get sample emails to see what they look like
def get_sample_emails(query, max_samples=5):
    try:
        result = (
            gmail_server.service.users()  # type: ignore
            .messages()
            .list(userId="me", q=query, maxResults=max_samples)
            .execute()
        )

        messages = result.get("messages", [])

        samples = []
        for msg in messages[:max_samples]:
            email_data = gmail_server._get_email_details(msg["id"])
            if email_data:
                samples.append(
                    {
                        "Subject": email_data["subject"][:100],
                        "Sender": email_data["sender"][:50],
                        "Date": email_data["date"],
                        "Body Preview": (
                            email_data["body"][:200] + "..."
                            if email_data["body"]
                            else "No body"
                        ),
                        "Attachments": [
                            att["filename"] for att in email_data.get("attachments", [])
                        ],
                    }
                )

        return pd.DataFrame(samples)

    except Exception as e:
        print(f"Error getting samples: {e}")
        return pd.DataFrame()


# Try different queries to see what emails you have
print("📧 Sample emails with attachments:")
attachment_query = (
    f'after:{(datetime.now() - timedelta(days=30)).strftime("%Y/%m/%d")} has:attachment'
)
attachment_samples = get_sample_emails(attachment_query, max_samples=3)
print(attachment_samples)

📧 Sample emails with attachments:
                                             Subject  \
0              Bokningsbekräftelse: GolfStar Sverige   
1  Kvitto på din beställning E18142269 från Syste...   
2   Your receipt from Anthropic, PBC #2030-5704-3161   

                                              Sender                 Date  \
0           GolfStar Sverige <no-reply@sweetspot.io>  2025-07-24 15:20:18   
1  "no-reply@systembolaget.se" <no-reply@systembo...  2025-07-24 14:41:22   
2  "Anthropic, PBC" <invoice+statements@mail.anth...  2025-07-24 12:54:20   

                                        Body Preview  \
0  <!doctype html>\r\n<html lang="en" dir="auto" ...   
1  <meta http-equiv="Content-Type" content="text/...   
2                                            No body   

                                         Attachments  
0                                        [event.ics]  
1         [Försäljningskvitto 2025-07-24 044122.pdf]  
2  [Invoice-72E2DW43-0001.pdf, Receipt-203

In [5]:
# Sample no-reply emails (often automated billing)
print("\n📧 Sample no-reply emails:")
noreply_query = (
    f'after:{(datetime.now() - timedelta(days=30)).strftime("%Y/%m/%d")} from:noreply'
)
noreply_samples = get_sample_emails(noreply_query, max_samples=3)
print(noreply_samples)


📧 Sample no-reply emails:
                                     Subject  \
0      Idag sker din månadsbetalning, fello.   
1  “consultant”: Influence Tech AB is hiring   
2        “project manager”: Joinrs is hiring   

                                              Sender                 Date  \
0                           Fello <noreply@fello.se>  2025-07-25 13:36:52   
1  LinkedIn Job Alerts <jobalerts-noreply@linkedi...  2025-07-25 09:43:56   
2  LinkedIn Job Alerts <jobalerts-noreply@linkedi...  2025-07-25 07:44:31   

                                        Body Preview Attachments  
0  [Visa i webläsare](https://r.fellos.se/tr/mr/C...          []  
1  Your job alert for consultant in Greater Stock...          []  
2  Your job alert for project manager in Greater ...          []  


## 4. Custom Query Testing

In [None]:
# Test your own custom queries here
def test_custom_query(query_text, days_back=30, max_results=10):
    """
    Test a custom Gmail search query

    Examples:
    - 'subject:spotify'
    - 'from:vattenfall.se'
    - 'has:attachment filename:pdf'
    - 'faktura OR räkning'
    """
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)

    # Add date filter
    full_query = f'after:{start_date.strftime("%Y/%m/%d")} {query_text}'

    print(f"🔍 Testing query: {full_query}")

    try:
        result = (
            gmail_server.service.users()  # type: ignore
            .messages()
            .list(userId="me", q=full_query, maxResults=max_results)
            .execute()
        )

        messages = result.get("messages", [])
        print(f"📊 Found {len(messages)} emails")

        if messages:
            # Get details for first few
            samples = []
            for msg in messages[:3]:  # Just first 3
                email_data = gmail_server._get_email_details(msg["id"])
                if email_data:
                    samples.append(
                        {
                            "Subject": email_data["subject"],
                            "Sender": email_data["sender"],
                            "Date": email_data["date"],
                        }
                    )

            return pd.DataFrame(samples)
        else:
            print("No emails found with this query")
            return pd.DataFrame()

    except Exception as e:
        print(f"❌ Error: {e}")
        return pd.DataFrame()


# Try some custom queries
print("Testing custom queries:\n")

# Test 1: Look for specific vendor
print("1. Spotify emails:")
spotify_results = test_custom_query("spotify", days_back=90)
print(spotify_results)
print()

Testing custom queries:

1. Spotify emails:
🔍 Testing query: after:2025/04/26 spotify
📊 Found 10 emails
                                             Subject  \
0  Du har väl inte missat att vi släppt fler enda...   
1  Säsongsöppning, inbjudan, premiärer och andra ...   
2  Danko Jones och Jakob Hellman live: anpassade ...   

                           Sender                 Date  
0       Debaser <info@debaser.se>  2025-07-25 13:09:16  
1    Nalen <nyhetsbrev@nalen.com>  2025-07-25 06:32:23  
2  Spotify <no-reply@spotify.com>  2025-07-24 19:56:29  



In [7]:
# Test 2: Look for Swedish keywords in body/subject
print("2. Swedish 'faktura' emails:")
faktura_results = test_custom_query("faktura", days_back=90)
# print(faktura_results)
# print()
faktura_results

2. Swedish 'faktura' emails:
🔍 Testing query: after:2025/04/26 faktura
📊 Found 10 emails


Unnamed: 0,Subject,Sender,Date
0,Din faktura från Apple,Apple <no_reply@email.apple.com>,2025-07-23 12:38:55
1,[Reservdelaronline] Skickad,Reservdelaronline <support@reservdelaronline.se>,2025-07-21 13:09:37
2,[Reservdelaronline] Orderbekräftelse,Reservdelaronline <support@reservdelaronline.se>,2025-07-20 11:31:10


In [8]:
# Test 3: Your own custom query
print("3. Custom query - edit this cell:")
# Edit the query below to test what you want:
custom_query = "subject:bill OR subject:payment"
custom_results = test_custom_query(custom_query, days_back=60)
print(custom_results)

3. Custom query - edit this cell:
🔍 Testing query: after:2025/05/26 subject:bill OR subject:payment
📊 Found 7 emails
                                      Subject                       Sender  \
0      Receipt for Your Payment to DisneyPlus   PayPal <service@paypal.se>   
1  [GitHub] Payment Receipt for userchallenge  GitHub <noreply@github.com>   
2    Receipt for Your Payment to GitHub, Inc.   PayPal <service@paypal.se>   

                  Date  
0  2025-07-03 11:09:53  
1  2025-07-03 10:54:34  
2  2025-07-03 10:17:20  


## 5. Build Better Query

In [None]:
# Based on your findings above, build a better query
def build_optimized_query(days_back=30):
    """
    Build an optimized query based on what we learned
    Edit this function based on your findings above
    """

    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    date_filter = f'after:{start_date.strftime("%Y/%m/%d")}'

    # Build query components based on what you found
    query_parts = [
        # Add the most effective search terms you found above
        # "subject:faktura",
        # "subject:räkning",
        # "subject:invoice",
        # "subject:bill",
        "faktura",
        "räkning",
        "invoice",
        "bill",
        # Add specific vendors if you found them
        # 'from:spotify',
        # 'from:vattenfall',
        # Add other patterns you discovered
    ]

    # Combine with OR
    search_terms = " OR ".join(query_parts)
    optimized_query = f"{date_filter} ({search_terms})"
    # optimized_query = "after:2025/06/25 (subject:madinter OR räkning OR invoice OR bill OR has:attachment filename:pdf)"
    # optimized_query = "after:2025/05/26 (faktura OR räkning OR invoice OR bill)"
    optimized_query = "after:2025/07/24 ((subject:konsert OR konsert) OR (subject:spelning OR spelning) OR (subject:live OR live) OR (subject:scen OR scen) OR (subject:biljett OR biljett) OR (subject:köp din biljett OR köp din biljett) OR (subject:säkra din biljett OR säkra din biljett) OR (subject:intar scenen OR intar scenen) OR (subject:återvänder OR återvänder) OR (subject:kommer till OR kommer till) OR (subject:concert OR concert) OR (subject:show OR show) OR (subject:performance OR performance) OR (subject:tickets OR tickets) OR (subject:buy tickets OR buy tickets) OR (subject:on stage OR on stage) OR (subject:live at OR live at) OR filename:pdf)"

    print(f"🎯 Optimized query: {optimized_query}")

    # Test it
    try:
        result = (
            gmail_server.service.users()  # type: ignore
            .messages()
            .list(userId="me", q=optimized_query, maxResults=20)
            .execute()
        )

        messages = result.get("messages", [])
        print(f"📊 Found {len(messages)} emails with optimized query")

        return optimized_query, messages

    except Exception as e:
        print(f"❌ Error with optimized query: {e}")
        return optimized_query, []


# Test the optimized query
optimized_query, found_messages = build_optimized_query(days_back=60)

if found_messages:
    print("\n📧 Sample results from optimized query:")
    sample_results = get_sample_emails(optimized_query, max_samples=30)
    print(sample_results)
else:
    print("\n❌ No emails found with optimized query - try adjusting the search terms")

sample_results

🎯 Optimized query: after:2025/07/24 ((subject:konsert OR konsert) OR (subject:spelning OR spelning) OR (subject:live OR live) OR (subject:scen OR scen) OR (subject:biljett OR biljett) OR (subject:köp din biljett OR köp din biljett) OR (subject:säkra din biljett OR säkra din biljett) OR (subject:intar scenen OR intar scenen) OR (subject:återvänder OR återvänder) OR (subject:kommer till OR kommer till) OR (subject:concert OR concert) OR (subject:show OR show) OR (subject:performance OR performance) OR (subject:tickets OR tickets) OR (subject:buy tickets OR buy tickets) OR (subject:on stage OR on stage) OR (subject:live at OR live at) OR filename:pdf)
📊 Found 20 emails with optimized query

📧 Sample results from optimized query:


Unnamed: 0,Subject,Sender,Date,Body Preview,Attachments
0,Summer Road Trips & Iconic Stops Along the Way,"""Outside+"" <membership@outside.plus>",2025-07-27 09:11:19,"Explore national parks, yoga studios, amazing ...",[]
1,Säkra höstens underhållning på Cirkus,"""Cirkus Arena & Restaurang"" <donotreply@cirkus...",2025-07-27 12:26:57,Image\r\n[https://images.markethype.io/4f3aa55...,[]
2,Missa inte höjdpunkterna hos Live Nation – nya...,Live Nation - Sweden <email@info.livenation.se>,2025-07-27 04:04:29,https://click.mailing.livenation.com/?qs=6034e...,[]
3,What the Colbert Cancellation Signals for Crea...,Medium Daily Digest <noreply@medium.com>,2025-07-27 05:10:00,Stories for Christian Wahlström\r\n@christian....,[]
4,Handlar Swifts låt om Skarsgård?,GAFFA Sweden <noreply@gaffa.se>,2025-07-26 08:04:17,"Hej,\r\n\r\nDu har fått ett nyhetsbrev från GA...",[]
5,Sommarrealisation - Nu upp till 60% rabatt,Care of Carl <no-reply@emails.careofcarl.com>,2025-07-26 07:12:26,[https://cdn.eu1.exponea.com/care-of-carl-live...,[]
6,"5-Bullet Friday — Tyler Cowen Wisdom, A Headse...",Tim Ferriss <tim@fourhourbody.com>,2025-07-25 20:09:43,a {text-decoration: none;}\r\n\r\n************...,[]
7,NYA SLÄPP 📣 Bob Dylan • Architects • Mosebacke...,Stockholm Live <nyhetsbrev@email.stockholmlive...,2025-07-25 14:05:52,"Kommer snart: Drake, Loyle Carner + mycket mer...",[]
8,Du har väl inte missat att vi släppt fler enda...,Debaser <info@debaser.se>,2025-07-25 13:09:16,Image\r\n[https://images.markethype.io/db93f58...,[]
9,What we can learn from listening to people who...,The Medium Newsletter <newsletters@medium.com>,2025-07-25 11:40:00,Coexisting with beavers + the science behind w...,[]


In [33]:
sample_results

Unnamed: 0,Subject,Sender,Date,Body Preview,Attachments
0,Kvitto på din beställning E18142269 från Syste...,"""no-reply@systembolaget.se"" <no-reply@systembo...",2025-07-24 14:41:22,"<meta http-equiv=""Content-Type"" content=""text/...",[Försäljningskvitto 2025-07-24 044122.pdf]
1,"Your receipt from Anthropic, PBC #2030-5704-3161","""Anthropic, PBC"" <invoice+statements@mail.anth...",2025-07-24 12:54:20,No body,"[Invoice-72E2DW43-0001.pdf, Receipt-2030-5704-..."
2,"Your receipt from Anthropic, PBC #2655-8053-4226","""Anthropic, PBC"" <invoice+statements@mail.anth...",2025-07-24 07:13:05,No body,"[Invoice-5002BD60-0003.pdf, Receipt-2655-8053-..."


## 6. Update Your Config

In [10]:
# Once you find a good query, you can update the Gmail search in your demo.py
print("💡 To update your demo.py with a better query:")
print("")
print("1. Open gmail_server.py")
print("2. Find the 'fetch_emails' method")
print("3. Update the 'query' variable around line 45")
print("")
print("Current query:")
current_query = "f'after:{start_date.strftime(\"%Y/%m/%d\")} (subject:faktura OR subject:räkning OR subject:invoice OR subject:bill OR has:attachment filetype:pdf)'"
print(current_query)
print("")
print("Suggested replacement (edit based on your findings):")
if "optimized_query" in locals():
    # Extract just the search part (without the date)
    search_part = (
        optimized_query.split(" ", 1)[1] if " " in optimized_query else optimized_query
    )
    suggested = f"f'after:{{start_date.strftime(\"%Y/%m/%d\")}} {search_part}'"
    print(suggested)
else:
    print(
        "f'after:{start_date.strftime(\"%Y/%m/%d\")} (YOUR_OPTIMIZED_SEARCH_TERMS_HERE)'"
    )

💡 To update your demo.py with a better query:

1. Open gmail_server.py
2. Find the 'fetch_emails' method
3. Update the 'query' variable around line 45

Current query:
f'after:{start_date.strftime("%Y/%m/%d")} (subject:faktura OR subject:räkning OR subject:invoice OR subject:bill OR has:attachment filetype:pdf)'

Suggested replacement (edit based on your findings):
f'after:{start_date.strftime("%Y/%m/%d")} (subject:faktura OR subject:räkning OR subject:invoice OR subject:bill)'


## 7. Next Steps

Based on your experiments:

1. **If you found emails**: Update the query in `gmail_server.py` and run `python demo.py`
2. **If no invoice emails found**: Try broader time ranges (`--days-back 90` or `--days-back 365`)
3. **If you have invoices but different format**: Update the keywords in `config/config.yaml`
4. **Test with your best query**: Run `python demo.py --days-back 90 --verbose`