# Database Query Notebook

This notebook is a scratchpad for querying the email-processing database with pandas.

In [27]:
import os
import sys

import pandas as pd

# Make the project-local `email_processing` package importable.
# NOTE(review): this uses the kernel's current working directory, so it assumes
# the notebook is started from the project root — confirm for your setup.
# The membership check keeps the cell idempotent: re-running it no longer piles
# duplicate entries onto sys.path.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

from email_processing.database.db_manager import EmailDatabaseManager

In [28]:
# Initialize database manager
# NOTE(review): EmailDatabaseManager is project code not shown in this notebook;
# presumably it resolves the database location itself, since no path is passed
# here — confirm.
db_manager = EmailDatabaseManager()
# Connection handle reused by the pd.read_sql calls in the cells below.
connection = db_manager.get_connection()

# Echo the manager's db_path so it is obvious which database is being queried.
print(f"Connected to database: {db_manager.db_path}")

Connected to database: data/email_processing.db


In [29]:
# Load each database table into its own DataFrame (the join itself happens in
# the following cell).

query_email = "SELECT * FROM emails"
query_categorizations = "SELECT * FROM categorizations"
query_summaries = "SELECT * FROM summaries"

emails_df = pd.read_sql(sql=query_email, con=connection)
categorizations_df = pd.read_sql(sql=query_categorizations, con=connection)
summaries_df = pd.read_sql(sql=query_summaries, con=connection)

In [38]:
pd.set_option("display.max_colwidth", 50)

# Build one wide frame per email: the email columns plus the categorization
# reasoning and the summary fields, joined on email_id. Left joins keep every
# email even when it has no matching categorization or summary row.
# NOTE(review): assumes at most one categorization and one summary per email_id;
# additional right-hand rows would duplicate emails in the result — confirm
# against the schema.
df = emails_df.merge(
    categorizations_df[["email_id", "ai_reasoning"]],
    on="email_id",
    how="left",
).merge(
    summaries_df[["email_id", "purpose", "value_for_recipient", "ai_reasoning"]],
    # Both sides now carry an "ai_reasoning" column; the suffixes tell the
    # categorization text (_cat) apart from the summary text (_sum).
    suffixes=("_cat", "_sum"),
    on="email_id",
    how="left",
)

# Transpose a small slice so each email reads top-to-bottom as one column.
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
email_id,1990bb498a57e609,1990bb44ffd9e8bc,1990b78f53342fe3,1990b4558397f0b4,1990b1a132ca27f0,1990b180383e7498,1990af6275cda803,1990aee2541f2915,1990aca8c0a2ad1c,1990ab7d6d290b51
date,2025-09-02 18:33:28.000000,2025-09-02 18:33:09.000000,2025-09-02 10:28:19.000000,2025-09-02 16:31:56.000000,2025-09-02 15:44:37.000000,2025-09-02 17:42:26.000000,2025-09-02 15:05:27.000000,2025-09-02 14:56:42.000000,2025-09-02 14:15:55.000000,2025-09-02 13:57:23.000000
sender,Netflix <info@account.netflix.com>,Tim Ferriss <tim@fourhourbody.com>,GitHub <noreply@github.com>,Temu <temu@orders.temu.com>,LinkedIn <messages-noreply@linkedin.com>,"""H&M"" <membership@email.hm.com>",Bagaren och Kocken <no-reply@bagarenochkocken.se>,Elite Hotels Rewards <newsletter@elite.se>,GAFFA Sweden <noreply@gaffa.se>,SEB no-reply <noreply@seb.se>
subject,En ny enhet använder ditt konto,"New from Tim — ""What Most Has My Attention Rig...",[GitHub] Your Dependabot alerts for the week o...,En del av din Temu-beställning har överförts t...,2 new Chief Product Officer openings at Amendo...,Uppdatering av villkor,Lagerrensning! Fynda över 1600 produkter,Skäm bort dig med en höstweekend 🍁,Får vi nytt från Håkan Hellström?,Din ansökan om att bli kund
body_markdown,Kontrollera vem som använder ditt Netflix-kont...,a {text-decoration: none;} ​ ************ New ...,Dependabot alerts on GitHub Dependabot alerts ...,\---------------------------------------------...,\---------------------------------------- Hiri...,Open the email in browser on following link\nh...,Belysningsveckor | 15% på utvalt från Sage Ple...,Upp till 25 % rabatt för medlemmar [WEBBVERSIO...,"Hello, You have received a newsletter from GAF...",\n\n
body_clean,Kontrollera vem som använder ditt Netflix-kont...,a {text-decoration: none;} ​ ************ New ...,Dependabot alerts on GitHub Dependabot alerts ...,\---------------------------------------------...,\---------------------------------------- Hiri...,Open the email in browser on following link\nh...,Belysningsveckor | 15% på utvalt från Sage Ple...,Upp till 25 % rabatt för medlemmar [WEBBVERSIO...,"Hello, You have received a newsletter from GAF...",
pdf_text,,,,,,,,,,
raw_email,"{""id"": ""1990bb498a57e609"", ""subject"": ""En ny e...","{""id"": ""1990bb44ffd9e8bc"", ""subject"": ""New fro...","{""id"": ""1990b78f53342fe3"", ""subject"": ""[GitHub...","{""id"": ""1990b4558397f0b4"", ""subject"": ""En del ...","{""id"": ""1990b1a132ca27f0"", ""subject"": ""2 new C...","{""id"": ""1990b180383e7498"", ""subject"": ""Uppdate...","{""id"": ""1990af6275cda803"", ""subject"": ""Lagerre...","{""id"": ""1990aee2541f2915"", ""subject"": ""Sk\u00e...","{""id"": ""1990aca8c0a2ad1c"", ""subject"": ""F\u00e5...","{""id"": ""1990ab7d6d290b51"", ""subject"": ""Din ans..."
category_id,2,1,2,1,1,1,1,1,1,1
ai_reasoning_cat,This is a security alert from Netflix about a ...,This is a newsletter email from Tim Ferriss co...,This is a security alert email from GitHub abo...,This is a shipping notification email from Tem...,This is a promotional email from LinkedIn noti...,This is a notification email from H&M about te...,This is clearly a promotional email about a cl...,This is a promotional newsletter from Elite Ho...,This is clearly a newsletter email from GAFFA ...,This appears to be an automated notification e...


In [31]:
import textwrap

pd.set_option("display.max_colwidth", 50)

# Fetch every row of the categorizations table
query = "SELECT * FROM categorizations"
categorizations_df = pd.read_sql(sql=query, con=connection)


# Optional helper, currently disabled: re-flows a text column so each value is
# wrapped at `width` characters, which makes long reasoning strings readable.
# def wrap_column(categorizations_df, col, width=80):
#     categorizations_df[col] = categorizations_df[col].apply(
#         lambda x: "\n".join(textwrap.wrap(str(x), width))
#     )
#     return categorizations_df
#
# # Usage:
# categorizations_df = wrap_column(categorizations_df, "ai_reasoning", width=80)


print(f"Fetched {len(categorizations_df)} categorizations from database")

# Oldest rows first (sort_values is ascending by default), limited to the id
# and the ai_reasoning text.
(
    categorizations_df
    .sort_values(by="created_at")
    .loc[:, ["email_id", "ai_reasoning"]]
    .head(20)
)

Fetched 30 categorizations from database


Unnamed: 0,email_id,ai_reasoning
0,1990bb498a57e609,This is a security alert from Netflix about a ...
1,1990bb44ffd9e8bc,This is a newsletter email from Tim Ferriss co...
2,1990b78f53342fe3,This is a security alert email from GitHub abo...
3,1990b4558397f0b4,This is a shipping notification email from Tem...
4,1990b1a132ca27f0,This is a promotional email from LinkedIn noti...
5,1990b180383e7498,This is a notification email from H&M about te...
6,1990af6275cda803,This is clearly a promotional email about a cl...
7,1990aee2541f2915,This is a promotional newsletter from Elite Ho...
8,1990aca8c0a2ad1c,This is clearly a newsletter email from GAFFA ...
9,1990ab7d6d290b51,This appears to be an automated notification e...


In [32]:
# Reload the full emails table and report how many rows came back
query = "SELECT * FROM emails"
emails_df = pd.read_sql(sql=query, con=connection)

print(f"Fetched {len(emails_df)} emails from database")

# Chronological view, oldest email first (sort_values is ascending by default)
emails_df.sort_values("date")

Fetched 30 emails from database


Unnamed: 0,email_id,date,sender,subject,body_markdown,body_clean,pdf_text,raw_email,category_id
20,19909fd14a5e7e55,2025-09-02 04:31:58.000000,"""Outside+"" <membership@outside.plus>",Top 5 Reads: How Two Hikers Fought Off Bears—A...,"Plus, the most-searched National Park accordin...","Plus, the most-searched National Park accordin...",,"{""id"": ""19909fd14a5e7e55"", ""subject"": ""Top 5 R...",1
29,19908d4ff85b69c8,2025-09-02 05:10:00.000000,Medium Daily Digest <noreply@medium.com>,MindsDB: The Only MCP Server You’ll Ever Need ...,Stories for Christian Wahlström @christian.wah...,Stories for Christian Wahlström @christian.wah...,,"{""id"": ""19908d4ff85b69c8"", ""subject"": ""MindsDB...",1
15,1990a529b9c49232,2025-09-02 06:06:46.000000,adidas <adidas@se-news.adidas.com>,Släpptes nyss: Teamgeist & Adilenium 4.0,adidas Godkänd streetlook\nhttps://click.link....,adidas Godkänd streetlook\nhttps://click.link....,,"{""id"": ""1990a529b9c49232"", ""subject"": ""Sl\u00e...",1
28,19909627cf61b885,2025-09-02 07:44:31.000000,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,“consultant”: KPMG Sweden - Management Consult...,Your job alert for consultant in Greater Stock...,Your job alert for consultant in Greater Stock...,,"{""id"": ""19909627cf61b885"", ""subject"": ""\u201cc...",1
27,19909729c36593a3,2025-09-02 08:02:09.000000,GolfStar Sverige <no-reply@sweetspot.io>,Avbokning: GolfStar Sverige,| \n--- \n \n| Hej Christian \n--- \nDin...,|\n---\n| Hej Christian\n---\nDin bokning på ...,,"{""id"": ""19909729c36593a3"", ""subject"": ""Avbokni...",1
26,1990973ec9db7b70,2025-09-02 08:03:35.000000,GolfStar Sverige <no-reply@sweetspot.io>,Återbetalning,| | | \n--- \n \n## Återbetalning \n ...,| | |\n---\n## Återbetalning\n| | **Transa...,,"{""id"": ""1990973ec9db7b70"", ""subject"": ""\u00c5t...",1
25,199098fd4a49e8f9,2025-09-02 08:34:04.000000,Care of Carl <no-reply@emails.careofcarl.com>,Upptäck Outdoor: Premiumsortiment för naturens...,[https://cdn.eu1.exponea.com/care-of-carl-\nli...,[https://cdn.eu1.exponea.com/care-of-carl-\nli...,,"{""id"": ""199098fd4a49e8f9"", ""subject"": ""Uppt\u0...",1
24,19909ab95b5321f6,2025-09-02 09:04:21.000000,LinkedIn <jobs-listings@linkedin.com>,Companies like Forefront Amplify and others in...,Hiring in your network Explore relevant jobs t...,Hiring in your network Explore relevant jobs t...,,"{""id"": ""19909ab95b5321f6"", ""subject"": ""Compani...",1
23,19909d0187d04649,2025-09-02 09:44:15.000000,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,“project manager”: Persona - Project Manager (...,Your job alert for project manager in Greater ...,Your job alert for project manager in Greater ...,,"{""id"": ""19909d0187d04649"", ""subject"": ""\u201cp...",1
22,19909d822e79e105,2025-09-02 09:53:01.000000,GolfStar Sverige <no-reply@sweetspot.io>,Bokningsbekräftelse: GolfStar Sverige,| \n--- \n \n| Hej Christian \n--- \nDin...,|\n---\n| Hej Christian\n---\nDin bokning på ...,,"{""id"": ""19909d822e79e105"", ""subject"": ""Bokning...",1


In [33]:
# Keep emails dated after 2023-01-01 and preview the most readable columns.
# NOTE(review): if `date` is held as text, this is a string comparison, which
# orders correctly only while every value keeps the same ISO-like format —
# confirm the column dtype.
mask = emails_df["date"] > "2023-01-01"
emails_df.loc[mask, ["date", "sender", "subject", "body_clean"]].head(20)

Unnamed: 0,date,sender,subject,body_clean
0,2025-09-02 18:33:28.000000,Netflix <info@account.netflix.com>,En ny enhet använder ditt konto,Kontrollera vem som använder ditt Netflix-kont...
1,2025-09-02 18:33:09.000000,Tim Ferriss <tim@fourhourbody.com>,"New from Tim — ""What Most Has My Attention Rig...",a {text-decoration: none;} ​ ************ New ...
2,2025-09-02 10:28:19.000000,GitHub <noreply@github.com>,[GitHub] Your Dependabot alerts for the week o...,Dependabot alerts on GitHub Dependabot alerts ...
3,2025-09-02 16:31:56.000000,Temu <temu@orders.temu.com>,En del av din Temu-beställning har överförts t...,\---------------------------------------------...
4,2025-09-02 15:44:37.000000,LinkedIn <messages-noreply@linkedin.com>,2 new Chief Product Officer openings at Amendo...,\---------------------------------------- Hiri...
5,2025-09-02 17:42:26.000000,"""H&M"" <membership@email.hm.com>",Uppdatering av villkor,Open the email in browser on following link\nh...
6,2025-09-02 15:05:27.000000,Bagaren och Kocken <no-reply@bagarenochkocken.se>,Lagerrensning! Fynda över 1600 produkter,Belysningsveckor | 15% på utvalt från Sage Ple...
7,2025-09-02 14:56:42.000000,Elite Hotels Rewards <newsletter@elite.se>,Skäm bort dig med en höstweekend 🍁,Upp till 25 % rabatt för medlemmar [WEBBVERSIO...
8,2025-09-02 14:15:55.000000,GAFFA Sweden <noreply@gaffa.se>,Får vi nytt från Håkan Hellström?,"Hello, You have received a newsletter from GAF..."
9,2025-09-02 13:57:23.000000,SEB no-reply <noreply@seb.se>,Din ansökan om att bli kund,


In [34]:
# Example: select only the columns needed for an overview instead of SELECT *
query = "SELECT email_id, sender, subject, date FROM emails"
emails_summary_df = pd.read_sql(sql=query, con=connection)

print("Email Summary:")
emails_summary_df.head(n=5)

Email Summary:


Unnamed: 0,email_id,sender,subject,date
0,1990bb498a57e609,Netflix <info@account.netflix.com>,En ny enhet använder ditt konto,2025-09-02 18:33:28.000000
1,1990bb44ffd9e8bc,Tim Ferriss <tim@fourhourbody.com>,"New from Tim — ""What Most Has My Attention Rig...",2025-09-02 18:33:09.000000
2,1990b78f53342fe3,GitHub <noreply@github.com>,[GitHub] Your Dependabot alerts for the week o...,2025-09-02 10:28:19.000000
3,1990b4558397f0b4,Temu <temu@orders.temu.com>,En del av din Temu-beställning har överförts t...,2025-09-02 16:31:56.000000
4,1990b1a132ca27f0,LinkedIn <messages-noreply@linkedin.com>,2 new Chief Product Officer openings at Amendo...,2025-09-02 15:44:37.000000


In [35]:
# Custom query space - modify as needed
# Starting point: the ten most recent emails whose sender contains an "@".
custom_query = """
SELECT 
    email_id,
    sender,
    subject,
    date,
    category_id
FROM emails 
WHERE sender LIKE '%@%'
ORDER BY date DESC
LIMIT 10
"""

result_df = pd.read_sql(sql=custom_query, con=connection)

print("Custom Query Results:")
result_df

Custom Query Results:


Unnamed: 0,email_id,sender,subject,date,category_id
0,1990bb498a57e609,Netflix <info@account.netflix.com>,En ny enhet använder ditt konto,2025-09-02 18:33:28.000000,2
1,1990bb44ffd9e8bc,Tim Ferriss <tim@fourhourbody.com>,"New from Tim — ""What Most Has My Attention Rig...",2025-09-02 18:33:09.000000,1
2,1990b180383e7498,"""H&M"" <membership@email.hm.com>",Uppdatering av villkor,2025-09-02 17:42:26.000000,1
3,1990b4558397f0b4,Temu <temu@orders.temu.com>,En del av din Temu-beställning har överförts t...,2025-09-02 16:31:56.000000,1
4,1990b1a132ca27f0,LinkedIn <messages-noreply@linkedin.com>,2 new Chief Product Officer openings at Amendo...,2025-09-02 15:44:37.000000,1
5,1990af6275cda803,Bagaren och Kocken <no-reply@bagarenochkocken.se>,Lagerrensning! Fynda över 1600 produkter,2025-09-02 15:05:27.000000,1
6,1990a859fd71266f,no-reply@gjensidige.se,Du har fått ett meddelande från Gjensidige För...,2025-09-02 15:02:31.000000,1
7,1990aee2541f2915,Elite Hotels Rewards <newsletter@elite.se>,Skäm bort dig med en höstweekend 🍁,2025-09-02 14:56:42.000000,1
8,1990aca8c0a2ad1c,GAFFA Sweden <noreply@gaffa.se>,Får vi nytt från Håkan Hellström?,2025-09-02 14:15:55.000000,1
9,1990ab7d6d290b51,SEB no-reply <noreply@seb.se>,Din ansökan om att bli kund,2025-09-02 13:57:23.000000,1
