In [1]:
# Import necessary libraries
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# Define scopes for Gmail API
# Only the gmail.readonly scope is requested; this list is passed to the
# OAuth flow in authenticate_gmail().
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

def authenticate_gmail():
    """Run the installed-app OAuth flow and build a Gmail v1 service client.

    Reads the OAuth client configuration from ``credentials.json`` and
    requests the scopes listed in ``SCOPES``. The flow starts a local server
    on an OS-chosen port (``port=0``) to receive the authorization response.

    Returns:
        The object returned by ``build("gmail", "v1", ...)``.
    """
    oauth_flow = InstalledAppFlow.from_client_secrets_file(
        "credentials.json",
        SCOPES,
    )
    # Blocks until the user completes authorization in the browser.
    credentials = oauth_flow.run_local_server(port=0)
    gmail_service = build("gmail", "v1", credentials=credentials)
    return gmail_service

# Authenticate and create the service object
# NOTE: this runs the interactive OAuth flow every time the cell executes.
# The resulting module-level `service` is used by search_emails() and the
# get_email_* helpers below.
service = authenticate_gmail()

def search_emails(query='subject:("application received" OR "application for" OR "application received by" OR " thank you for applying to" OR "your application was sent" OR "your application to" OR "thank you for your application" OR "we received your application" OR "your job application")'):
    """Collect every message stub matching ``query`` across all result pages.

    Args:
        query: Gmail search expression; the default matches common
            job-application confirmation subject lines.

    Returns:
        A list of the entries found under the ``"messages"`` key of each
        page of the list response, in the order the pages were returned.
    """
    collected = []
    page_token = None

    while True:
        # Request one page of up to 100 results; pageToken=None asks for
        # the first page.
        request = service.users().messages().list(
            userId="me",
            q=query,
            maxResults=100,
            pageToken=page_token,
        )
        page = request.execute()

        # A page with no hits has no "messages" key at all.
        collected += page.get("messages", [])

        page_token = page.get("nextPageToken")
        if not page_token:
            # No further pages to fetch.
            return collected

# Example: List first 5 emails
# Smoke test of the authenticated client: no search query is applied here,
# so this lists up to 5 messages from the mailbox regardless of subject.
results = service.users().messages().list(userId="me", maxResults=5).execute()
# "messages" is absent from the response when nothing is returned.
messages = results.get("messages", [])

print("Fetched Messages:")
for msg in messages:
    # Only the message id is printed; no message content is fetched here.
    print(msg["id"])

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=1001019770922-er6k6heu5jrgbo04j48hu12s8da6oiq3.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A62122%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.readonly&state=ZFBzMOUKT34bbtWa6Gaf65QR2Dc5oR&access_type=offline


In [15]:
import email.utils
import pandas as pd
import re
import base64
import spacy

# Load spaCy NER model
# Loaded once at module level; parse_position() reuses this pipeline for
# every email body.
nlp = spacy.load("en_core_web_sm")

# Function to get email content
def get_email_body(message_id):
    """Fetch a Gmail message and return its decoded text body.

    The payload tree is walked depth-first so that bodies are found in
    single-part messages (data directly on the payload) and in nested
    multipart messages, not only in the top-level ``parts`` list. The first
    ``text/plain`` leaf is preferred; if there is none, the first leaf that
    carries inline data is returned instead.

    Args:
        message_id: Gmail message id.

    Returns:
        The decoded body as ``str``, or ``""`` when no part carries inline
        data. Bytes that are not valid UTF-8 are replaced rather than
        raising ``UnicodeDecodeError``.
    """
    msg = service.users().messages().get(userId="me", id=message_id, format="full").execute()

    fallback = ""
    for part in _iter_leaf_parts(msg.get("payload", {})):
        encoded = part.get("body", {}).get("data")
        if not encoded:
            # Parts without inline data have nothing to decode here.
            continue
        text = base64.urlsafe_b64decode(encoded).decode("utf-8", errors="replace")
        if part.get("mimeType") == "text/plain":
            return text
        if not fallback:
            # Remember the first decodable part in case no text/plain exists.
            fallback = text
    return fallback


def _iter_leaf_parts(part):
    """Yield the leaf parts of a message payload tree in document order."""
    children = part.get("parts")
    if not children:
        yield part
        return
    for child in children:
        yield from _iter_leaf_parts(child)

# Function to get subject line
def get_email_subject(message_id):
    """Return the Subject header of a Gmail message.

    Args:
        message_id: Gmail message id.

    Returns:
        The subject string, or ``""`` when the message has no Subject
        header (previously this case raised ``StopIteration``).
    """
    msg = service.users().messages().get(userId="me", id=message_id, format="metadata").execute()
    headers = msg.get("payload", {}).get("headers", [])
    # Header field names are case-insensitive, so compare in lower case.
    return next(
        (header["value"] for header in headers if header.get("name", "").lower() == "subject"),
        "",
    )

# Extract company name from subject line
def parse_company(subject):
    """Pull a company name out of an application-confirmation subject line.

    The subject is tested against a fixed, ordered set of case-insensitive
    patterns; the first one that matches supplies the company via its
    capture group.

    Args:
        subject: Email subject line.

    Returns:
        The stripped capture group of the first matching pattern, or the
        stripped subject itself when no pattern matches.
    """
    subject_regexes = (
        r"your application was sent to ([\w\s,&.-]+)",
        r"Thank you for applying to ([\w\s,&.-]+)",
        r"Application received – Thank You for Applying to ([\w\s,&.-]+)",
        r"Application received for .* at ([\w\s,&.-]+)",
        r"(.+) received your application",
    )

    # Lazy generator: patterns are tried in order and evaluation stops at
    # the first hit.
    attempts = (re.search(regex, subject, re.IGNORECASE) for regex in subject_regexes)
    first_hit = next((hit for hit in attempts if hit), None)

    if first_hit is None:
        return subject.strip()
    return first_hit.group(1).strip()

# Extract position from email body: explicit regex patterns first, spaCy NER as fallback
def parse_position(email_body):
    """Extract the job title an application email refers to.

    Explicit "... for the <title> position/role" phrasings are tried first
    because a match there identifies the title directly. Only when none of
    them match does the function fall back to the first spaCy entity
    labelled ORG, WORK_OF_ART, PRODUCT or EVENT. Running NER first let an
    organisation name (typically the employer) shadow an exact title match.

    Args:
        email_body: Decoded email body text.

    Returns:
        The stripped position string, or ``None`` when the body is empty or
        neither the patterns nor NER find a candidate.
    """
    if not email_body:
        # Nothing to search; skip the NER pipeline entirely.
        return None

    position_patterns = [
        r"your application was sent to .* for the ([\w\s-]+) position",
        r"Thanks for taking the time to apply for the ([\w\s-]+) position",
        r"we received your application for the ([\w\s-]+) position",
        r"thank you for your application for the ([\w\s-]+) position",
        r"regarding your application for the ([\w\s-]+) position",
        r"Application received for the ([\w\s-]+) position",
        r"application for the ([\w\s-]+) role",
    ]

    for pattern in position_patterns:
        match = re.search(pattern, email_body, re.IGNORECASE)
        if match:
            return match.group(1).strip()

    # Fallback: best-effort guess from named entities when no phrase matched.
    doc = nlp(email_body)
    positions = [ent.text for ent in doc.ents if ent.label_ in ("ORG", "WORK_OF_ART", "PRODUCT", "EVENT")]
    return positions[0] if positions else None

# Extract date from email metadata
def get_email_date(message_id):
    """Return the calendar date from a Gmail message's Date header.

    Args:
        message_id: Gmail message id.

    Returns:
        A ``datetime.date`` parsed from the Date header, or ``None`` when
        the header is missing or cannot be parsed.
    """
    email_info = service.users().messages().get(userId="me", id=message_id, format="metadata").execute()
    headers = email_info.get('payload', {}).get('headers', [])
    # Header field names are case-insensitive, so compare in lower case.
    date_str = next(
        (header['value'] for header in headers if header.get('name', '').lower() == 'date'),
        None,
    )

    if not date_str:
        return None

    try:
        return email.utils.parsedate_to_datetime(date_str).date()
    except (TypeError, ValueError):
        # A malformed Date header raises ValueError (TypeError on older
        # Python versions); treat it like a missing date so that one bad
        # message does not abort the whole run.
        return None

# Collect data
# Build one record per matching email. `messages` is rebound here to the
# full search result (it held the 5-message sample in the earlier cell).
messages = search_emails()
data = []

for msg in messages:
    email_id = msg['id']
    # NOTE: get_email_subject, get_email_body and get_email_date each issue
    # their own messages.get request, so every message costs three API calls.
    subject = get_email_subject(email_id)
    email_body = get_email_body(email_id)

    company = parse_company(subject)
    position = parse_position(email_body)  # may be None when nothing is found
    date_received = get_email_date(email_id)

    # Messages for which no date could be obtained are skipped entirely.
    if date_received:
        data.append({
            "Company": company,
            "Position": position,
            "Date Applied": date_received
        })

# Convert data into a DataFrame
applications_df = pd.DataFrame(data)

# Notebook display expression: show the last 40 rows.
applications_df.tail(40)