In [33]:
import requests
import pandas as pd
from urllib.parse import urlparse, unquote
from bs4 import BeautifulSoup
import re
from collections import Counter
import time

# Domains whose search results main() skips (checked via is_excluded_domain).
# List bare domains only, e.g. "example.com" -- the "www." prefix is not needed.
# Add more domains as needed.
EXCLUDE_DOMAINS = []

# Common English function words (contractions included) that carry no topical
# meaning and are therefore never reported as keywords.
STOPWORDS = set("""
a about above after again against all am an and any are aren't as at be because been before being below between both but by
could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves
""".split())

# A run of lowercase letters, optionally joined by apostrophes ("don't",
# "designer's"). The \b anchors keep tokens glued to digits or underscores
# (e.g. "abc123") from matching at all.
_WORD_RE = re.compile(r"\b[a-z]+(?:'[a-z]+)*\b")


def get_top_keywords(text, num_keywords=5):
    """Return the most frequent non-stop-words in *text* as a comma-separated string.

    Args:
        text: Free text to analyse; case is ignored. Empty or None gives ''.
        num_keywords: Maximum number of keywords to return.

    Returns:
        Up to ``num_keywords`` words of three or more letters, most frequent
        first and joined with ', ', or '' when nothing qualifies.
    """
    if not text:
        return ''

    # Web pages often use the typographic apostrophe; fold it to the ASCII one
    # so that contractions match their STOPWORDS entries.
    normalised = text.lower().replace('\u2019', "'")

    words = []
    for token in _WORD_RE.findall(normalised):
        # Check the whole contraction first: splitting "don't" at the
        # apostrophe would otherwise leak the stem "don" into the keywords.
        if token in STOPWORDS:
            continue
        words.extend(
            part for part in token.split("'")
            if len(part) >= 3 and part not in STOPWORDS
        )

    freq = Counter(words)
    return ', '.join(kw for kw, _ in freq.most_common(num_keywords))

# Keys of the dict returned by get_keywords_from_html_fields, in output order.
_KEYWORD_FIELDS = (
    'title_keywords',
    'meta_title_keywords',
    'meta_description_keywords',
    'heading_keywords',
    'alt_text_keywords',
    'anchor_text_keywords',
    'url_keywords',
)


def _meta_content(soup, name):
    """Return the content attribute of <meta name=...>, or '' when absent."""
    tag = soup.find('meta', attrs={'name': name})
    return (tag.get('content') or '') if tag else ''


def get_keywords_from_html_fields(url, num_keywords=5):
    """Fetch *url* and extract the top keywords from each on-page field.

    The fields are the <title> tag, the "title" and "description" meta tags,
    h1-h3 headings, image alt texts, anchor texts and the URL path.

    Args:
        url: Address of the page to download and analyse.
        num_keywords: Maximum number of keywords reported per field.

    Returns:
        A dict with one comma-separated keyword string per field. If the page
        cannot be fetched or parsed, every field is 'N/A' instead.
    """
    try:
        resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        # Without this, the body of a 403/404/5xx error page would be mined
        # for keywords as if it were the real content.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')

        title = soup.title.string if soup.title and soup.title.string else ''
        meta_title = _meta_content(soup, 'title')
        meta_desc = _meta_content(soup, 'description')

        headings_text = ' '.join(
            h.get_text(separator=' ', strip=True)
            for h in soup.find_all(['h1', 'h2', 'h3'])
        )
        alt_texts = ' '.join(
            img['alt'] for img in soup.find_all('img') if img.get('alt')
        )
        anchor_texts = ' '.join(
            a.get_text(separator=' ', strip=True) for a in soup.find_all('a')
        )

        # Words in the (percent-decoded) path of the URL; get_top_keywords
        # does the tokenising itself.
        url_path = unquote(urlparse(url).path)

        # Order must match _KEYWORD_FIELDS.
        field_texts = (
            title, meta_title, meta_desc, headings_text,
            alt_texts, anchor_texts, url_path,
        )
        return {
            field: get_top_keywords(text, num_keywords)
            for field, text in zip(_KEYWORD_FIELDS, field_texts)
        }
    except Exception as exc:
        # Deliberate best-effort boundary: one bad page must not abort the
        # whole run, but the reason is reported instead of being swallowed.
        print(f"    Could not extract keywords from {url}: {exc}")
        return dict.fromkeys(_KEYWORD_FIELDS, 'N/A')

def generate_search_queries(topic, context):
    """Build the ten search-query variants for *topic* qualified by *context*.

    Args:
        topic: Subject to search for, e.g. "running shoes".
        context: Qualifier appended to each query, e.g. "for beginners".
            May be empty.

    Returns:
        A list of ten query strings. Whitespace is normalised, so an empty
        topic or context leaves no trailing or doubled spaces in the queries.
    """
    patterns = (
        "Best {topic} {context}",
        "Top {topic} {context}",
        "{topic} guide {context}",
        "How to choose {topic} {context}",
        "{topic} tips {context}",
        "Latest trends in {topic} {context}",
        "{topic} reviews {context}",
        "{topic} recommendations {context}",
        "Affordable {topic} {context}",
        "Where to find {topic} {context}",
    )
    # split()/join collapses the gap left behind by an empty topic or context.
    return [
        " ".join(pattern.format(topic=topic, context=context).split())
        for pattern in patterns
    ]

def google_custom_search(api_key, cse_id, query, num=3):  # num=3 for speed, increase if desired
    """Run *query* through the Google Custom Search JSON API.

    Args:
        api_key: Google API key.
        cse_id: Custom Search Engine ID (the ``cx`` parameter).
        query: Search string.
        num: Number of results to request (the API documents a maximum of
            10 per request).

    Returns:
        A list of dicts with the keys 'query', 'rank' (1-based), 'title',
        'url', 'domain' and 'snippet'. The list is empty when the API returns
        no items or reports an error; in the error case the API's message is
        printed.

    Raises:
        requests.RequestException: If the HTTP request itself fails or times
            out.
    """
    params = {
        "key": api_key,
        "cx": cse_id,
        "q": query,
        "num": num,
    }
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1", params=params, timeout=10
    )
    data = response.json()

    # A bad key or exhausted quota comes back as an "error" object with no
    # "items"; report it rather than passing it off as "no results".
    error = data.get("error")
    if error:
        message = error.get("message", error) if isinstance(error, dict) else error
        print(f"  Search API error for '{query}': {message}")
        return []

    results = []
    for rank, item in enumerate(data.get("items", []), start=1):
        link = item.get("link")
        results.append({
            'query': query,
            'rank': rank,
            'title': item.get("title"),
            'url': link,
            # `or ''` keeps the domain a str even if an item lacks a link.
            'domain': urlparse(link or '').netloc,
            'snippet': item.get("snippet"),
        })
    return results

def is_excluded_domain(domain, excluded=None):
    """Tell whether *domain* is an excluded domain or a subdomain of one.

    Matching is case-insensitive and respects label boundaries, so
    "www.example.com" matches the entry "example.com" but "notexample.com"
    does not.

    Args:
        domain: Host part of a URL (a netloc); a trailing ":port" is ignored.
        excluded: Iterable of bare domains to test against. Defaults to the
            module-level EXCLUDE_DOMAINS.

    Returns:
        True if the domain should be skipped, False otherwise.
    """
    if excluded is None:
        excluded = EXCLUDE_DOMAINS

    # Reduce a netloc such as "user@Example.com:443" to its bare host name.
    host = domain.lower().rsplit("@", 1)[-1]
    name, sep, port = host.rpartition(":")
    if sep and port.isdigit():
        host = name
    host = host.rstrip(".")

    for entry in excluded:
        entry = entry.strip().lower().rstrip(".")
        if entry.startswith("www."):
            entry = entry[4:]
        # A blank entry would otherwise match every domain.
        if not entry:
            continue
        if host == entry or host.endswith("." + entry):
            return True
    return False

def main():
    """Run the interactive SERP analysis and save the results as a CSV file.

    Prompts for the API credentials, a topic and a context, runs every
    generated query through Google Custom Search, extracts per-field keywords
    from each result page whose domain is not excluded, and writes all rows to
    "serp_analysis_per_field_keywords.csv". When google.colab is importable,
    a browser download of the file is triggered as well.
    """
    # Local import: only this entry point needs it.
    from getpass import getpass

    print("Welcome! This script does SERP analysis, extracts top keywords per field, and skips hardcoded domains.")
    # getpass keeps the secret key from being echoed and stored in the
    # notebook/terminal output, which input() would do.
    api_key = getpass("Enter your Google Custom Search API key: ").strip()
    cse_id = input("Enter your Custom Search Engine ID (CSE ID): ").strip()
    topic = input("Enter your search topic: ").strip()
    context = input("Enter a context or condition (e.g., 'for beginners', 'during summer', '2025'): ").strip()

    queries = generate_search_queries(topic, context)
    all_results = []
    for query in queries:
        print(f"\nRunning SERP analysis for: {query}")
        results = google_custom_search(api_key, cse_id, query)
        for r in results:
            domain = r['domain'].lower()
            if is_excluded_domain(domain):
                print(f"  Skipping excluded domain: {domain}")
                continue
            print(f"  Fetching keywords for: {r['title']} ({r['url']})")
            field_keywords = get_keywords_from_html_fields(r['url'])
            r.update(field_keywords)
            time.sleep(1)  # polite pause between page fetches
            all_results.append(r)

    df = pd.DataFrame(all_results)
    csv_filename = "serp_analysis_per_field_keywords.csv"
    df.to_csv(csv_filename, index=False)
    print(f"\nSaved SERP analysis with per-field keywords to {csv_filename}")

    # google.colab only exists inside Colab; elsewhere the file simply stays
    # in the working directory.
    try:
        from google.colab import files
        files.download(csv_filename)
        print("Download link generated.")
    except ImportError:
        print("Download the file from your working directory.")

# Run the interactive analysis only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Welcome! This script does SERP analysis, extracts top keywords per field, and skips hardcoded domains.
Enter your Google Custom Search API key: [REDACTED]
Enter your Custom Search Engine ID (CSE ID): 44ca7a913db3e4724
Enter your search topic: Discussing AI ethics in design
Enter a context or condition (e.g., 'for beginners', 'during summer', '2025'): for UX designers

Running SERP analysis for: Best Discussing AI ethics in design for UX designers
  Fetching keywords for: Future of UX | Your Design, Tech and User Experience Podcast | AI ... (https://podcasts.apple.com/sk/podcast/future-of-ux-your-design-tech-and-user/id1480706373)
  Fetching keywords for: Designer Debate: AI Ethics in Design | Toptal® (https://www.toptal.com/designers/artificial-intelligence/ai-ethics-in-design)
  Fetching keywords for: Design Principles for a New AI World - UX Magazine (https://uxmag.com/articles/design-principles-for-a-new-ai-world)

Running SERP analysis for: Top Discussi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download link generated.
