# Import Required Libraries
Import the necessary libraries for file handling, HTTP requests, and the Gemini API.

In [None]:
import os
import requests
import google.generativeai as genai

from urllib.parse import urlparse, quote


# Define Helper Functions and News URLs
Define the list of German/European news sources (currently 10) and create the helper functions for fetching content and generating filenames.

In [None]:
# Home pages of the German/European news sources to fetch (10 entries)
NEWS_URLS = [
    "https://www.spiegel.de",
    "https://www.orf.at",
    "https://www.faz.net",
    "https://www.welt.de",
    "https://www.lemonde.fr",
    "https://www.lefigaro.fr",
    "https://www.corriere.it",
    "https://www.theguardian.com",
    "https://www.lavanguardia.com",
    "https://www.euronews.com",
]

# Directory (relative to the working directory) where the fetched markdown
# files are stored
out_dir = "data/misc/webpgs"


In [None]:
def fetch_md(url):
    """Fetch the content of *url* through the r.jina.ai reader endpoint.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as text, or ``None`` when the request failed
        (timeout, connection problem, or HTTP error status). The error is
        printed before ``None`` is returned.
    """
    # Percent-encode the whole target URL (safe="" also encodes "/" and ":")
    # so it is passed as a single path component of the reader URL.
    encoded_url = quote(url, safe="")
    jina_url = f"https://r.jina.ai/{encoded_url}"

    try:
        response = requests.get(jina_url, timeout=30)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of Timeout, ConnectionError and
        # HTTPError; catching it keeps one failing source from aborting the
        # caller's loop over all sources.
        print(e)
        return None

    return response.text


def fname(url):
    """Return the markdown filename for *url*, derived from its host name.

    A leading ``www.`` is dropped, so ``https://www.spiegel.de`` becomes
    ``spiegel.de.md``. Hosts without that prefix are used unchanged
    (``https://dw.com`` becomes ``dw.com.md``).

    Args:
        url: Absolute URL including the scheme (e.g. ``https://``).

    Returns:
        The filename, ending in ``.md``.

    Raises:
        ValueError: If no host can be parsed from *url* (for example when
            the scheme is missing).
    """
    host = urlparse(url).netloc
    if not host:
        raise ValueError(f"URL has no host component: {url!r}")
    # Strip only a leading "www."; splitting on "www." would fail for hosts
    # that do not contain it at all.
    if host.startswith("www."):
        host = host[len("www."):]
    return f"{host.replace('/', '_')}.md"

# Fetch and Save News Articles
Loop through the news URLs, fetch content for each one, and save the retrieved markdown content to files.

In [None]:
# Loop through the news URLs, fetch content for each one, and save the retrieved markdown content to files
# Create the output directory up front: open(..., "w") does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)
for url in NEWS_URLS:
    fn = f"{out_dir}/{fname(url)}"
    md = fetch_md(url)  # Fetch markdown content for the URL
    if md:  # Skip sources whose fetch failed or returned an empty body
        print(f"Saving {fn}...")
        with open(fn, "w", encoding="utf-8") as f:
            f.write(md)  # Save the content to a file in the output directory

# Load Saved Articles
Read the saved markdown files and load their contents into a dictionary for further processing.

In [None]:
# Map each saved file's stem (the filename without its extension) to the
# file's full text. Only ".md" files in the output directory are read.
articles = {}
for filename in os.listdir(out_dir):
    if not filename.endswith(".md"):
        continue
    file_stem, _extension = os.path.splitext(filename)
    file_path = os.path.join(out_dir, filename)
    with open(file_path, encoding="utf-8") as handle:
        articles[file_stem] = handle.read()

# Initialize Gemini Client
Set up the Gemini client using API keys from environment variables and configure the model to use.

# Obtaining a Google Gemini API Key

To use the Gemini API in this notebook, you'll need to obtain an API key:

1. **Go to Google AI Studio**: Visit [aistudio.google.com](https://aistudio.google.com/) (formerly MakerSuite)

2. **Sign in with your Google Account**: Create one if needed

3. **Get API Key**: 
   - Click on your profile picture in the top-right corner
   - Select "Get API key"
   - Either create a new key or use an existing one

4. **Security Best Practices**:
   - Store your key in an environment variable or .env file
   - Add .env to your .gitignore file
   - Never commit API keys to version control

For more information, visit the [Gemini API documentation](https://ai.google.dev/docs/gemini-api/setup).

In [None]:
# Read the API key from the environment instead of hard-coding it in the
# notebook. GEMINI_API_KEY is checked first, then GOOGLE_API_KEY.
API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not API_KEY:
    print("Warning: no Gemini API key found; set GEMINI_API_KEY or GOOGLE_API_KEY.")

# Name of the Gemini model used for all queries
MODEL = "gemini-2.0-flash"

In [None]:
# Configure the google.generativeai module with the key held in API_KEY
# (API_KEY must be defined before this cell runs)
genai.configure(api_key=API_KEY)

# Create the model object used to send prompts
model = genai.GenerativeModel(MODEL)  # MODEL holds the model name string

# Create Gemini Query Function
Create a function to send prompts to the Gemini API and retrieve responses with appropriate configuration.

In [None]:
def gemini_query(prompt, *, max_output_tokens=500, temperature=0.3):
    """Send *prompt* to the module-level ``model`` and return the reply text.

    Args:
        prompt: Prompt text passed to ``model.generate_content``.
        max_output_tokens: Value for the ``max_output_tokens`` generation
            setting (limits the length of the reply). Defaults to 500.
        temperature: Value for the ``temperature`` generation setting
            (response variability). Defaults to 0.3.

    Returns:
        The reply text on success. If the call or reading ``response.text``
        raises, a string of the form ``"Error: <exception message>"`` is
        returned instead of raising.
    """
    generation_config = {
        "max_output_tokens": max_output_tokens,
        "temperature": temperature,
    }
    try:
        response = model.generate_content(
            prompt,
            generation_config=generation_config,
        )
        return response.text
    except Exception as e:
        # Deliberately broad: callers print the result directly, so a failed
        # query is reported inline instead of stopping the remaining queries.
        return f"Error: {e}"

# Execute Different Gemini Queries
Run five different types of queries on the collected news data: individual summaries, trend extraction from a specific source, creative summary, overall news trends analysis, and technology/AI coverage analysis.

In [None]:
# Dump every loaded article: its name as a heading line, then the raw text,
# followed by blank lines as a visual separator.
for name, content in articles.items():
    print(f"{name}:", content, "\n\n", sep="\n")

In [None]:
# Query 1: one summary request per loaded article, using only the first
# 2000 characters of each article as prompt content
print("=== Query 1: Summaries for individual pages ===")
for name, content in articles.items():
    excerpt = content[:2000]
    prompt = (
        f"The text below was scraped from {name}. Please summarize the main headlines and key news from this ALREADY SCRAPED content:\n"
        "\n"
        "CONTENT:\n"
        f"{excerpt}\n"
    )
    summary = gemini_query(prompt)
    print(f"\n[{name}]", summary, sep="\n")


In [None]:
# Query 2: ask for three emerging trends, based on the first 1500 characters
# of the spiegel.de article (runs only when that article was loaded)
spiegel_text = articles.get("spiegel.de")
if spiegel_text is not None:
    prompt = (
        "The text below was already scraped from spiegel.de. Based ONLY on this pre-scraped content, list 3 emerging news trends:\n"
        "\n"
        "CONTENT:\n"
        f"{spiegel_text[:1500]}\n"
    )
    print("\n=== Query 2: Emerging trends on spiegel.de ===")
    print(gemini_query(prompt))


In [None]:
# Query 3: Provide a creative summary for a specific source (e.g., "dw.com")
# The query only runs when an article named "dw.com" was loaded; otherwise a
# notice is printed so the skipped query is visible instead of silent.
if "dw.com" in articles:
    prompt = f"""The text below was already scraped from dw.com. Based ONLY on this pre-scraped content, provide a creative summary of the news:

CONTENT:
{articles["dw.com"][:1500]}
"""
    print("\n=== Query 3: Creative summary for dw.com ===")
    print(gemini_query(prompt))
else:
    print("\n=== Query 3: skipped, no saved article for dw.com ===")


In [None]:
# Query 4: Analyze major news trends across all sources
# Give every source an equal share of the 5000-character content budget.
# Joining the complete articles and then cutting at 5000 characters would
# keep little more than the first source, although the prompt promises
# content from multiple sources.
separator = "\n\n---SOURCE SEPARATOR---\n\n"
source_count = len(articles)
separator_chars = len(separator) * max(source_count - 1, 0)
chars_per_source = max((5000 - separator_chars) // max(source_count, 1), 0)
full_text = separator.join(text[:chars_per_source] for text in articles.values())
prompt = f"""The text below contains ALREADY SCRAPED content from multiple European news sources. Based ONLY on this pre-scraped content, list 5 major news trends and summarize the overall current news landscape:

CONTENT:
{full_text[:5000]}
"""

print("\n=== Query 4: Overall European news trends ===")
print(gemini_query(prompt))


In [None]:
# Query 5: ask how technology and AI are covered, based on the first 5000
# characters of the combined text in full_text
combined_excerpt = full_text[:5000]
prompt = (
    "The text below contains ALREADY SCRAPED content from multiple European news sources. Based ONLY on this pre-scraped content, analyze and summarize how technology and AI are being covered. Highlight any emerging themes or concerns:\n"
    "\n"
    "CONTENT:\n"
    f"{combined_excerpt}\n"
)

print("\n=== Query 5: Technology and AI coverage analysis ===")
print(gemini_query(prompt))