In [1]:
import requests
import pandas as pd
import time

def fetch_gdelt_context(query, max_records=200, start=0, timeout=30):
    """Fetch one page of article records from the GDELT Context API.

    Args:
        query: Search string sent as the ``query`` parameter.
        max_records: Value sent as the ``maxrecords`` parameter.
        start: Value sent as the ``start`` parameter (callers use it as a
            paging offset).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        The list stored under the ``articles`` key of the JSON response.
        An empty list is returned when the request fails, times out, returns
        an HTTP error status, the body is not valid JSON, or the body has no
        usable ``articles`` entry.
    """
    base_url = "https://api.gdeltproject.org/api/v2/context/context"

    params = {
        'query': query,
        'mode': 'artlist',
        'format': 'json',
        'maxrecords': max_records,
        'start': start
    }

    try:
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse JSON response
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body (e.g. a plain-text error message);
        # older requests versions do not wrap it in RequestException.
        print(f"Error fetching articles: {e}")
        return []

    # Guard against a JSON body that is not an object (e.g. a bare list).
    if not isinstance(data, dict):
        return []

    # Normalize a missing or null 'articles' entry to an empty list.
    return data.get('articles') or []

# List of topics to query
topics = [
    'climate change', 'artificial intelligence', 'global warming', 'covid-19', 'mental health',
    'electric vehicles', 'blockchain', 'cybersecurity', 'space exploration', 'quantum computing',
    'renewable energy', 'cryptocurrency', 'big data', 'internet of things', '5G technology',
    'nanotechnology', 'biotechnology', 'genomics', 'robotics', 'smart cities'
]

# Number of records requested per call; also the paging increment.
PAGE_SIZE = 200

# Total number of articles to collect across all topics.
# NOTE(review): the previous version checked 50000 inside the paging loop but
# 1000 between topics; a single limit is used here so both checks agree.
# Raise this value if a larger corpus is wanted.
TARGET_ARTICLES = 1000

# Pause after every API request, in seconds.
REQUEST_PAUSE_SECONDS = 1

# Collect one DataFrame per page and concatenate once at the end; calling
# pd.concat inside the loop re-copies all accumulated rows on every page.
page_frames = []
total_fetched = 0

# Fetch data for each topic
for topic in topics:
    start = 0
    while total_fetched < TARGET_ARTICLES:
        articles = fetch_gdelt_context(query=topic, max_records=PAGE_SIZE, start=start)

        # Pause after *every* request (including the last one of a topic) so
        # the first request of the next topic is rate-limited as well.
        time.sleep(REQUEST_PAUSE_SECONDS)

        if not articles:
            break  # Exit loop if no more articles are returned

        # Convert list of dictionaries to DataFrame and keep it for later
        page_frames.append(pd.DataFrame(articles))
        total_fetched += len(articles)

        # Update start for next page
        start += PAGE_SIZE

    # Stop querying further topics once the target has been reached
    if total_fetched >= TARGET_ARTICLES:
        break

# Build the final DataFrame in a single concatenation
if page_frames:
    all_articles_df = pd.concat(page_frames, ignore_index=True)
else:
    all_articles_df = pd.DataFrame()

print(f"Fetched {len(all_articles_df)} articles.")

# Save DataFrame to CSV file
all_articles_df.to_csv('gdelt_news_articles.csv', index=False)

# Optionally, display or further process the DataFrame
print(all_articles_df.head())


Fetched 109 articles.
                                                 url  \
0  https://www.ideastream.org/2024-06-15/under-pa...   
1  https://www.thesunchronicle.com/news/nation_wo...   
2  https://www.nzherald.co.nz/nz/climate-change-h...   
3  https://lasvegassun.com/news/2024/jun/15/milan...   
4  https://www.aspentimes.com/opinion/irreligious...   

                                               title          seendate  \
0  Under Paris is a Seine-sational French shark m...  20240616T003130Z   
1  Much of US braces for extreme weather, from so...  20240616T003125Z   
2  Climate change: How a heating planet is chargi...  20240616T003124Z   
3  Milan menswear seeks reassurance in nostalgia ...  20240616T003120Z   
4     Irreligious: What is the purpose of education?  20240616T003119Z   

                                         socialimage               domain  \
0  http://npr-brightspot.s3.amazonaws.com/84/3d/6...       ideastream.org   
1  https://bloximages.chicago2.vip.townnew