In [1]:
import requests
import pandas as pd
import time

def fetch_gdelt_context(query, max_records=200, start=0):
    base_url = "https://api.gdeltproject.org/api/v2/context/context"
    
    params = {
        'query': query,
        'mode': 'artlist',
        'format': 'json',
        'maxrecords': max_records,
        'start': start
    }
    
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse JSON response
        data = response.json()
        articles = data.get('articles', [])

        return articles

    except requests.exceptions.RequestException as e:
        print(f"Error fetching articles: {e}")
        return []

# List of topics to query
topics = [
    'canada'
]

# number of articles to fetch
article_count = 100000

# Initialize DataFrame to store all articles
all_articles_df = pd.DataFrame()

# Fetch data for each topic
for topic in topics:
    start = 0
    while True:
        articles = fetch_gdelt_context(query=topic, max_records=200, start=start)
        if not articles:
            break  # Exit loop if no more articles are returned

        # Convert list of dictionaries to DataFrame
        articles_df = pd.DataFrame(articles)

        # Append to the main DataFrame
        all_articles_df = pd.concat([all_articles_df, articles_df], ignore_index=True)

        # Break if we reach the required number of articles
        if len(all_articles_df) >= article_count:
            break

        # Update start for next page
        start += 200

        print("Fetched total data : ",len(all_articles_df))

        # Pause to respect API rate limits
        time.sleep(1)

    # Check if we have reached the required number of articles
    if len(all_articles_df) >= article_count:
        break

print(f"Fetched {len(all_articles_df)} articles.")

# Save DataFrame to CSV file
all_articles_df.to_csv('data/gdelt_news_articles.csv', index=False)

# Optionally, display or further process the DataFrame
print(all_articles_df.head())


Fetched total data :  97
Fetched total data :  194
Fetched total data :  291
Fetched total data :  388
Fetched total data :  485
Fetched total data :  582
Fetched total data :  679
Fetched total data :  776
Fetched total data :  873
Fetched total data :  970
Fetched total data :  1067
Fetched total data :  1164
Fetched total data :  1261
Fetched total data :  1358
Fetched total data :  1455
Fetched total data :  1552
Fetched total data :  1649
Fetched total data :  1746
Fetched total data :  1843
Fetched total data :  1940
Fetched total data :  2037
Fetched total data :  2134
Fetched total data :  2231
Fetched total data :  2328
Fetched total data :  2425
Fetched total data :  2522
Fetched total data :  2619
Fetched total data :  2716
Fetched total data :  2813
Fetched total data :  2910
Fetched total data :  3007
Fetched total data :  3104
Fetched total data :  3201
Fetched total data :  3298
Fetched total data :  3395
Fetched total data :  3492
Fetched total data :  3589
Fetched tota

KeyboardInterrupt: 