In [4]:
import requests
import pandas as pd
import time

def fetch_gdelt_context(query, max_records=200, start=0):
    base_url = "https://api.gdeltproject.org/api/v2/context/context"
    
    params = {
        'query': query,
        'mode': 'artlist',
        'format': 'json',
        'maxrecords': max_records,
        'start': start
    }
    
    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Parse JSON response
        data = response.json()
        articles = data.get('articles', [])

        return articles

    except requests.exceptions.RequestException as e:
        print(f"Error fetching articles: {e}")
        return []

# List of topics to query
topics = [
    'canada'
]

# number of articles to fetch
article_count = 20000

# Initialize DataFrame to store all articles
all_articles_df = pd.DataFrame()

# Fetch data for each topic
for topic in topics:
    start = 0
    while True:
        articles = fetch_gdelt_context(query=topic, max_records=200, start=start)
        if not articles:
            break  # Exit loop if no more articles are returned

        # Convert list of dictionaries to DataFrame
        articles_df = pd.DataFrame(articles)

        # Append to the main DataFrame
        all_articles_df = pd.concat([all_articles_df, articles_df], ignore_index=True)

        # Break if we reach the required number of articles
        if len(all_articles_df) >= article_count:
            break

        # Update start for next page
        start += 200

        print("Fetched total data : ",len(all_articles_df))

        # Pause to respect API rate limits
        time.sleep(1)

    # Check if we have reached the required number of articles
    if len(all_articles_df) >= article_count:
        break

print(f"Fetched {len(all_articles_df)} articles.")

# Save DataFrame to CSV file
all_articles_df.to_csv('data/gdelt_news_articles.csv', index=False)

# Optionally, display or further process the DataFrame
print(all_articles_df.head())


Fetched total data :  102
Fetched total data :  204
Fetched total data :  306
Fetched total data :  408
Fetched total data :  510
Fetched total data :  612
Fetched total data :  714
Fetched total data :  816
Fetched total data :  918
Fetched total data :  1020
Fetched total data :  1122
Fetched total data :  1224
Fetched total data :  1326
Fetched total data :  1428
Fetched total data :  1530
Fetched total data :  1632
Fetched total data :  1734
Fetched total data :  1836
Fetched total data :  1938
Fetched total data :  2040
Fetched total data :  2142
Fetched total data :  2244
Fetched total data :  2346
Fetched total data :  2448
Fetched total data :  2550
Fetched total data :  2652
Fetched total data :  2754
Fetched total data :  2856
Fetched total data :  2958
Fetched total data :  3060
Fetched total data :  3162
Fetched total data :  3264
Fetched total data :  3366
Fetched total data :  3468
Fetched total data :  3570
Fetched total data :  3672
Fetched total data :  3774
Fetched to