In [82]:
# %% [markdown]
"""
# Sri Lanka Crop Disease News Extractor (2013-Present)
GDELT API Data Collection
"""
# %%
import requests
import pandas as pd
from tqdm.notebook import tqdm
import time
import os

# %% [markdown]
"""
## Configuration
"""
# %%
# API Parameters
params = {
    'query': '("crop disease" OR "plant pest" OR "agricultural blight") location:SriLanka',
    'format': 'json',
    'startdatetime': '20130101',
    'enddatetime': '20241231',  # Update to current year
    'mode': 'artlist',
    'maxrecords': 250  # API limit per request
}

# Output file
output_file = 'sri_lanka_crop_disease_news.csv'
existing_urls = set()

# %% [markdown]
"""
## Data Collection Function
"""
# %%
def fetch_gdelt_articles(params, last_url=None):
    """Fetch articles from GDELT API with pagination support"""
    if last_url:
        params['lasturl'] = last_url
    
    try:
        response = requests.get(
            "https://api.gdeltproject.org/api/v2/doc/doc",
            params=params,
            timeout=30
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"API Error: {str(e)}")
        return None
    except ValueError as e:
        print(f"JSON Decode Error: {str(e)}")
        return None

# %% [markdown]
"""
## Main Collection Process
"""
# %%
# Initialize DataFrame if file doesn't exist
if os.path.exists(output_file):
    df = pd.read_csv(output_file)
    existing_urls = set(df['url'].tolist())
else:
    df = pd.DataFrame(columns=['date', 'title', 'url', 'source', 'language'])
    df.to_csv(output_file, index=False)

# Collection loop
last_url = None
progress_bar = tqdm(desc="Collecting articles")

while True:
    data = fetch_gdelt_articles(params, last_url)
    if not data or 'articles' not in data or len(data['articles']) == 0:
        break
        
    new_articles = []
    for article in data['articles']:
        if article['url'] not in existing_urls:
            new_articles.append({
                'date': article.get('date', ''),
                'title': article.get('title', ''),
                'url': article.get('url', ''),
                'source': article.get('source', ''),
                'language': article.get('language', '')
            })
            existing_urls.add(article['url'])
    
    if new_articles:
        pd.DataFrame(new_articles).to_csv(
            output_file, 
            mode='a', 
            header=not os.path.exists(output_file), 
            index=False
        )
    
    last_url = data['articles'][-1]['url']
    progress_bar.update(len(data['articles']))
    time.sleep(2)  # Respect API rate limits

progress_bar.close()
print("Data collection complete!")

# %% [markdown]
"""
## Data Verification
"""
# %%
# Load and verify collected data
final_df = pd.read_csv(output_file)
print(f"Total articles collected: {len(final_df)}")
print("\nSample data:")
final_df.head()

# %% [markdown]
"""
## Optional: Content Scraping
"""
# %%
def scrape_article_content(url):
    """Scrape full article content from URL"""
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Common content selectors
        selectors = [
            'article', 
            '.article-content', 
            '#main-content', 
            '.post-content'
        ]
        
        for selector in selectors:
            content = soup.select_one(selector)
            if content:
                return ' '.join(p.get_text() for p in content.find_all('p'))
        
        return None
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return None

# Only run if you need full content
if False:  # Change to True to enable scraping
    tqdm.pandas(desc="Scraping content")
    final_df['content'] = final_df['url'].progress_apply(scrape_article_content)
    final_df.to_csv('sri_lanka_crop_disease_with_content.csv', index=False)

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [77]:
response

<Response [400]>

In [31]:
print(search.keys())

dict_keys(['feed', 'entries'])


In [28]:
print(search['entries'])

[]


In [57]:
!pip install ntscraper

Collecting ntscraper
  Downloading ntscraper-0.4.0-py3-none-any.whl.metadata (7.4 kB)
Downloading ntscraper-0.4.0-py3-none-any.whl (12 kB)
Installing collected packages: ntscraper
Successfully installed ntscraper-0.4.0



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [68]:
from ntscraper import Nitter

# Initialize the scraper
scraper = Nitter(log_level=1,skip_instance_check=True)



In [69]:
# Example: Search for tweets related to agriculture in Sri Lanka
results = scraper.get_tweets("agriculture Sri Lanka", mode="term", number=50)

# Print results
for tweet in results['tweets']:
    print(f"{tweet['date']} - {tweet['user']['username']}: {tweet['text']}\n")



ValueError: No instance specified and instance check skipped

In [67]:
from ntscraper import Nitter

# Manually specify a known working instance
scraper = Nitter(instances="https://nitter.poast.org", skip_instance_check=True)

# Search for tweets
results = scraper.get_tweets("agriculture Sri Lanka", mode="term", number=50)

# Print result tweets
for tweet in results['tweets']:
    print(f"{tweet['date']} - @{tweet['user']['username']}: {tweet['text']}\n")


ValueError: No instance specified and instance check skipped

In [70]:
from ntscraper import Nitter

scraper = Nitter(instance="https://nitter.poast.org", skip_instance_check=True)


TypeError: Nitter.__init__() got an unexpected keyword argument 'instance'. Did you mean 'instances'?

In [83]:
import requests

# API endpoint for document search
url = "https://api.gdeltproject.org/api/v2/doc/doc"

params = {
    'query': '("crop disease" OR "plant pest") location:SriLanka',
    'format': 'json',
    'startdatetime': '20130101',
    'enddatetime': '20231231',
    'mode': 'artlist'  # Returns article list with metadata
}

response = requests.get(url, params=params)
articles = response.json()['articles']

for article in articles:
    print(f"{article['date']} | {article['title']} | {article['url']}")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [84]:
import requests

# API Endpoint
url = "https://api.gdeltproject.org/api/v2/doc/doc"

# Parameters
params = {
    'query': '(crop disease OR plant pest) location:SriLanka',
    'format': 'json',  # Can also use 'csv'
    'startdatetime': '20130101',  # From Jan 1, 2013
    'enddatetime': '20241231',    # To Dec 31, 2024
    'mode': 'artlist',  # Returns article metadata
    'maxrecords': 100   # Limit results (API allows up to 250 per call)
}

# Send request
response = requests.get(url, params=params)
data = response.json()

# Extract data
for article in data.get('articles', []):
    print(f"Date: {article['date']}")
    print(f"Headline: {article['title']}")
    print(f"URL: {article['url']}\n")

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [85]:
import requests
import json
from urllib.parse import quote

def get_gdelt_articles(query, start_date, end_date, max_records=100):
    """Safe GDELT API request with error handling"""
    url = "https://api.gdeltproject.org/api/v2/doc/doc"
    
    params = {
        'query': quote(query),  # URL-encode special characters
        'format': 'json',
        'startdatetime': start_date,
        'enddatetime': end_date,
        'mode': 'artlist',
        'maxrecords': min(max_records, 250)  # API limit is 250
    }
    
    try:
        # Add timeout and custom headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        response = requests.get(
            url, 
            params=params, 
            headers=headers,
            timeout=30
        )
        
        # Check for empty response
        if not response.content:
            print("Empty API response")
            return None
            
        data = response.json()
        
        # Check if articles exist
        if not data.get('articles'):
            print("No articles found")
            return None
            
        return data
        
    except json.JSONDecodeError:
        print(f"Failed to decode JSON. Raw response: {response.text[:200]}...")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {str(e)}")
        return None

# Example usage
data = get_gdelt_articles(
    query='(crop disease OR plant pest) location:SriLanka',
    start_date='20130101',
    end_date='20231231',
    max_records=100
)

if data:
    for article in data['articles']:
        print(f"{article['date']} | {article['title']} | {article['url']}")
else:
    print("No data received")

Failed to decode JSON. Raw response: Invalid query start date format (must be in YYYYMMDDHHMMSS format).
...
No data received


In [86]:
# Fallback to GDELT raw data files
import pandas as pd

def get_gdelt_raw_data(date):
    """Download raw GDELT data for specific date"""
    url = f"http://data.gdeltproject.org/gdeltv2/{date}.export.CSV.zip"
    try:
        df = pd.read_csv(url, sep='\t', header=None)
        # Filter for Sri Lanka (column 53 contains country codes)
        return df[df[53].str.contains('LKA', na=False)]
    except Exception as e:
        print(f"Failed to download {date}: {str(e)}")
        return None

In [87]:
import requests
import json
from urllib.parse import quote

def get_gdelt_articles(query, start_date, end_date, max_records=100):
    """Safe GDELT API request with error handling"""
    url = "https://api.gdeltproject.org/api/v2/doc/doc"
    
    params = {
        'query': quote(query),  # URL-encode special characters
        'format': 'json',
        'startdatetime': start_date,
        'enddatetime': end_date,
        'mode': 'artlist',
        'maxrecords': min(max_records, 250)  # API limit is 250
    }
    
    try:
        # Add timeout and custom headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        response = requests.get(
            url, 
            params=params, 
            headers=headers,
            timeout=30
        )
        
        # Check for empty response
        if not response.content:
            print("Empty API response")
            return None
            
        data = response.json()
        
        # Check if articles exist
        if not data.get('articles'):
            print("No articles found")
            return None
            
        return data
        
    except json.JSONDecodeError:
        print(f"Failed to decode JSON. Raw response: {response.text[:200]}...")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {str(e)}")
        return None

# Example usage
data = get_gdelt_articles(
    query='(crop disease OR plant pest) location:SriLanka',
    start_date='20130101',
    end_date='20231231',
    max_records=100
)

if data:
    for article in data['articles']:
        print(f"{article['date']} | {article['title']} | {article['url']}")
else:
    print("No data received")

Failed to decode JSON. Raw response: Invalid query start date format (must be in YYYYMMDDHHMMSS format).
...
No data received


In [88]:
import requests
import json
from urllib.parse import quote
from datetime import datetime

def get_gdelt_articles(query, start_date, end_date, max_records=100):
    """Safe GDELT API request with proper date formatting"""
    url = "https://api.gdeltproject.org/api/v2/doc/doc"
    
    # Convert dates to proper format (YYYYMMDD000000 if no time specified)
    start_datetime = f"{start_date}000000" if len(start_date) == 8 else start_date
    end_datetime = f"{end_date}235959" if len(end_date) == 8 else end_date
    
    params = {
        'query': quote(query),
        'format': 'json',
        'startdatetime': start_datetime,
        'enddatetime': end_datetime,
        'mode': 'artlist',
        'maxrecords': min(max_records, 250)
    }
    
    try:
        response = requests.get(
            url,
            params=params,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=30
        )
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print(f"Error: {str(e)}")
        print(f"API URL: {response.url}")  # Debug the actual request
        return None

# Example usage with proper dates
data = get_gdelt_articles(
    query='(crop disease OR plant pest) location:SriLanka',
    start_date='20130101',  # Will be converted to 20130101000000
    end_date='20231231',    # Will be converted to 20231231235959
    max_records=50
)

if data and data.get('articles'):
    for article in data['articles']:
        print(f"{article['date']} | {article['title']} | {article['url']}")
else:
    print("No results found. Try broadening your search.")

Error: Expecting value: line 1 column 1 (char 0)
API URL: https://api.gdeltproject.org/api/v2/doc/doc?query=%2528crop%2520disease%2520OR%2520plant%2520pest%2529%2520location%253ASriLanka&format=json&startdatetime=20130101000000&enddatetime=20231231235959&mode=artlist&maxrecords=50
No results found. Try broadening your search.


In [89]:
import requests
import json
from datetime import datetime

def get_gdelt_articles(query, start_date, end_date, max_records=100):
    """Working GDELT API request with proper encoding"""
    url = "https://api.gdeltproject.org/api/v2/doc/doc"
    
    params = {
        'query': query,  # Don't pre-encode the query
        'format': 'json',
        'startdatetime': f"{start_date}000000",
        'enddatetime': f"{end_date}235959",
        'mode': 'artlist',
        'maxrecords': min(max_records, 250),
        'sort': 'datedesc'  # Get newest articles first
    }
    
    try:
        response = requests.get(
            url,
            params=params,  # Let requests handle encoding
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=30
        )
        
        # Debug raw response if needed
        if not response.text.strip():
            print("Empty response from server")
            return None
            
        data = response.json()
        return data
        
    except json.JSONDecodeError:
        print(f"API returned non-JSON response. Status: {response.status_code}")
        print(f"Response text: {response.text[:200]}...")
        return None
    except Exception as e:
        print(f"Request failed: {str(e)}")
        return None

# Try these improved queries
queries_to_try = [
    'crop disease Sri Lanka',
    'plant pest Sri Lanka',
    'agriculture disease Sri Lanka',
    'Sri Lanka AND (blight OR pest OR disease)'
]

for query in queries_to_try:
    print(f"\nTrying query: '{query}'")
    data = get_gdelt_articles(
        query=query,
        start_date='20130101',
        end_date='20231231',
        max_records=50
    )
    
    if data and data.get('articles'):
        print(f"Found {len(data['articles'])} articles:")
        for article in data['articles'][:3]:  # Show first 3 results
            print(f" - {article['date']}: {article['title']}")
        break
    else:
        print("No results. Trying next query...")

if not data or not data.get('articles'):
    print("\nAll queries failed. Try these alternatives:")
    print("1. Use broader search terms")
    print("2. Try a shorter date range")
    print("3. Check https://blog.gdeltproject.org/api-debugging-tool/")


Trying query: 'crop disease Sri Lanka'
API returned non-JSON response. Status: 200
Response text: Invalid query start date.
...
No results. Trying next query...

Trying query: 'plant pest Sri Lanka'
API returned non-JSON response. Status: 200
Response text: Invalid query start date.
...
No results. Trying next query...

Trying query: 'agriculture disease Sri Lanka'
API returned non-JSON response. Status: 200
Response text: Invalid query start date.
...
No results. Trying next query...

Trying query: 'Sri Lanka AND (blight OR pest OR disease)'
API returned non-JSON response. Status: 200
Response text: Invalid query start date.
...
No results. Trying next query...

All queries failed. Try these alternatives:
1. Use broader search terms
2. Try a shorter date range
3. Check https://blog.gdeltproject.org/api-debugging-tool/


In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta

def fetch_gdelt_articles():
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
    
    # Break into smaller time periods to avoid API limits
    date_ranges = []
    current_date = datetime(2013, 1, 1)
    end_date = datetime.now()
    
    while current_date < end_date:
        next_date = current_date + timedelta(days=30)  # 1-month chunks
        date_ranges.append((
            current_date.strftime('%Y%m%d%H%M%S'),
            min(next_date, end_date).strftime('%Y%m%d%H%M%S')
        ))
        current_date = next_date
    
    all_articles = []
    
    for start, end in date_ranges:
        params = {
            'query': 'sri lanka (crop OR plant) (disease OR pest OR blight)',
            'format': 'json',
            'startdatetime': start,
            'enddatetime': end,
            'mode': 'artlist',
            'maxrecords': 100,
            'sort': 'datedesc'
        }
        
        try:
            response = requests.get(base_url, params=params, timeout=30)
            if response.status_code == 200:
                data = response.json()
                if 'articles' in data:
                    all_articles.extend(data['articles'])
                    print(f"Found {len(data['articles'])} articles from {start[:8]} to {end[:8]}")
                else:
                    print(f"No articles in {start[:8]}-{end[:8]}")
            else:
                print(f"API error for {start[:8]}-{end[:8]}: {response.text[:100]}")
            
            time.sleep(1)  # Be gentle with the API
            
        except Exception as e:
            print(f"Error processing {start[:8]}-{end[:8]}: {str(e)}")
    
    return all_articles

# Run the collection
articles = fetch_gdelt_articles()

if articles:
    df = pd.DataFrame(articles)[['date', 'title', 'url', 'source']]
    print(f"\nTotal articles collected: {len(df)}")
    print("\nSample results:")
    print(df.head())
    
    # Save to CSV
    df.to_csv('sri_lanka_crop_diseases.csv', index=False)
    print("\nSaved to sri_lanka_crop_diseases.csv")
else:
    print("No articles found. Try adjusting the query or date range.")

API error for 20130101-20130131: Please limit requests to one every 5 seconds or contact kalev.leetaru5@gmail.com for larger queries.
API error for 20130131-20130302: Please limit requests to one every 5 seconds or contact kalev.leetaru5@gmail.com for larger queries.
API error for 20130302-20130401: Please limit requests to one every 5 seconds or contact kalev.leetaru5@gmail.com for larger queries.
API error for 20130401-20130501: Please limit requests to one every 5 seconds or contact kalev.leetaru5@gmail.com for larger queries.
API error for 20130501-20130531: Please limit requests to one every 5 seconds or contact kalev.leetaru5@gmail.com for larger queries.
API error for 20130531-20130630: Please limit requests to one every 5 seconds or contact kalev.leetaru5@gmail.com for larger queries.
API error for 20130630-20130730: Please limit requests to one every 5 seconds or contact kalev.leetaru5@gmail.com for larger queries.
API error for 20130730-20130829: Please limit requests to one 

In [92]:
import requests
import pandas as pd
import time
from datetime import datetime, timedelta
import os
from tqdm import tqdm

# Configuration
OUTPUT_FILE = "sri_lanka_crop_diseases_2013_present.csv"
START_DATE = datetime(2013, 1, 1)
END_DATE = datetime.now()

## Phase 1: Polite API Requests (With Rate Limiting)
def fetch_gdelt_api():
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
    collected = []
    
    # Process quarterly instead of monthly to reduce requests
    current = START_DATE
    pbar = tqdm(desc="Fetching GDELT API Data", total=(END_DATE - START_DATE).days)
    
    while current < END_DATE:
        next_date = min(current + timedelta(days=90), END_DATE)  # 3-month chunks
        
        params = {
            'query': '(sri lanka OR lka) (crop OR plant) (disease OR pest OR blight)',
            'format': 'json',
            'startdatetime': current.strftime('%Y%m%d%H%M%S'),
            'enddatetime': next_date.strftime('%Y%m%d%H%M%S'),
            'mode': 'artlist',
            'maxrecords': 100,
            'sort': 'datedesc'
        }
        
        try:
            response = requests.get(base_url, params=params, timeout=30)
            if response.status_code == 200:
                data = response.json()
                if data.get('articles'):
                    collected.extend(data['articles'])
                    pbar.update((next_date - current).days)
            elif "limit requests" in response.text:
                tqdm.write(f"Rate limited at {current.date()}, switching to fallback...")
                break
                
        except Exception as e:
            tqdm.write(f"Error at {current.date()}: {str(e)}")
            
        current = next_date
        time.sleep(6)  # 6 seconds between requests (above their 5s requirement)
    
    pbar.close()
    return pd.DataFrame(collected)[['date', 'title', 'url', 'source']] if collected else None

## Phase 2: Direct File Download Fallback
def download_gdelt_raw():
    base_url = "http://data.gdeltproject.org/events/{year}.zip"
    collected = []
    
    for year in tqdm(range(START_DATE.year, END_DATE.year + 1), desc="Downloading Raw Data"):
        try:
            df = pd.read_csv(
                base_url.format(year=year),
                sep='\t', 
                header=None,
                compression='zip',
                encoding='latin1',
                usecols=[1, 2, 26, 27, 53]  # Date, Actor1, EventCode, URL, Country
            )
            # Filter for Sri Lanka and agricultural events
            filtered = df[(df[53] == 'LKA') & (df[26].astype(str).str.startswith('034'))]
            collected.append(filtered)
            time.sleep(2)  # Be gentle with their servers
        except Exception as e:
            tqdm.write(f"Error downloading {year}: {str(e)}")
    
    if collected:
        combined = pd.concat(collected)
        combined.columns = ['date', 'actor', 'event_code', 'url', 'country']
        return combined
    return None

## Phase 3: Local Sri Lankan Sources
def scrape_local_sources():
    sources = {
        "Agriculture Dept": "http://www.agridept.gov.lk",
        "FAO Sri Lanka": "http://www.fao.org/sri-lanka/news/en/",
        "Tea Research Inst": "http://www.tri.lk/publications/"
    }
    
    collected = []
    # Implement scraping logic for each source
    # (Would need custom selectors for each site)
    
    return pd.DataFrame(collected) if collected else None

## Main Execution
if __name__ == "__main__":
    # Try API first
    print("Attempting GDELT API collection...")
    api_df = fetch_gdelt_api()
    
    # Fallback to raw data if API fails
    if api_df is None or len(api_df) < 50:
        print("\nAPI results insufficient, trying raw data download...")
        raw_df = download_gdelt_raw()
    else:
        raw_df = None
    
    # Combine results
    final_df = pd.concat([df for df in [api_df, raw_df] if df is not None])
    
    if final_df is not None and len(final_df) > 0:
        print(f"\nCollected {len(final_df)} records")
        final_df.to_csv(OUTPUT_FILE, index=False)
        print(f"Saved to {OUTPUT_FILE}")
        
        # Show sample
        print("\nSample results:")
        print(final_df.head())
    else:
        print("\nFailed to collect data. Try:")
        print("1. Running again later (rate limits reset)")
        print("2. Using the direct download links manually")
        print("3. Contacting kalev.leetaru5@gmail.com for bulk access")

Attempting GDELT API collection...




Fetching GDELT API Data:   0%|                                                                | 0/4528 [00:00<?, ?it/s][A[A

[A[A                                                                                                                 
[A

[A[A
[A

Rate limited at 2013-01-01, switching to fallback...
Error at 2013-01-01: 'tqdm_notebook' object has no attribute 'container'


AttributeError: 'tqdm_notebook' object has no attribute 'container'

In [93]:
!pip install tqdm_notebook

ERROR: Could not find a version that satisfies the requirement tqdm_notebook (from versions: none)

[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for tqdm_notebook


In [3]:
import requests
import pandas as pd
import time
from datetime import datetime, timedelta
from tqdm.auto import tqdm  # Updated import for compatibility

# Configuration
OUTPUT_FILE = "sri_lanka_crop_diseases_final.csv"
START_DATE = datetime(2013, 1, 1)
END_DATE = datetime.now()

def fetch_gdelt_data():
    """Main function to collect crop disease data with proper rate limiting"""
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
    collected_data = []
    
    # Calculate time chunks (6-month intervals to reduce requests)
    date_chunks = []
    current_date = START_DATE
    while current_date < END_DATE:
        next_date = min(current_date + timedelta(days=180), END_DATE)  # 6 months
        date_chunks.append((current_date, next_date))
        current_date = next_date
    
    # Initialize progress bar
    pbar = tqdm(date_chunks, desc="Collecting GDELT Data")
    
    for start_date, end_date in pbar:
        pbar.set_postfix({
            'period': f"{start_date.date()} to {end_date.date()}",
            'collected': len(collected_data)
        })
        
        params = {
            'query': 'sri lanka (crop disease OR plant pest OR agricultural blight)',
            'format': 'json',
            'startdatetime': start_date.strftime('%Y%m%d000000'),
            'enddatetime': end_date.strftime('%Y%m%d235959'),
            'mode': 'artlist',
            'maxrecords': 100,
            'sort': 'datedesc'
        }
        
        try:
            response = requests.get(base_url, params=params, timeout=30)
            
            if response.status_code == 200:
                data = response.json()
                if data.get('articles'):
                    collected_data.extend(data['articles'])
            elif "limit requests" in response.text:
                tqdm.write(f"Rate limited at {start_date.date()}, slowing down...")
                time.sleep(10)  # Wait longer if rate limited
                continue
                
        except Exception as e:
            tqdm.write(f"Error processing {start_date.date()}: {str(e)}")
        
        time.sleep(6)  # 6 seconds between requests
    
    # Convert to DataFrame
    if collected_data:
        df = pd.DataFrame(collected_data)
        keep_columns = ['date', 'title', 'url', 'source', 'seendate']
        available_columns = [col for col in keep_columns if col in df.columns]
        return df[available_columns]
    return None

def save_and_show_results(df):
    """Save data and display summary"""
    if df is not None and len(df) > 0:
        # Clean date format
        df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
        
        # Save to CSV
        df.to_csv(OUTPUT_FILE, index=False)
        print(f"\nSuccessfully saved {len(df)} records to {OUTPUT_FILE}")
        
        # Show summary
        print("\nDate range covered:", df['date'].min(), "to", df['date'].max())
        print("\nTop sources:")
        print(df['source'].value_counts().head(10))
        
        print("\nSample results:")
        return df.head()
    else:
        print("No data collected. Try adjusting the query or date range.")
        return None

# Run the collection
if __name__ == "__main__":
    print("Starting data collection...")
    final_data = fetch_gdelt_data()
    results = save_and_show_results(final_data)
    
    if results is not None:
        print(results)

Starting data collection...


Collecting GDELT Data:   0%|          | 0/26 [00:00<?, ?it/s]

Rate limited at 2013-01-01, slowing down...
Rate limited at 2013-06-30, slowing down...
Rate limited at 2013-12-27, slowing down...
Rate limited at 2014-06-25, slowing down...
Rate limited at 2014-12-22, slowing down...
Rate limited at 2015-06-20, slowing down...
Rate limited at 2015-12-17, slowing down...


KeyboardInterrupt: 

In [1]:
import requests
import pandas as pd
import time
from datetime import datetime, timedelta
from tqdm.notebook import tqdm  # Progress bar for notebooks
import langdetect 
from langdetect import detect

#https://api.gdeltproject.org/api/v2/doc/doc?query=sri+lanka+agriculture&format=json&startdatetime=20230101000000&enddatetime=20230630235959&mode=artlist&maxrecords=10

In [2]:
def fetch_gdelt_articles(query, start_date, end_date, maxrecords=250):
    """
    Fetch articles from GDELT API for a given query and date range.

    Args:
        query (str): Search query (e.g., 'sri lanka agriculture').
        start_date (datetime): Start datetime.
        end_date (datetime): End datetime.
        maxrecords (int): Max records to fetch (max 25000, usually 100+ is good).

    Returns:
        list of articles (dict)
    """
    base_url = "https://api.gdeltproject.org/api/v2/doc/doc"
        
    params = {
        'query': query,
        'format': 'json',
        'startdatetime': start_date.strftime('%Y%m%d%H%M%S'),
        'enddatetime': end_date.strftime('%Y%m%d%H%M%S'),
        'mode': 'artlist',
        'maxrecords': maxrecords,
        'sort': 'datedesc'
    }

    # Add proper headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'application/json'
    }
    enriched_articles = []
    
    try:
        response = requests.get(base_url, params=params,headers=headers, timeout=30)
        if response.status_code == 200:
                data = response.json()
                articles = data.get('articles', [])

                for article in articles:
                    article['sourcecountry'] = "Sri Lanka"
                    article['language'] = "English"
                    if 'title' in article and detect(article['title']) == 'en':
                        enriched_articles.append(article)
                return enriched_articles
            
        
        elif response.status_code == 429:
            print("⚠️ Rate limited. Need to slow down.")
            return None
        
        else:
            print(f"Error: Received status code {response.status_code}")
            return None
    
    except Exception as e:
        print(f"Exception during API call: {e}")
        return None


In [3]:
def collect_gdelt_data(query, start_date, end_date, chunk_days=180, delay_sec=6):
    """
    Collect data in chunks of chunk_days with delay_sec seconds between requests.
    
    Returns:
        Pandas DataFrame with all articles collected.
    """
    all_articles = []
    
    current_start = start_date
    
    date_ranges = []
    while current_start < end_date:
        current_end = min(current_start + timedelta(days=chunk_days), end_date)
        date_ranges.append((current_start, current_end))
        current_start = current_end + timedelta(seconds=1)  # avoid overlap
    
    print(f"Collecting data in {len(date_ranges)} chunks of {chunk_days} days each.")
    
    for s, e in tqdm(date_ranges):
        print(f"\nFetching data from {s.date()} to {e.date()}")
        
        articles = fetch_gdelt_articles(query, s, e)
        
        if articles is None:
            print("Sleeping 30 seconds due to rate limiting or error...")
            time.sleep(30)
            # Retry once after wait
            articles = fetch_gdelt_articles(query, s, e)
            if articles is None:
                print("Skipping this chunk due to repeated error.")
                continue
        
        print(f"Received {len(articles)} articles.")
        all_articles.extend(articles)
        
        print(f"Sleeping {delay_sec} seconds before next request...")
        time.sleep(delay_sec)
    
    if all_articles:
        df = pd.DataFrame(all_articles)
        return df
    else:
        print("No articles collected.")
        return pd.DataFrame()


In [None]:
# Define your query and date range here
query = "Sri+Lanka+Faces+Crop+Crisis"
start_date = datetime(2020, 6, 1)
end_date = datetime(2023, 6, 30)

# Collect data
df = collect_gdelt_data(query,start_date, end_date, chunk_days=30, delay_sec=30)


Collecting data in 38 chunks of 30 days each.


  0%|          | 0/38 [00:00<?, ?it/s]


Fetching data from 2020-06-01 to 2020-07-01
Exception during API call: Expecting value: line 1 column 1 (char 0)
Sleeping 30 seconds due to rate limiting or error...
Exception during API call: Expecting value: line 1 column 1 (char 0)
Skipping this chunk due to repeated error.

Fetching data from 2020-07-01 to 2020-07-31
Exception during API call: Expecting value: line 1 column 1 (char 0)
Sleeping 30 seconds due to rate limiting or error...


In [4]:
df.head(150)

NameError: name 'df' is not defined

In [22]:
if not df.empty:
    # Convert 'date' column to datetime and format
    df['date'] = pd.to_datetime(df['seendate']).dt.strftime('%Y-%m-%d')
    
    print(f"\nCollected {len(df)} articles.")
    print(df[['date', 'title', 'domain', 'sourcecountry']].head())
    
    # Save to CSV
    df.to_csv("mine.csv", index=False)
    print("Saved to mine.csv")
else:
    print("No data to save.")



Collected 3345 articles.
         date                                              title  \
0  2020-07-02  15 Best Dry Shampoos for 2020 - Top Dry Shampo...   
1  2020-07-01                                    California Cool   
2  2020-07-01   The 15 Best New Beauty Products Dropping in July   
3  2020-07-01  15 Best Dry Shampoos for 2020 - Top Dry Shampo...   
4  2020-07-01                                         The Island   

              domain sourcecountry  
0           wtae.com     Sri Lanka  
1  winespectator.com     Sri Lanka  
2            msn.com     Sri Lanka  
3           wjcl.com     Sri Lanka  
4          island.lk     Sri Lanka  
Saved to mine.csv


In [None]:
def scrape_article(url):
    """Scrape full article content using newspaper3k"""
    article = {
        'url': url,
        'title': '',
        'text': '',
        'publish_date': None,
        'scrape_success': False
    }
    
    try:
        # Using newspaper3k (best for news articles)
        news_article = newspaper.Article(url)
        news_article.download()
        news_article.parse()
        
        article.update({
            'title': news_article.title,
            'text': news_article.text,
            'publish_date': news_article.publish_date,
            'scrape_success': True
        })
        
    except Exception as e:
        print(f"Failed to scrape {url}: {str(e)}")
    
    return article


In [None]:
import requests
from bs4 import BeautifulSoup
import newspaper
from datetime import datetime
import time
import json

# 3. Main function
def get_sri_lanka_news(query, days_back=7, max_articles=5):
    """Full pipeline to get and scrape Sri Lankan news"""
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back)
    
    print(f"Fetching URLs about '{query}' from {start_date.date()} to {end_date.date()}")
    urls = get_gdelt_urls(query, start_date, end_date, max_articles)
    
    if not urls:
        print("No articles found")
        return []
    
    print(f"Found {len(urls)} articles. Starting scraping...")
    
    all_articles = []
    for url in urls:
        article = scrape_article(url)
        if article['scrape_success']:
            all_articles.append(article)
        time.sleep(2)  # Be polite to websites
        
    print(f"Successfully scraped {len(all_articles)}/{len(urls)} articles")
    return all_articles

In [None]:
# Example usage
if __name__ == "__main__":
    # Get news about Sri Lankan agriculture
    articles = get_sri_lanka_news(
        query="agriculture OR farming",
        days_back=30,
        max_articles=5
    )
    
    # Save results
    with open('sri_lanka_agriculture_news.json', 'w') as f:
        json.dump(articles, f, indent=2, default=str)
    
    print(f"Saved {len(articles)} articles to JSON file")

In [41]:
import requests
import json

url = "https://api.gdeltproject.org/api/v2/doc/doc"

# Properly encoded parameters
params = {
    'query': '(crop disease OR plant pest OR agricultural blight) AND sri lanka',
    'format': 'json',
    'startdatetime': today.strftime('%Y%m%d000000'),
    'enddatetime': today.strftime('%Y%m%d235959'),
    'mode': 'artlist',
    'maxrecords': '10'  # String instead of integer
}

# Add proper headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'application/json'
}

try:
    response = requests.get(url, params=params, headers=headers, timeout=30)
    
    # Check if response is valid JSON
    if response.text.strip():
        data = response.json()
        if 'articles' in data:
    # Add sourcecountry and language fields to each article
            enriched_articles = []
            for article in data['articles']:
                article['sourcecountry'] = "Sri Lanka"
                article['language'] = "English"
                enriched_articles.append(article)
    
            print(json.dumps(enriched_articles, indent=2))
      
        else:
            print("Unexpected response format:", data)
    else:
        print("Empty response from server. Status code:", response.status_code)
        print("Response text:", response.text)
        
except json.JSONDecodeError:
    print("Failed to decode JSON. Raw response:")
    print(response.text)
except requests.exceptions.RequestException as e:
    print("Request failed:", str(e))

[
  {
    "url": "https://www.ft.lk/business/EDB-and-New-Zealand-explore-agri-exports-boost/34-776891",
    "url_mobile": "",
    "title": "EDB and New Zealand explore agri - exports boost",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
    "language": "English",
    "sourcecountry": "Sri Lanka"
  },
  {
    "url": "https://www.ft.lk/ft_view__editorial/Harnessing-Sri-Lanka-s-biodiversity-for-national-benefit/58-776864",
    "url_mobile": "",
    "title": "Harnessing Sri Lanka biodiversity for national benefit",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
    "language": "English",
    "sourcecountry": "Sri Lanka"
  },
  {
    "url": "https://www.ft.lk/front-page/NEXT-closure-exposes-antics-of-self-centred-trade-unionism/44-776902",
    "url_mobile": "",
    "title": "NEXT closure exposes antics of self - centred trade unionism",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
   

In [37]:
if 'articles' in data:
    print(json.dumps(data['articles'], indent=2))

[
  {
    "url": "https://www.ft.lk/business/EDB-and-New-Zealand-explore-agri-exports-boost/34-776891",
    "url_mobile": "",
    "title": "EDB and New Zealand explore agri - exports boost",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
    "language": "English",
    "sourcecountry": "Sri Lanka"
  },
  {
    "url": "https://www.ft.lk/ft_view__editorial/Harnessing-Sri-Lanka-s-biodiversity-for-national-benefit/58-776864",
    "url_mobile": "",
    "title": "Harnessing Sri Lanka biodiversity for national benefit",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
    "language": "English",
    "sourcecountry": "Sri Lanka"
  },
  {
    "url": "https://www.ft.lk/front-page/NEXT-closure-exposes-antics-of-self-centred-trade-unionism/44-776902",
    "url_mobile": "",
    "title": "NEXT closure exposes antics of self - centred trade unionism",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
   

In [39]:
if 'articles' in data:
    # Add sourcecountry and language fields to each article
    enriched_articles = []
    for article in data['articles']:
        article['sourcecountry'] = "Sri Lanka"
        article['language'] = "English"
        enriched_articles.append(article)
    
    print(json.dumps(enriched_articles, indent=2))


[
  {
    "url": "https://www.ft.lk/business/EDB-and-New-Zealand-explore-agri-exports-boost/34-776891",
    "url_mobile": "",
    "title": "EDB and New Zealand explore agri - exports boost",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
    "language": "English",
    "sourcecountry": "Sri Lanka"
  },
  {
    "url": "https://www.ft.lk/ft_view__editorial/Harnessing-Sri-Lanka-s-biodiversity-for-national-benefit/58-776864",
    "url_mobile": "",
    "title": "Harnessing Sri Lanka biodiversity for national benefit",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
    "language": "English",
    "sourcecountry": "Sri Lanka"
  },
  {
    "url": "https://www.ft.lk/front-page/NEXT-closure-exposes-antics-of-self-centred-trade-unionism/44-776902",
    "url_mobile": "",
    "title": "NEXT closure exposes antics of self - centred trade unionism",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
   

In [1]:
import requests
import json
from datetime import datetime, timedelta
import time
import html
import langdetect  # Install with: pip install langdetect

# Filter only English articles
from langdetect import detect

# Setup
n = 3 # Target number of articles
enriched_articles = []
maxrecords = 10  # GDELT API allows up to 250, but use 10 for lighter requests
days_back = 0  # Start from today

# Headers and endpoint
url = "https://api.gdeltproject.org/api/v2/doc/doc"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept': 'application/json'
}

while len(enriched_articles) < n:
    # Define date range (1-day window, going backward)
    today = datetime.utcnow() - timedelta(days=days_back)
    start_date = today.replace(hour=0, minute=0, second=0)
    end_date = today.replace(hour=23, minute=59, second=59)

    params = {
        'query': '(crop disease OR plant pest OR agricultural blight) AND sri lanka',
        'format': 'json',
        'startdatetime': start_date.strftime('%Y%m%d%H%M%S'),
        'enddatetime': end_date.strftime('%Y%m%d%H%M%S'),
        'mode': 'artlist',
        'maxrecords': str(maxrecords)
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=30)
        if response.text.strip():
            data = response.json()
            articles = data.get('articles', [])

            for article in articles:
                article['sourcecountry'] = "Sri Lanka"
                article['language'] = "English"
                if 'title' in article and detect(article['title']) == 'en':
                    enriched_articles.append(article)
                    
            print(f"✅ Collected {len(enriched_articles)} articles (Date: {today.date()})")

        else:
            print(f"⚠️ Empty response on {today.date()}: Status {response.status_code}")

    except json.JSONDecodeError:
        print(f"❌ JSON decode error on {today.date()}")
    except requests.exceptions.RequestException as e:
        print(f"❌ Request failed: {e}")

    days_back += 1  # Go further back in time
    time.sleep(6)  # Respect API rate limit

# Trim to exactly `n`
enriched_articles = enriched_articles[:n]
print(f"\n🎉 Done! Collected {len(enriched_articles)} enriched articles.")

# Optional: Print first 2 articles
print(json.dumps(enriched_articles[:10], indent=2))


  today = datetime.utcnow() - timedelta(days=days_back)


✅ Collected 4 articles (Date: 2025-05-26)

🎉 Done! Collected 3 enriched articles.
[
  {
    "url": "https://www.ft.lk/business/EDB-and-New-Zealand-explore-agri-exports-boost/34-776891",
    "url_mobile": "",
    "title": "EDB and New Zealand explore agri - exports boost",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
    "language": "English",
    "sourcecountry": "Sri Lanka"
  },
  {
    "url": "https://www.ft.lk/ft_view__editorial/Harnessing-Sri-Lanka-s-biodiversity-for-national-benefit/58-776864",
    "url_mobile": "",
    "title": "Harnessing Sri Lanka biodiversity for national benefit",
    "seendate": "20250526T010000Z",
    "socialimage": "",
    "domain": "ft.lk",
    "language": "English",
    "sourcecountry": "Sri Lanka"
  },
  {
    "url": "https://www.ft.lk/front-page/NEXT-closure-exposes-antics-of-self-centred-trade-unionism/44-776902",
    "url_mobile": "",
    "title": "NEXT closure exposes antics of self - centred trade unionism",
   

In [47]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     -------------------------------------- 981.5/981.5 kB 4.2 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (pyproject.toml): started
  Building wheel for langdetect (pyproject.toml): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993363 sha256=ca9d5abeacf11994b1eb4


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
