# GDELT Demo Notebook

This notebook demonstrates working with GDELT (Global Database of Events, Language, and Tone) data for graph analysis.


In [26]:
# Configuration variables -- edit these two values for your own environment.
GCP_PROJECT_ID = "graph-demo-471710"  # Replace with your actual GCP project ID
BIGQUERY_DATASET = "gdelt"  # Replace with your actual BigQuery dataset name

# Derived variables: fully-qualified dataset id in "<project>.<dataset>" form.
BIGQUERY_DATASET_ID = ".".join((GCP_PROJECT_ID, BIGQUERY_DATASET))

# Echo the effective configuration, one setting per line.
print(
    "Configuration loaded:",
    f"  GCP Project: {GCP_PROJECT_ID}",
    f"  BigQuery Dataset: {BIGQUERY_DATASET}",
    f"  Full Dataset ID: {BIGQUERY_DATASET_ID}",
    sep="\n",
)


Configuration loaded:
  GCP Project: graph-demo-471710
  BigQuery Dataset: gdelt
  Full Dataset ID: graph-demo-471710.gdelt


In [27]:
# GCP Authentication Setup
import subprocess
import os
import shutil
from google.auth import default
from google.auth.exceptions import DefaultCredentialsError

def setup_gcp_authentication():
    """Set up Application Default Credentials (ADC) for GCP_PROJECT_ID.

    Flow:
      1. Reuse the existing ADC when google.auth.default() reports a project
         equal to GCP_PROJECT_ID.
      2. Otherwise verify the gcloud CLI is available, delete the local ADC
         file, point gcloud at GCP_PROJECT_ID, run the interactive
         `gcloud auth application-default login`, set the quota project
         (best effort) and load the fresh credentials.

    Side effects:
        May delete ~/.config/gcloud/application_default_credentials.json,
        changes the active gcloud project, may open a browser window, and sets
        os.environ['GOOGLE_CLOUD_PROJECT'].

    Returns:
        tuple: (credentials, GCP_PROJECT_ID) on success; (None, None) when the
        gcloud CLI is missing, a gcloud command fails, or any other exception
        is raised along the way.
    """
    print("üîê Setting up GCP Authentication...")
    
    try:
        # Step 1: Try to use existing credentials first
        print("üîç Checking for existing credentials...")
        try:
            credentials, default_project = default()
            print(f"‚úÖ Found existing credentials for project: {default_project}")
            
            # If the project matches, we're good
            if default_project == GCP_PROJECT_ID:
                print(f"üéØ Project matches target project: {GCP_PROJECT_ID}")
                os.environ['GOOGLE_CLOUD_PROJECT'] = GCP_PROJECT_ID
                return credentials, GCP_PROJECT_ID
            else:
                print(f"‚ö†Ô∏è  Project mismatch: {default_project} vs {GCP_PROJECT_ID}")
                print("üîÑ Will re-authenticate with correct project...")
        except DefaultCredentialsError:
            print("‚ùå No existing credentials found")
            print("üîÑ Will authenticate from scratch...")
        
        # Precondition: every remaining step shells out to gcloud. Check for it
        # *before* the destructive step below, so a missing CLI does not leave
        # the user with their old credentials deleted and no way to recreate them.
        if shutil.which('gcloud') is None:
            print("‚ùå Error: gcloud CLI not found on PATH (existing credentials were left untouched)")
            print("   Install the Google Cloud SDK, then run this cell again")
            return None, None
        
        # Step 2: Clear old credentials if needed
        print("üóëÔ∏è  Clearing old credentials...")
        adc_path = os.path.expanduser("~/.config/gcloud/application_default_credentials.json")
        if os.path.exists(adc_path):
            os.remove(adc_path)
            print("‚úÖ Removed old application default credentials")
        
        # Step 3: Set the correct project
        print(f"üéØ Setting gcloud project to: {GCP_PROJECT_ID}")
        subprocess.run(['gcloud', 'config', 'set', 'project', GCP_PROJECT_ID], 
                       capture_output=True, text=True, check=True)
        print("‚úÖ Project set successfully")
        
        # Step 4: Re-authenticate
        print("üîÑ Re-authenticating with application default credentials...")
        print("   This will open a browser window for authentication...")
        
        # Output is intentionally not captured so the login URL/prompt is visible.
        subprocess.run(['gcloud', 'auth', 'application-default', 'login'], 
                       check=True)
        print("‚úÖ Re-authentication successful")
        
        # Step 5: Set quota project to avoid warnings
        print("üí∞ Setting quota project...")
        try:
            subprocess.run(['gcloud', 'auth', 'application-default', 'set-quota-project', GCP_PROJECT_ID], 
                           capture_output=True, text=True, check=True)
            print("‚úÖ Quota project set successfully")
        except (subprocess.CalledProcessError, OSError) as quota_error:
            # Best effort only: keep going, but report why it failed. Catching
            # specific exceptions lets KeyboardInterrupt/SystemExit propagate.
            print("‚ö†Ô∏è  Could not set quota project (this is usually fine)")
            reason = getattr(quota_error, 'stderr', None) or quota_error
            print(f"   Reason: {str(reason).strip()}")
        
        # Step 6: Verify the setup
        print("üß™ Verifying authentication...")
        # NOTE: the project reported here can still differ from GCP_PROJECT_ID;
        # the function returns GCP_PROJECT_ID regardless of what default() reports.
        credentials, project = default()
        print(f"‚úÖ Authentication successful - Project: {project}")
        
        # Set environment variable
        os.environ['GOOGLE_CLOUD_PROJECT'] = GCP_PROJECT_ID
        print(f"üåç Set GOOGLE_CLOUD_PROJECT environment variable to: {GCP_PROJECT_ID}")
        
        return credentials, GCP_PROJECT_ID
        
    except subprocess.CalledProcessError as e:
        # A required gcloud command exited non-zero: show the manual fallback.
        print(f"‚ùå Command failed: {e}")
        print("üí° Manual steps required:")
        print(f"   1. gcloud config set project {GCP_PROJECT_ID}")
        print("   2. gcloud auth application-default login")
        print(f"   3. gcloud auth application-default set-quota-project {GCP_PROJECT_ID}")
        return None, None
    except Exception as e:
        print(f"‚ùå Error: {e}")
        return None, None

# Run authentication setup.
# Both results are bound at module level: test_gcp_connectivity() reads
# `credentials` and `authenticated_project` as globals, and both are None when
# setup_gcp_authentication() reports a failure.
credentials, authenticated_project = setup_gcp_authentication()


üîê Setting up GCP Authentication...
üîç Checking for existing credentials...




‚úÖ Found existing credentials for project: graph-demo
‚ö†Ô∏è  Project mismatch: graph-demo vs graph-demo-471710
üîÑ Will re-authenticate with correct project...
üóëÔ∏è  Clearing old credentials...
‚úÖ Removed old application default credentials
üéØ Setting gcloud project to: graph-demo-471710
‚úÖ Project set successfully
üîÑ Re-authenticating with application default credentials...
   This will open a browser window for authentication...


Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=4oQms9dSVKQvNG95uvSfnyfGBSxNkS&access_type=offline&code_challenge=oMXLu2Z3305ktTMGqeVJlHQ_18fZ3JMa-Uy2axu8wVk&code_challenge_method=S256


Credentials saved to file: [/Users/johnswain/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "graph-demo-471710" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


‚úÖ Re-authentication successful
üí∞ Setting quota project...
‚úÖ Quota project set successfully
üß™ Verifying authentication...
‚úÖ Authentication successful - Project: graph-demo
üåç Set GOOGLE_CLOUD_PROJECT environment variable to: graph-demo-471710


In [28]:
# Import required libraries
import os
import pandas as pd
from google.cloud import bigquery
from google.cloud import storage
import json
from datetime import datetime

# Reached only if every import above succeeded (a failed import raises first).
print("Libraries imported successfully!")


Libraries imported successfully!


In [29]:
# Test GCP connectivity
def test_gcp_connectivity():
    """Test basic connectivity to GCP services.

    Reads the module-level `credentials` and `authenticated_project` (produced
    by the authentication cell) and `BIGQUERY_DATASET`, then runs three checks
    in order, stopping at the first failure:

      1. BigQuery: run a trivial `SELECT 1` query.
      2. BigQuery dataset: fetch BIGQUERY_DATASET and list its tables.
      3. Cloud Storage: list the project's buckets.

    Returns:
        bool: True when all three checks pass; False when authentication is
        missing or any check raises.
    """
    print("üîç Testing GCP connectivity...")
    
    # Check if authentication was successful
    if not credentials or not authenticated_project:
        print("‚ùå Authentication required - please run the authentication cell first")
        return False
    
    print(f"‚úÖ Using authenticated project: {authenticated_project}")
    
    # Test 1: Test BigQuery connectivity
    try:
        # Use explicit credentials and project
        client = bigquery.Client(credentials=credentials, project=authenticated_project)
        print(f"üîó BigQuery client created for project: {client.project}")
        
        # Simple query to test connectivity
        query = "SELECT 1 as test_value"
        result = client.query(query).result()
        for row in result:
            print(f"‚úÖ BigQuery connectivity successful - Test query result: {row.test_value}")
            break  # Only need first row
    except Exception as e:
        error_str = str(e)
        if "has been deleted" in error_str or "USER_PROJECT_DENIED" in error_str:
            # These error texts are treated as a sign of stale/mismatched credentials.
            print("‚ùå BigQuery connectivity failed: Project mismatch detected")
            print(f"   Error: {e}")
            print("üîß This usually means your credentials are cached for a different project")
            print("   üí° Try running the authentication cell again")
            print("   üìã Or manually run: gcloud auth application-default login")
        else:
            print(f"‚ùå BigQuery connectivity failed: {e}")
        return False
    
    # Test 2: Test BigQuery dataset access
    try:
        # Reuse the client from test 1 (always bound here: a failure above
        # returns early). DatasetReference replaces the deprecated
        # Client.dataset() helper.
        dataset_ref = bigquery.DatasetReference(authenticated_project, BIGQUERY_DATASET)
        client.get_dataset(dataset_ref)  # raises if the dataset is missing/inaccessible
        print(f"‚úÖ BigQuery dataset '{BIGQUERY_DATASET}' accessible")
        
        # List tables in the dataset
        tables = list(client.list_tables(dataset_ref))
        print(f"üìä Found {len(tables)} tables in dataset")
        for table in tables[:5]:  # Show first 5 tables
            print(f"   - {table.table_id}")
        if len(tables) > 5:
            print(f"   ... and {len(tables) - 5} more tables")
            
    except Exception as e:
        print(f"‚ùå BigQuery dataset access failed: {e}")
        print(f"   Make sure dataset '{BIGQUERY_DATASET}' exists in project '{authenticated_project}'")
        return False
    
    # Test 3: Test Cloud Storage connectivity
    try:
        storage_client = storage.Client(credentials=credentials, project=authenticated_project)
        # List buckets to test connectivity
        buckets = list(storage_client.list_buckets())
        print(f"‚úÖ Cloud Storage connectivity successful - Found {len(buckets)} buckets")
    except Exception as e:
        print(f"‚ùå Cloud Storage connectivity failed: {e}")
        return False
    
    print("üéâ All GCP connectivity tests passed!")
    return True

# Run the connectivity test.
# Left as the cell's last expression, so the returned boolean is displayed as
# the cell output.
test_gcp_connectivity()


üîç Testing GCP connectivity...
‚úÖ Using authenticated project: graph-demo-471710
üîó BigQuery client created for project: graph-demo-471710
‚úÖ BigQuery connectivity successful - Test query result: 1
‚úÖ BigQuery dataset 'gdelt' accessible
üìä Found 0 tables in dataset
‚úÖ Cloud Storage connectivity successful - Found 1 buckets
üéâ All GCP connectivity tests passed!


True

In [None]:
# Ready for GDELT analysis!
# Closing banner: repeats the configured project and dataset, one item per line.
print(
    "üéâ Setup complete! Ready to work with GDELT data.",
    f"üìä Project: {GCP_PROJECT_ID}",
    f"üóÑÔ∏è  Dataset: {BIGQUERY_DATASET}",
    "üöÄ You can now run queries against your GDELT data!",
    sep="\n",
)
