# Climate Policy Dataset Analysis Setup

This notebook helps you test your Python environment and get started with analyzing the GCCMPD climate policy dataset.

In [None]:
# Install essential packages
%pip install numpy pandas matplotlib plotly scikit-learn numba llvmlite>=0.41.0

In [3]:
# Test core scientific computing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

print("✅ Core data science packages imported successfully")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")

✅ Core data science packages imported successfully
NumPy version: 1.26.4
Pandas version: 2.3.1


In [4]:
# Test machine learning packages
try:
    from sklearn import __version__ as sklearn_version
    import numba
    print(f"✅ Scikit-learn version: {sklearn_version}")
    print(f"✅ Numba version: {numba.__version__}")
except ImportError as e:
    print(f"❌ Error importing ML packages: {e}")

✅ Scikit-learn version: 1.7.1
✅ Numba version: 0.61.2


In [7]:
# Install NLP packages
%pip install nltk spacy transformers sentence-transformers

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting spacy
  Using cached spacy-3.8.7-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting transformers
  Using cached transformers-4.55.0-py3-none-any.whl.metadata (39 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting click (from nltk)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached regex-2025.7.34-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.0.13-cp312-cp31

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.6 which is incompatible.


   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ---------------------------------- ----- 40/46 [transformers]
   ----------------------

In [8]:
# Test NLP packages
try:
    import nltk
    import spacy
    from transformers import __version__ as transformers_version
    print(f"✅ NLTK version: {nltk.__version__}")
    print(f"✅ spaCy version: {spacy.__version__}")
    print(f"✅ Transformers version: {transformers_version}")
except ImportError as e:
    print(f"❌ Error importing NLP packages: {e}")

  from .autonotebook import tqdm as notebook_tqdm


✅ NLTK version: 3.9.1
✅ spaCy version: 3.8.7
✅ Transformers version: 4.55.0


In [11]:
# Install topic modeling packages
%pip install bertopic umap-learn hdbscan

Collecting bertopic
  Using cached bertopic-0.17.3-py3-none-any.whl.metadata (24 kB)
Collecting umap-learn
  Using cached umap_learn-0.5.9.post2-py3-none-any.whl.metadata (25 kB)
Collecting hdbscan
  Using cached hdbscan-0.8.40-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Using cached pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Using cached bertopic-0.17.3-py3-none-any.whl (153 kB)
Using cached umap_learn-0.5.9.post2-py3-none-any.whl (90 kB)
Using cached hdbscan-0.8.40-cp312-cp312-win_amd64.whl (726 kB)
Using cached pynndescent-0.5.13-py3-none-any.whl (56 kB)
Installing collected packages: pynndescent, hdbscan, umap-learn, bertopic

   ---------------------------------------- 0/4 [pynndescent]
   ---------------------------------------- 0/4 [pynndescent]
   ---------- ----------------------------- 1/4 [hdbscan]
   ---------- ----------------------------- 1/4 [hdbscan]
   ---------- ----------------------------- 1/4 [hdbscan]
   --

In [13]:
# Test topic modeling packages
try:
    from bertopic import BERTopic
    import umap
    import hdbscan
    print("✅ BERTopic imported successfully")
    print(f"✅ UMAP version: {umap.__version__}")
    
    # HDBSCAN version check (some versions don't have __version__)
    try:
        print(f"✅ HDBSCAN version: {hdbscan.__version__}")
    except AttributeError:
        print("✅ HDBSCAN imported successfully (version info not available)")
        
except ImportError as e:
    print(f"❌ Error importing topic modeling packages: {e}")

✅ BERTopic imported successfully
✅ UMAP version: 0.5.9.post2
✅ HDBSCAN imported successfully (version info not available)


In [10]:
# Test PyTorch
try:
    import torch
    print(f"✅ PyTorch version: {torch.__version__}")
    print(f"✅ CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"✅ CUDA version: {torch.version.cuda}")
except ImportError as e:
    print(f"❌ Error importing PyTorch: {e}")

✅ PyTorch version: 2.8.0+cpu
✅ CUDA available: False


In [None]:
# Load and preview climate policy data
import os

data_dir = 'data'
if os.path.exists(data_dir):
    data_files = [f for f in os.listdir(data_dir) if f.endswith('.xlsx')]
    print(f"Found {len(data_files)} Excel files in data directory:")
    for file in data_files[:10]:  # Show first 10 files
        print(f"  - {file}")
    
    if data_files:
        # Try to load the main policy dataset
        main_files = [f for f in data_files if 'ALL_POLICIES' in f.upper()]
        if main_files:
            print(f"\nLoading main policy dataset: {main_files[0]}")
            try:
                df = pd.read_excel(os.path.join(data_dir, main_files[0]))
                print(f"✅ Dataset loaded successfully! Shape: {df.shape}")
                print(f"\nColumns: {list(df.columns)}")
                print(f"\nFirst few rows:")
                display(df.head())
            except Exception as e:
                print(f"❌ Error loading dataset: {e}")
else:
    print("❌ Data directory not found")