Date: 14 Nov, 2024

In [1]:
import sys
from dotenv import load_dotenv

# Add the parent directory to the import path so modules located there can be imported from this notebook.
sys.path.append("..")
# Load variables from ../.env into the environment; presumably this is where the
# WRI_API_KEY / WRI_BEARER_TOKEN values read later via os.getenv come from.
load_dotenv("../.env")

True

In [2]:
import os
import pandas as pd
import requests
from pprint import pprint as pp

In [3]:
# Request headers for the API calls below; both credentials are read from
# environment variables rather than being hardcoded in the notebook.
headers = {}
headers["x-api-key"] = os.getenv("WRI_API_KEY")
headers["Authorization"] = "Bearer {}".format(os.getenv("WRI_BEARER_TOKEN"))

### Create a catalog for datasets that have metadata

In [4]:
url = "https://data-api.globalforestwatch.org/datasets"
# Bound the request so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of storing an error payload as `datasets`.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
datasets = response.json()

In [5]:
def format_dataset_metadata(dataset):
    """
    Formats dataset metadata into a readable string.

    Each present (truthy) field is rendered as one ``Label: value`` line, in
    the fixed order title, overview, cautions, function, geographic coverage,
    tags. Fields that are missing or empty are skipped.

    Args:
        dataset (dict): Dictionary containing dataset information with metadata

    Returns:
        str: Formatted metadata string, or None if the dataset has no metadata,
            the metadata has no overview, or an unexpected error occurs while
            processing (the error is printed, not raised).
    """
    try:
        metadata = dataset.get("metadata")
        if not metadata or not metadata.get("overview"):
            return None

        # Define the fields to include and their labels
        fields = [
            ("title", "Title"),
            ("overview", "Overview"),
            ("cautions", "Caution"),
            ("function", "Function"),
            ("geographic_coverage", "Geographic Coverage"),
            ("tags", "Tags")
        ]

        # Build the content string
        content_parts = []
        for field_name, label in fields:
            value = metadata.get(field_name)
            if not value:
                continue
            if field_name == "tags" and isinstance(value, list):
                # Coerce each tag to str: a single non-string tag would make
                # str.join raise and cause the whole dataset to be dropped.
                value = ", ".join(str(tag) for tag in value)
            content_parts.append(f"{label}: {value}")

        return "\n".join(content_parts)

    except Exception as e:
        # Best-effort: one malformed record should not abort the whole catalog.
        print(f"Error processing dataset metadata: {e}")
        return None

def save_datasets_to_csv(datasets, output_file):
    """
    Saves dataset information to a CSV file using pandas.

    Iterates over ``datasets["data"]``, formats each entry with
    ``format_dataset_metadata`` and keeps only entries that have both a
    dataset id and formatted content. The result is written with the columns
    ``dataset`` and ``content``.

    Args:
        datasets (dict): Dictionary containing dataset information
        output_file (str): Name of the output CSV file. Missing parent
            directories are created before writing.

    Returns:
        pandas.DataFrame: The rows that were written, or None if any error
            occurred (the error is printed, not raised).
    """
    try:
        # Collect one record per usable dataset
        records = []
        for dataset in datasets["data"]:
            dataset_id = dataset.get("dataset")
            formatted_content = format_dataset_metadata(dataset)

            if dataset_id and formatted_content:
                records.append({
                    'dataset': dataset_id,
                    'content': formatted_content
                })

        # Explicit columns keep the header stable even when no record qualifies
        df = pd.DataFrame(records, columns=['dataset', 'content'])

        # to_csv does not create missing folders, so make sure the target
        # directory exists (only meaningful for path-like destinations).
        if isinstance(output_file, (str, os.PathLike)):
            parent_dir = os.path.dirname(output_file)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        # Save to CSV
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Successfully saved to {output_file}")

        return df  # Return DataFrame for potential further analysis

    except Exception as e:
        print(f"Error saving CSV file: {e}")
        return None

In [6]:
# Example usage: build the catalog from the API response and write it to CSV.
# save_datasets_to_csv returns None on any error, so the cells below assume it succeeded.
df = save_datasets_to_csv(datasets, "../data/wri-datasets.csv")

Successfully saved to ../data/wri-datasets.csv


In [7]:
# Spot-check one formatted entry. Position 70 is an arbitrary pick and raises
# IndexError if the catalog has fewer than 71 rows.
print(df.iloc[70].content)

Title: Deforestation alerts (GLAD-S2)
Overview: This data set is a forest loss alert product developed by the [GLAD](https://glad.geog.umd.edu/) (Global Land Analysis and Discovery) lab at the University of Maryland. GLAD-S2 alerts utilize data from the European Space Agency’s Sentinel-2 mission, which provides optical imagery at a 10m spatial resolution with a 5-day revisit time. The shorter revisit time, when compared to GLAD Landsat alerts, reduces the time to detect forest loss and between the initial detection of forest loss and classification as high confidence. This is particularly advantageous in wet and tropical regions, where persistent cloud cover may delay detections for weeks to months. GLAD-S2 alerts are available for primary forests in the Amazon basin from January 1st 2019 to present, updated daily.<br><br>New Sentinel-2 images are analyzed as soon as they are acquired. Cloud, shadow, and water are filtered out of each new image, and a forest loss algorithm is applied t

### Create a vectorstore for WRI datasets

In [8]:
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

In [9]:
# Directory where the Chroma collection is persisted.
# NOTE: `db` is rebound to a Chroma instance in a later cell, so re-running the
# indexing cell after that point would pass the wrong object as persist_directory.
db = "chroma_db"
os.makedirs(db, exist_ok=True)

In [10]:
# Embedding function shared by the index build and the later reopening of the
# persisted store, so stored vectors and queries use the same model.
embedder = OllamaEmbeddings(model="nomic-embed-text")

In [11]:
# Parallel lists for the vector store: document text, the dataset id it came
# from (kept as metadata), and a positional document id.
texts = df['content'].tolist()
metadatas = [dict(dataset=name) for name in df['dataset'].tolist()]
ids = ["doc_%d" % position for position in range(len(texts))]

In [12]:
%%time
# Embed every catalog entry and store it, with its metadata and id, in a Chroma
# collection persisted under `db`. Timed because this is the slow step.
vectorstore = Chroma.from_texts(
    texts=texts,
    embedding=embedder,
    metadatas=metadatas,
    ids=ids,
    persist_directory=db
)

CPU times: user 295 ms, sys: 21.8 ms, total: 317 ms
Wall time: 12.2 s


In [14]:
# Reopen the persisted collection from disk with the same embedding function.
# NOTE: this rebinds `db`, which earlier held the directory path string; from
# here on `db` is the Chroma store.
db = Chroma(
        persist_directory="chroma_db", 
        embedding_function=embedder
)

In [15]:
# Search parameters such as the number of results must be passed through
# `search_kwargs`; a bare `k=5` keyword is not a retriever field and does not
# set the result count.
retriver = db.as_retriever(search_kwargs={"k": 5})

In [16]:
# NOTE(review): whether the extra `k=3` keyword is honoured at invoke time
# depends on the installed langchain-core version — confirm via len(docs).
docs = retriver.invoke("I am interested in preserving mangroves in Argentina", k=3)

In [17]:
# Show each hit: its metadata dict first, then the stored text on the next line.
for hit in docs:
    print(hit.metadata, hit.page_content, sep="\n")

{'dataset': 'arg_otbn_forest_loss'}
Title: Argentinian National Monitoring System of Native Forests
Overview: The Argentinian National Monitoring System of Native Forests quantifies deforestation in Argentina’s native forests since 2007. The dataset was initially created to contribute to the implementation of Argentina’s Native Forest Law (Law N°26331), which seeks to establish the minimum environmental protections for the conservation, restoration, and management of Argentina’s native forests. Additionally, the National Monitoring System of Native Forests aids in ensuring Argentina’s compliance with international agreements on climate change and forest protection.
Caution: The original data of the National Monitoring System of Native Forests is not completely annualized and reports forest loss for certain Argentinean regions pre-2017 in a series of date ranges that span multiple years. To visualize this data, multi-year ranges were transformed into annualized values by assigning the d