Date: 14 Nov, 2024

In [None]:
import sys
from dotenv import load_dotenv

# Load environment variables (the API credentials read below) from the
# parent directory's .env file.
load_dotenv("../.env")
# Make modules in the parent directory importable from this notebook.
sys.path.append("..")

In [None]:
import os
import pandas as pd
import requests
from pprint import pprint as pp

In [None]:
# Credentials come from the environment; both values are sent as headers
# with the data API request(s) below.
api_key = os.getenv("WRI_API_KEY")
bearer_token = os.getenv("WRI_BEARER_TOKEN")
headers = {
    "x-api-key": api_key,
    "Authorization": f"Bearer {bearer_token}",
}

### Create a catalog for datasets that have metadata

In [None]:
# Fetch the dataset listing. A timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors (e.g. bad
# credentials) here instead of letting an error payload be treated as data.
url = "https://data-api.globalforestwatch.org/datasets"
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
datasets = response.json()

In [None]:
def format_dataset_metadata(dataset):
    """
    Formats dataset metadata into a readable string.

    Each populated field is rendered as "<Label>: <value>" on its own line,
    in the order: title, overview, cautions, function, geographic coverage,
    tags. Missing or empty fields are skipped.

    Args:
        dataset (dict): Dictionary containing dataset information with a
            "metadata" entry.

    Returns:
        str: Formatted metadata string, or None if the dataset has no
            metadata, no overview, or is malformed (e.g. not a dict).
    """
    # Define the fields to include and their labels
    fields = [
        ("title", "Title"),
        ("overview", "Overview"),
        ("cautions", "Caution"),
        ("function", "Function"),
        ("geographic_coverage", "Geographic Coverage"),
        ("tags", "Tags"),
    ]

    try:
        metadata = dataset.get("metadata")
        # The overview is the one required field; skip datasets without it.
        if not metadata or not metadata.get("overview"):
            return None

        # Build the content string
        content_parts = []
        for field_name, label in fields:
            value = metadata.get(field_name)
            if field_name == "tags" and isinstance(value, list):
                # Stringify each tag and drop nulls so a single non-string
                # entry cannot make str.join raise and discard the dataset.
                value = ", ".join(str(tag) for tag in value if tag is not None)
            if value:
                content_parts.append(f"{label}: {value}")

        return "\n".join(content_parts)

    except (AttributeError, TypeError) as e:
        # Malformed input, e.g. `dataset` or its metadata is not a dict.
        print(f"Error processing dataset metadata: {e}")
        return None

def save_datasets_to_csv(datasets, output_file):
    """
    Saves dataset information to a CSV file using pandas.

    Only datasets that have both an identifier and formattable metadata are
    written. The output file's parent directory is created if it does not
    exist yet.

    Args:
        datasets (dict): Dictionary containing dataset information; the
            individual datasets are read from its "data" entry.
        output_file (str): Name of the output CSV file

    Returns:
        pandas.DataFrame: The saved table with columns "dataset" and
            "content", or None if anything went wrong (the error is printed).
    """
    import os  # local import so the function works on its own

    try:
        # Collect (dataset id, formatted content) pairs so the two columns
        # can never get out of step.
        rows = []
        for dataset in datasets["data"]:
            dataset_id = dataset.get("dataset")
            formatted_content = format_dataset_metadata(dataset)

            if dataset_id and formatted_content:
                rows.append((dataset_id, formatted_content))

        # Create DataFrame
        df = pd.DataFrame(rows, columns=["dataset", "content"])

        # to_csv does not create missing directories, so do it up front.
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Save to CSV
        df.to_csv(output_file, index=False, encoding="utf-8")
        print(f"Successfully saved to {output_file}")

        return df  # Return DataFrame for potential further analysis

    except Exception as e:
        # Best-effort by design: report the problem and signal it with None.
        print(f"Error saving CSV file: {e}")
        return None

In [None]:
# Build the catalog from the fetched datasets and write it to disk; the
# returned DataFrame is kept for the cells below.
catalog_path = "../data/wri-datasets.csv"
df = save_datasets_to_csv(datasets, catalog_path)

In [None]:
# Spot-check the formatted text of one catalog entry (row position 70).
sample_row = df.iloc[70]
print(sample_row["content"])

### Create a vectorstore for WRI datasets

In [None]:
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_chroma import Chroma

In [None]:
# Persist the vector store under ../data, the same directory that is
# reopened with Chroma(persist_directory="../data/chroma_db") further down.
# Writing to a local "chroma_db" would leave that reload pointing at a
# different store than the one built here.
db = "../data/chroma_db"
os.makedirs(db, exist_ok=True)

In [None]:
# Embedding function shared by the indexing step and the reopened store
# below; "nomic-embed-text" is the model name requested from Ollama.
embedder = OllamaEmbeddings(model="nomic-embed-text")

In [None]:
# Parallel lists, one entry per catalog row: the text to embed, a synthetic
# positional id ("doc_0", "doc_1", ...), and the dataset name as metadata.
texts = df["content"].tolist()
ids = [f"doc_{position}" for position, _ in enumerate(texts)]
metadatas = [{"dataset": name} for name in df["dataset"].tolist()]

In [None]:
%%time
# Embed every catalog entry with `embedder` and store it in Chroma, persisted
# at the path held in `db`. `texts`, `metadatas` and `ids` are parallel lists
# built from the catalog DataFrame.
vectorstore = Chroma.from_texts(
    texts=texts,
    embedding=embedder,
    metadatas=metadatas,
    ids=ids,
    persist_directory=db
)

In [None]:
# Reopen the persisted store from ../data/chroma_db for querying, using the
# same embedding function for the query side.
# NOTE: from here on `db` is the Chroma store object, no longer a path string.
db = Chroma(
    embedding_function=embedder,
    persist_directory="../data/chroma_db",
)

In [None]:
# The number of results is configured through `search_kwargs`; a bare `k=5`
# keyword is not a retriever field, so it had no effect on the result count.
retriver = db.as_retriever(search_kwargs={"k": 5})

In [None]:
# Example natural-language query against the dataset catalog.
query = "I am interested in preserving mangroves in Argentina"
# NOTE(review): whether a per-call `k` is honoured by `invoke` may depend on
# the installed langchain-core version — confirm the number of hits returned.
docs = retriver.invoke(query, k=3)

In [None]:
# Show each hit: its metadata dict first, then the matched catalog text.
for doc in docs:
    print(doc.metadata, doc.page_content, sep="\n")