# Google Sheets Inventory

This notebook scans the ETL extract modules to identify all Google Sheets and worksheets used in the pipeline. This is useful for managing access permissions.

In [1]:
import os
import re
import pandas as pd
from pathlib import Path

# Define paths
# We use the notebook's location to find the project root and then the extract directory
try:
    # In a notebook, __file__ isn't defined, so we use os.getcwd()
    current_dir = Path(os.getcwd())
except NameError:
    current_dir = Path('.')

# The extract directory relative to this notebook (src/ca_biositing/pipeline/ca_biositing/pipeline/utils/)
# is ../etl/extract
extract_dir = current_dir.parent / 'etl' / 'extract'

print(f"Current directory: {current_dir.resolve()}")
print(f"Target extract directory: {extract_dir.resolve()}")

def scan_extract_modules(directory):
    inventory = []
    
    # Regex patterns to find GSHEET_NAME and WORKSHEET_NAME
    gsheet_pattern = re.compile(r'GSHEET_NAME\s*=\s*["\'](.*?)["\']')
    worksheet_pattern = re.compile(r'WORKSHEET_NAME\s*=\s*["\'](.*?)["\']')
    
    if not directory.exists():
        print(f"Error: Directory {directory} does not exist.")
        # Try to find it from the workspace root as a fallback
        # Workspace root is /Users/pjsmitty301/ca-biositing
        fallback = Path('/Users/pjsmitty301/ca-biositing/src/ca_biositing/pipeline/ca_biositing/pipeline/etl/extract')
        if fallback.exists():
            print(f"Found directory at fallback: {fallback}")
            directory = fallback
        else:
            return pd.DataFrame()

    files = list(directory.glob('*.py'))
    print(f"Found {len(files)} .py files.")
    
    for file_path in files:
        if file_path.name == '__init__.py':
            continue
            
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
            
            # Find all matches
            gsheets = gsheet_pattern.findall(content)
            worksheets = worksheet_pattern.findall(content)
            
            # If we found matches, add them to the inventory
            if gsheets:
                # Use the first one found or handle multiple
                gsheet = gsheets[0]
                worksheet = worksheets[0] if worksheets else "Unknown"
                
                inventory.append({
                    'Module': file_path.name,
                    'Google Sheet Name': gsheet,
                    'Worksheet Name': worksheet
                })
            else:
                # Log modules that don't use GSHEET_NAME (e.g., local files or GDrive CSVs)
                inventory.append({
                    'Module': file_path.name,
                    'Google Sheet Name': 'N/A (Non-GSheet)',
                    'Worksheet Name': 'N/A'
                })
                
    return pd.DataFrame(inventory)

# Generate the inventory
df_inventory = scan_extract_modules(extract_dir)

# Display the results
if not df_inventory.empty:
    display(df_inventory.sort_values(by='Google Sheet Name'))
else:
    print("No extract modules found. Check the 'extract_dir' path.")

Current directory: /Users/pjsmitty301/ca-biositing
Target extract directory: /Users/pjsmitty301/etl/extract
Error: Directory /Users/pjsmitty301/etl/extract does not exist.
Found directory at fallback: /Users/pjsmitty301/ca-biositing/src/ca_biositing/pipeline/ca_biositing/pipeline/etl/extract
Found 16 .py files.


Unnamed: 0,Module,Google Sheet Name,Worksheet Name
2,basic_sample_info.py,Aim 1-Feedstock Collection and Processing Data...,01-BasicSampleInfo
4,proximate.py,Aim 1-Feedstock Collection and Processing Data...,03.1-Proximate
5,ultimate.py,Aim 1-Feedstock Collection and Processing Data...,03.7-Ultimate
7,experiments.py,Aim 1-Feedstock Collection and Processing Data...,03.0-Experiments
10,resources.py,Aim 1-Feedstock Collection and Processing Data...,07.2-Resources
12,preparation.py,Aim 1-Feedstock Collection and Processing Data...,02-Preparation
13,cmpana.py,Aim 1-Feedstock Collection and Processing Data...,03.3-CmpAna
0,biodiesel_plants.py,N/A (Non-GSheet),
1,billion_ton.py,N/A (Non-GSheet),
3,landiq.py,N/A (Non-GSheet),


In [2]:
# Save to CSV for easy reference
if not df_inventory.empty:
    output_path = current_dir / 'gsheet_inventory.csv'
    df_inventory.to_csv(output_path, index=False)
    print(f"Inventory saved to {output_path}")

Inventory saved to /Users/pjsmitty301/ca-biositing/gsheet_inventory.csv
