In [None]:
# Import packages
# Dataframe Packages
import numpy as np
import xarray as xr
import pandas as pd

# Vector Packages
import geopandas as gpd
import shapely
from shapely import wkt
from shapely.geometry import Point, Polygon
from pyproj import CRS, Transformer

# Raster Packages
import rioxarray as rxr
import rasterio
from rasterio.mask import mask
from rioxarray.merge import merge_arrays
import rasterstats as rs
import osgeo
from osgeo import gdal
from osgeo import gdalconst

# Data Access Packages
import earthaccess as ea
import h5py
import pickle
from tensorflow.keras.models import load_model
from pystac_client import Client
import richdem as rd
import planetary_computer
from planetary_computer import sign

# General Packages
import os
import re
import shutil
import math
from datetime import datetime
import glob
from pprint import pprint
from typing import Union
from pathlib import Path
from tqdm import tqdm
import time
import requests
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
import dask
import dask.dataframe as dd
from dask.distributed import progress
from dask.distributed import Client
from dask.diagnostics import ProgressBar
from retrying import retry
import fiona
import re
import s3fs

#need to mamba install gdal, earthaccess 
#pip install pystac_client, richdem, planetary_computer, dask, distributed, retrying

#connecting to AWS
import warnings; warnings.filterwarnings("ignore")
import boto3
from botocore import UNSIGNED
from botocore.client import Config

import NSIDC_Data
'''
To create .netrc file:
import earthaccess
earthaccess.login(persist=True)
open file and change machine to https://urs.earthdata.nasa.gov

'''

# Load the AWS access key pair from a CSV stored under the user's home directory.
# The CSV is read for the columns 'Access key ID' and 'Secret access key' (first row only).
HOME = os.path.expanduser('~')
KEYPATH = "SWEML/AWSaccessKeys.csv"
ACCESS = pd.read_csv(f"{HOME}/{KEYPATH}")

# Start an authenticated boto3 session with that key pair.
SESSION = boto3.Session(
    aws_access_key_id=ACCESS['Access key ID'][0],
    aws_secret_access_key=ACCESS['Secret access key'][0],
)
S3 = SESSION.resource('s3')
# AWS bucket information; BUCKET_NAME and S3 are reused by the download fallbacks further down.
BUCKET_NAME = 'national-snow-model'
# Unsigned (anonymous) alternative, currently disabled:
#S3 = boto3.resource('S3', config=Config(signature_version=UNSIGNED))
BUCKET = S3.Bucket(BUCKET_NAME)

In [None]:
class ASODataTool:
    """Search NASA CMR for NSIDC granules and download the matching files.

    The actual CMR requests are delegated to the project module ``NSIDC_Data``;
    this class only stores the query parameters and the resulting URL list.
    """

    def __init__(self, short_name, version, polygon='', filename_filter=''):
        """Store the dataset identifiers and optional spatial / filename filters."""
        self.short_name = short_name
        self.version = version
        self.polygon = polygon
        self.filename_filter = filename_filter
        self.url_list = []
        self.CMR_URL = 'https://cmr.earthdata.nasa.gov'
        self.CMR_PAGE_SIZE = 2000
        self.CMR_FILE_URL = ('{0}/search/granules.json?provider=NSIDC_ECS'
                             '&sort_key[]=start_date&sort_key[]=producer_granule_id'
                             '&scroll=true&page_size={1}'.format(self.CMR_URL, self.CMR_PAGE_SIZE))

    def cmr_search(self, time_start, time_end, bounding_box):
        """Return the granule URLs for the time window and bounding box.

        The search runs only while ``self.url_list`` is empty; afterwards the
        cached list is returned regardless of the arguments.

        Args:
            time_start: Start of the temporal filter, forwarded to ``NSIDC_Data.cmr_search``.
            time_end: End of the temporal filter, forwarded to ``NSIDC_Data.cmr_search``.
            bounding_box: Bounding-box string, e.g. the value produced by
                ``get_bounding_box``.

        Returns:
            The list of URLs stored in ``self.url_list``.
        """
        try:
            if not self.url_list:
                # Use the argument itself: the base class never defines
                # ``self.bounding_box``, so reading it here raised AttributeError.
                self.url_list = NSIDC_Data.cmr_search(
                    self.short_name, self.version, time_start, time_end,
                    bounding_box=bounding_box, polygon=self.polygon,
                    filename_filter=self.filename_filter, quiet=False)
            return self.url_list
        except KeyboardInterrupt:
            quit()

    def cmr_download(self, directory):
        """Download every URL in ``self.url_list`` into the ASO data sub-folder *directory*."""
        dpath = f"{HOME}/SWEML/data/NSMv2.0/data/ASO/{directory}"
        os.makedirs(dpath, exist_ok=True)

        NSIDC_Data.cmr_download(self.url_list, dpath, False)

    @staticmethod
    def get_bounding_box(region):
        """Return ``"min_lon,min_lat,max_lon,max_lat"`` for the points of *region*.

        The region table is read from the local ``RegionVal.pkl``; if that file
        is missing it is first downloaded from the project S3 bucket.

        Args:
            region: Key into the pickled region table.

        Returns:
            Comma-separated bounds of the region's ``Long`` / ``Lat`` points.
        """
        local_path = f"{HOME}/SWEML/data/PreProcessed/RegionVal.pkl"
        # NOTE(review): read_pickle executes arbitrary code from the file; only
        # use it with the trusted project bucket.
        try:
            regions = pd.read_pickle(local_path)
        except FileNotFoundError:
            print('File not local, getting from AWS S3.')
            key = "data/PreProcessed/RegionVal.pkl"
            # download_file does not create missing parent directories.
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            S3.meta.client.download_file(BUCKET_NAME, key, local_path)
            regions = pd.read_pickle(local_path)

        region_df = regions[region].copy()
        region_points = gpd.GeoDataFrame(
            region_df,
            geometry=gpd.points_from_xy(region_df.Long, region_df.Lat, crs="EPSG:4326"))
        min_x, min_y, max_x, max_y = region_points.total_bounds

        return f"{min_x},{min_y},{max_x},{max_y}"

class ASODownload(ASODataTool):
    """ASODataTool with an interactive prompt for choosing one of the model regions."""

    def __init__(self, short_name, version, polygon='', filename_filter=''):
        """Initialise the base tool and the list of selectable region names."""
        super().__init__(short_name, version, polygon, filename_filter)
        self.region_list =    [ 'N_Sierras',
                                'S_Sierras',
                                'Greater_Yellowstone',
                                'N_Co_Rockies',
                                'SW_Mont',
                                'SW_Co_Rockies',
                                'GBasin',
                                'N_Wasatch',
                                'N_Cascade',
                                'S_Wasatch',
                                'SW_Mtns',
                                'E_WA_N_Id_W_Mont',
                                'S_Wyoming',
                                'SE_Co_Rockies',
                                'Sawtooth',
                                'Ca_Coast',
                                'E_Or',
                                'N_Yellowstone',
                                'S_Cascade',
                                'Wa_Coast',
                                'Greater_Glacier',
                                'Or_Coast'  ]

    def select_region(self):
        """Prompt for a region index and store that region's bounding box.

        On a valid choice ``self.bounding_box`` is set from ``get_bounding_box``.

        Returns:
            The selected region name, or ``None`` when the input was not a
            valid index (in which case ``self.bounding_box`` is left untouched).
        """
        print("Select a region by entering its index:")
        for i, region in enumerate(self.region_list, start=1):
            print(f"{i}. {region}")

        # Only the int() conversion is guarded, so a ValueError raised while
        # loading the region table is not misreported as bad user input.
        try:
            user_input = int(input("Enter the index of the region: "))
        except ValueError:
            print("Invalid input. Please enter a valid index.")
            return None

        if not 1 <= user_input <= len(self.region_list):
            print("Invalid index. Please select a valid index.")
            return None

        selected_region = self.region_list[user_input - 1]
        self.bounding_box = self.get_bounding_box(selected_region)
        print(f"You selected: {selected_region}")
        print(f"Bounding Box: {self.bounding_box}")
        # The caller stores and prints this value, so it must be returned.
        return selected_region
            


In [None]:
if __name__ == "__main__":
    # Dataset to fetch from NSIDC and the time window of interest.
    short_name = 'ASO_50M_SWE'
    version = '1'

    data_tool = ASODownload(short_name, version)
    time_start = '2013-04-02T00:00:00Z'
    time_end = '2019-07-19T23:59:59Z'
    
    selected_region = data_tool.select_region()  # Call select_region on the instance
    directory = "SWE_Data"

    # select_region only sets bounding_box after a valid choice; without it
    # there is nothing to search for, so stop instead of raising AttributeError.
    bounding_box = getattr(data_tool, "bounding_box", None)
    if bounding_box is None:
        print("No valid region selected; skipping CMR search and download.")
    else:
        # Fall back to the bounding box when no region name was returned.
        region_label = selected_region if selected_region is not None else f"bounding box {bounding_box}"
        print(f"Fetching file URLs in progress for {region_label} from {time_start} to {time_end}")
        url_list = data_tool.cmr_search(time_start, time_end, bounding_box)
        data_tool.cmr_download(directory)

In [None]:
 # List every CSV file name found in input_folder.
 # NOTE(review): input_folder is first assigned in a later cell (the ASODataProcessing
 # driver), so this cell raises NameError unless that cell has already been run.
csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

In [None]:
class ASODataProcessing:
    """Convert ASO SWE GeoTIFFs to CSV and map their pixels onto model grid cells."""

    @staticmethod
    def processing_tiff(input_file, output_res):
        """Warp one GeoTIFF to EPSG:4326 and return its pixels as a DataFrame.

        The warped raster is written to ``<cwd>/Processed_Data/ASO_100M_<date>.tif``
        where ``<date>`` is the last underscore-separated token of the file name.

        Args:
            input_file: Path of the source GeoTIFF.
            output_res: Pixel size handed to ``gdal.Warp`` (``xRes`` / ``-yRes``),
                expressed in the units of the EPSG:4326 target.

        Returns:
            DataFrame with the raster coordinates and a ``data`` column, or
            ``None`` if the file could not be processed (the error is printed).
        """
        try:
            # Parse the date from the file name only, so underscores in parent
            # directories cannot leak into it.
            date = os.path.splitext(os.path.basename(input_file))[0].split("_")[-1]

            # Define the output file path
            output_folder = os.path.join(os.getcwd(), "Processed_Data")
            os.makedirs(output_folder, exist_ok=True)
            output_file = os.path.join(output_folder, f"ASO_100M_{date}.tif")

            ds = gdal.Open(input_file)
            if ds is None:
                print(f"Failed to open '{input_file}'. Make sure the file is a valid GeoTIFF file.")
                return None

            # Reproject and resample
            warped = gdal.Warp(output_file, ds, dstSRS="EPSG:4326", xRes=output_res,
                               yRes=-output_res, resampleAlg="bilinear")
            # Dropping the references closes the datasets and flushes the output
            # to disk before it is read back.
            warped = None
            ds = None

            # Read the processed TIFF file back and flatten it to a table
            rds = rxr.open_rasterio(output_file)
            rds = rds.squeeze().drop("spatial_ref").drop("band")
            rds.name = "data"
            return rds.to_dataframe().reset_index()

        except Exception as e:
            # Best effort: one unreadable file must not abort a batch conversion.
            print(f"An error occurred: {str(e)}")
            return None

    @staticmethod
    def convert_tiff_to_csv(input_folder, output_res):
        """Convert every ``.tif`` in ``<cwd>/<input_folder>`` to a CSV file.

        CSVs are written to ``<cwd>/Processed_Data/SWE_csv/ASO_SWE_<date>.csv``.

        Args:
            input_folder: Folder name relative to the current working directory.
            output_res: Pixel size forwarded to ``processing_tiff``.
        """
        curr_dir = os.getcwd()
        folder_path = os.path.join(curr_dir, input_folder)

        # Check if the folder exists and is not empty
        if not os.path.isdir(folder_path):
            print(f"The folder '{input_folder}' does not exist.")
            return

        folder_entries = os.listdir(folder_path)
        if not folder_entries:
            print(f"The folder '{input_folder}' is empty.")
            return

        csv_folder = os.path.join(curr_dir, "Processed_Data", "SWE_csv")

        # Create CSV files from TIFF files
        for tiff_filename in folder_entries:
            if not tiff_filename.endswith(".tif"):
                continue

            tiff_filepath = os.path.join(folder_path, tiff_filename)
            df = ASODataProcessing.processing_tiff(tiff_filepath, output_res)
            if df is None:
                continue

            # Get the date from the TIFF filename
            date = os.path.splitext(tiff_filename)[0].split("_")[-1]
            csv_filename = f"ASO_SWE_{date}.csv"
            os.makedirs(csv_folder, exist_ok=True)

            # Save the DataFrame as a CSV file
            df.to_csv(os.path.join(csv_folder, csv_filename), index=False)

            print(f"Converted '{tiff_filename}' to '{csv_filename}'")

    def create_polygon(self, row):
        """Build the grid-cell polygon from the four corner coordinates in *row*."""
        return Polygon([(row['BL_Coord_Long'], row['BL_Coord_Lat']),
                        (row['BR_Coord_Long'], row['BR_Coord_Lat']),
                        (row['UR_Coord_Long'], row['UR_Coord_Lat']),
                        (row['UL_Coord_Long'], row['UL_Coord_Lat'])])

    def process_folder(self, input_folder, metadata_path, output_folder):
        """Assign a grid ``cell_id`` to every pixel of each CSV in *input_folder*.

        Example arguments::

            input_folder = f"{HOME}/data/NSMv2.0/data/Processed_Data/SWE_csv"
            metadata_path = f"{HOME}/data/NSMv2.0/data/Provided_Data/grid_cells_meta.csv"
            output_folder = f"{HOME}/data/NSMv2.0/data/Processed_SWE"

        Args:
            input_folder: Folder with CSVs holding ``x``, ``y`` and ``data`` columns.
            metadata_path: Grid-cell metadata CSV with ``cell_id`` and the corner
                coordinate columns; fetched from S3 when not present locally.
            output_folder: Destination folder; files already present are skipped.

        Raises:
            FileNotFoundError: If the metadata file is missing and its path does
                not contain ``NSMv2.0`` (needed to derive the S3 key).
        """
        # Import the metadata into a pandas DataFrame
        try:
            pred_obs_metadata_df = pd.read_csv(metadata_path)
        except FileNotFoundError:
            # The S3 key is the part of the path starting at "NSMv2.0".
            _, marker, tail = metadata_path.partition("NSMv2.0")
            if not marker:
                raise
            parent_dir = os.path.dirname(metadata_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            S3.meta.client.download_file(BUCKET_NAME, marker + tail, metadata_path)
            pred_obs_metadata_df = pd.read_csv(metadata_path)

        # Add a column with polygon geometries; the index column only exists
        # when the CSV was saved with its index, so its absence is not an error.
        pred_obs_metadata_df = pred_obs_metadata_df.drop(columns=['Unnamed: 0'], errors='ignore')
        pred_obs_metadata_df['geometry'] = pred_obs_metadata_df.apply(self.create_polygon, axis=1)

        # Convert the DataFrame to a GeoDataFrame
        metadata = gpd.GeoDataFrame(pred_obs_metadata_df, geometry='geometry')

        # Drop coordinates columns
        metadata_df = metadata.drop(columns=['BL_Coord_Long', 'BL_Coord_Lat',
                                             'BR_Coord_Long', 'BR_Coord_Lat',
                                             'UR_Coord_Long', 'UR_Coord_Lat',
                                             'UL_Coord_Long', 'UL_Coord_Lat'])

        os.makedirs(output_folder, exist_ok=True)

        # List all CSV files in the input folder
        csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

        for csv_file in csv_files:
            input_aso_path = os.path.join(input_folder, csv_file)
            output_aso_path = os.path.join(output_folder, csv_file)

            # Check if the output file already exists
            if os.path.exists(output_aso_path):
                print(f"CSV file {csv_file} already exists in the output folder.")
                continue

            # Process each CSV file
            aso_swe_df = pd.read_csv(input_aso_path)

            # Convert the pixels into a GeoDataFrame with point geometries
            aso_swe_geo = gpd.GeoDataFrame(
                aso_swe_df,
                geometry=gpd.points_from_xy(aso_swe_df['x'], aso_swe_df['y']))

            # The old call passed both predicate='within' and op='intersects';
            # geopandas let `op` win before removing the keyword entirely, so
            # 'intersects' is the predicate that was actually in effect.
            result = gpd.sjoin(aso_swe_geo, metadata_df, how='left', predicate='intersects')

            # Select specific columns, rename, and drop pixels outside every cell
            final_df = (result[['y', 'x', 'data', 'cell_id']]
                        .rename(columns={'data': 'swe'})
                        .dropna(subset=['cell_id']))

            # Save the processed DataFrame to a CSV file
            final_df.to_csv(output_aso_path, index=False)
            print(f"Processed {csv_file}")

    def converting_ASO_to_standardized_format(self, input_folder, output_csv):
        """Merge per-date CSVs into one table with one SWE column per date.

        Each CSV's SWE column is renamed to the date token of its file name and
        the files are outer-merged on ``cell_id``.

        Args:
            input_folder: Folder holding the per-date CSV files.
            output_csv: Path of the merged CSV to write.
        """
        # Initialize an empty DataFrame to store the final transformed data
        final_df = pd.DataFrame()

        # Iterate through all CSV files in the directory
        for filename in os.listdir(input_folder):
            if not filename.endswith(".csv"):
                continue

            # Extract the time frame from the filename
            time_frame = filename.split('_')[-1].split('.')[0]

            df = pd.read_csv(os.path.join(input_folder, filename))

            # process_folder writes the column as 'swe'; accept both spellings so
            # the rename is not a silent no-op.
            # NOTE(review): any other shared columns (e.g. 'x', 'y') still collide
            # in the merge below and receive pandas suffixes.
            df = df.rename(columns={'SWE': time_frame, 'swe': time_frame})

            # Merge or concatenate the data into the final DataFrame
            if final_df.empty:
                final_df = df
            else:
                final_df = pd.merge(final_df, df, on='cell_id', how='outer')

        # Save the final transformed DataFrame to a single CSV file
        final_df.to_csv(output_csv, index=False)
        
if __name__ == "__main__":

    # These three assignments were commented out although the calls below use
    # them, which made the cell fail with NameError.
    data_processor = ASODataProcessing()
    folder_name = "SWE_Data"
    output_res = 0.001
    data_processor.convert_tiff_to_csv(folder_name, output_res)
    input_folder = f"{HOME}/data/v2.0/Processed_Data/SWE_csv"
    metadata_path = f"{HOME}/data/v2.0/Provided_Data/grid_cells_meta.csv"
    output_folder = f"{HOME}/data/v2.0/Processed_SWE"

    data_processor.process_folder(input_folder, metadata_path, output_folder)

In [None]:
def load_aso_snotel_geometry(aso_swe_file, folder_path):
    """Load one ASO CSV as a GeoDataFrame indexed by ``cell_id``.

    Args:
        aso_swe_file: File name of the CSV inside *folder_path*.
        folder_path: Folder that contains the CSV.

    Returns:
        GeoDataFrame whose geometry is a point built from each row's ``x`` / ``y``.
    """
    csv_path = os.path.join(folder_path, aso_swe_file)
    aso_table = pd.read_csv(csv_path).set_index('cell_id')
    pixel_points = list(map(Point, zip(aso_table['x'], aso_table['y'])))
    return gpd.GeoDataFrame(aso_table, geometry=pixel_points)

def haversine_vectorized(lat1, lon1, lat2, lon2):
    """Return great-circle distances in kilometres using the haversine formula.

    All arguments are in decimal degrees and may be scalars or NumPy arrays;
    they are combined with ordinary NumPy broadcasting.

    Args:
        lat1: Latitude(s) of the first point(s).
        lon1: Longitude(s) of the first point(s).
        lat2: Latitude(s) of the second point(s).
        lon2: Longitude(s) of the second point(s).

    Returns:
        Distance(s) on a sphere of radius 6371.0 km.
    """
    lat1, lon1, lat2, lon2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` a hair outside [0, 1] for (near-)antipodal points,
    # which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    r = 6371.0
    return r * c

def calculate_nearest_snotel(aso_gdf, snotel_gdf, n=6, distance_cache=None):
    """Find the *n* closest SNOTEL stations for every row index of *aso_gdf*.

    Args:
        aso_gdf: Frame whose index values identify cells and whose rows carry a
            point ``geometry``.
        snotel_gdf: Station frame with point geometries and a ``station_id`` column.
        n: Number of stations to keep per cell.
        distance_cache: Optional dict of already computed results keyed by the
            index value; it is updated in place.

    Returns:
        Tuple ``(nearest, cache)``: the station-id lists per index value, and the
        (possibly newly created) cache dict.
    """
    cache = {} if distance_cache is None else distance_cache
    nearest = {}

    for cell_id, cell in aso_gdf.iterrows():
        if cell_id not in cache:
            # Haversine distance from this cell to every station
            station_distances = haversine_vectorized(
                cell.geometry.y, cell.geometry.x,
                snotel_gdf.geometry.y.values, snotel_gdf.geometry.x.values)
            closest = station_distances.argsort()[:n]
            cache[cell_id] = list(snotel_gdf['station_id'].iloc[closest])

        # Reuse the cached list for repeated index values
        nearest[cell_id] = cache[cell_id]

    return nearest, cache

def calculate_distances_for_cell(aso_row, snotel_gdf, n=6):
    """Return the ids of the *n* SNOTEL stations closest to one ASO row.

    Args:
        aso_row: Row with a point ``geometry``.
        snotel_gdf: Station frame with point geometries and a ``station_id`` column.
        n: Number of stations to return.

    Returns:
        List of ``station_id`` values ordered from nearest to farthest.
    """
    cell_point = aso_row.geometry
    station_points = snotel_gdf.geometry

    station_distances = haversine_vectorized(
        cell_point.y, cell_point.x,
        station_points.y.values, station_points.x.values)

    closest = station_distances.argsort()[:n]
    return list(snotel_gdf['station_id'].iloc[closest])

def calculate_nearest_snotel_parallel(aso_gdf, snotel_gdf, n = 6, distance_cache = None, max_workers = 16):
    """Process-pool variant of ``calculate_nearest_snotel``.

    Args:
        aso_gdf: Frame whose index values identify cells and whose rows carry a
            point ``geometry``.
        snotel_gdf: Station frame passed through to ``calculate_distances_for_cell``.
        n: Number of stations to keep per cell.
        distance_cache: Optional dict of already computed results keyed by the
            index value; it is updated in place.
        max_workers: Size of the process pool.

    Returns:
        Tuple ``(nearest_snotel, distance_cache)``.
    """
    if distance_cache is None:
        distance_cache = {}

    nearest_snotel = {}
    with ProcessPoolExecutor(max_workers = max_workers) as executor:
        # Remember which index value each future belongs to: the worker returns
        # only the station list, not a (cell_id, stations) pair.
        pending = {}

        for idx, aso_row in aso_gdf.iterrows():
            if idx in distance_cache:
                nearest_snotel[idx] = distance_cache[idx]
            elif idx not in pending:
                # Submit once per index value; the first occurrence wins, as in
                # the serial implementation.
                pending[idx] = executor.submit(calculate_distances_for_cell, aso_row, snotel_gdf, n)

        # Retrieve the results
        for cell_id, future in tqdm(pending.items(), total=len(pending)):
            nearest_sites = future.result()
            nearest_snotel[cell_id] = nearest_sites
            distance_cache[cell_id] = nearest_sites

    return nearest_snotel, distance_cache

def fetch_snotel_sites_for_cellids(aso_swe_files_folder_path, metadata_path, snotel_data_path,
                                   snotel_metadata_path="/home/vgindi/Provided_Data/ground_measures_metadata.csv",
                                   output_filename="/home/vgindi/Provided_Data/Merged_aso_snotel_data.csv"):
    """Attach the observations of the six nearest SNOTEL stations to each ASO record.

    For every CSV in *aso_swe_files_folder_path* the date is taken from the last
    underscore-separated token of the file name. When the SNOTEL table has a
    column for that date, the values of the six nearest stations are added as
    ``nearest site 1`` .. ``nearest site 6``. The combined table is joined with
    the grid-cell metadata and written to *output_filename*.

    Args:
        aso_swe_files_folder_path: Folder of CSVs with ``cell_id``, ``x``, ``y``, ``swe``.
        metadata_path: Grid-cell metadata CSV (``cell_id``, ``lat``, ``lon`` and
            the corner coordinate columns).
        snotel_data_path: SNOTEL observations, CSV or parquet.
            NOTE(review): assumes the first column is ``station_id`` and the
            remaining column names are ``YYYY-MM-DD`` dates -- confirm.
        snotel_metadata_path: Station metadata CSV with ``station_id``,
            ``longitude`` and ``latitude``.
        output_filename: Path of the merged CSV to write.
    """
    site_columns = [f"nearest site {rank}" for rank in range(1, 7)]
    corner_columns = ['BR_Coord_Long', 'BR_Coord_Lat', 'UR_Coord_Long', 'UR_Coord_Lat',
                      'UL_Coord_Long', 'UL_Coord_Lat', 'BL_Coord_Long', 'BL_Coord_Lat']

    def read_table(path):
        """Read a parquet or CSV table, chosen by file extension."""
        # main() passes a .parquet file, which read_csv cannot parse.
        if str(path).lower().endswith(".parquet"):
            return pd.read_parquet(path)
        return pd.read_csv(path)

    # The cell polygons built here previously never reached the output, so only
    # the tabular metadata is loaded.
    metadata_df = pd.read_csv(metadata_path)

    snotel_data = read_table(snotel_data_path)
    if 'station_id' not in snotel_data.columns and snotel_data.index.name == 'station_id':
        snotel_data = snotel_data.reset_index()

    # Rename the date columns from YYYY-MM-DD to the YYYYMMDD form used in file names
    date_columns = snotel_data.columns[1:]
    new_column_names = {col: pd.to_datetime(col, format='%Y-%m-%d').strftime('%Y%m%d') for col in date_columns}
    snotel_data_f = snotel_data.rename(columns=new_column_names)
    available_dates = set(new_column_names.values())

    snotel_file = pd.read_csv(snotel_metadata_path)
    snotel_gdf = gpd.GeoDataFrame(
        snotel_file,
        geometry=gpd.points_from_xy(snotel_file['longitude'], snotel_file['latitude']))

    # Collect per-file frames and concatenate once at the end
    frames = []

    for aso_swe_file in os.listdir(aso_swe_files_folder_path):
        file_path = os.path.join(aso_swe_files_folder_path, aso_swe_file)

        # Skip sub-folders and anything that is not a CSV
        if os.path.isdir(file_path) or not aso_swe_file.endswith('.csv'):
            continue

        timestamp = aso_swe_file.split('_')[-1].split('.')[0]
        print(f"Processing file with timestamp: {timestamp}")

        aso_gdf = load_aso_snotel_geometry(aso_swe_file, aso_swe_files_folder_path)
        aso_swe_data = pd.read_csv(file_path)

        # Calculating nearest SNOTEL sites
        nearest_snotel, _ = calculate_nearest_snotel(aso_gdf, snotel_gdf, n=6)
        print(f"calculated nearest snotel for file with timestamp {timestamp}")

        aso_swe_data['Date'] = pd.to_datetime(timestamp)
        aso_swe_data = aso_swe_data[['cell_id', 'Date', 'swe']]

        if timestamp not in available_dates or not nearest_snotel:
            # No SNOTEL observations for this date: keep the ASO records as-is
            frames.append(aso_swe_data)
            continue

        snotel_on_date = snotel_data_f[['station_id', timestamp]]
        transposed_data = {}

        for cell_id, station_ids in nearest_snotel.items():
            selected = snotel_on_date.loc[snotel_on_date['station_id'].isin(station_ids)]
            station_mapping = {old_id: f"nearest site {i+1}" for i, old_id in enumerate(station_ids)}

            # Rename the station IDs to their rank (assign returns a new frame,
            # so no write goes through a slice of snotel_data_f)
            selected = selected.assign(station_id=selected['station_id'].map(station_mapping))

            # One row per cell, one column per ranked station
            transposed_data[cell_id] = selected.set_index('station_id').T

        # Reset index and rename columns
        transposed_df = pd.concat(transposed_data, axis=0).reset_index()
        transposed_df = transposed_df.rename(columns={'level_0': 'cell_id', 'level_1': 'Date'})
        transposed_df['Date'] = pd.to_datetime(transposed_df['Date'])

        frames.append(pd.merge(aso_swe_data, transposed_df, how='left', on=['cell_id', 'Date']))

    if not frames:
        print("No ASO SWE files found; nothing to merge.")
        return

    final_df = pd.concat(frames, ignore_index=True)

    # Merge with metadata
    req_cols = ['cell_id', 'lat', 'lon'] + corner_columns
    Result = final_df.merge(metadata_df[req_cols], how='left', on='cell_id')

    # Column renaming and ordering; reindex keeps the layout stable even when no
    # file matched a SNOTEL date (the site columns are then empty).
    Result = Result.rename(columns={'swe': 'ASO_SWE_in'})
    Result = Result.reindex(columns=['cell_id', 'Date', 'ASO_SWE_in', 'lat', 'lon']
                            + site_columns + corner_columns)

    # Save the merged data to a new file
    Result.to_csv(output_filename, index=False)
    print("Processed and saved data")
    
def main():
    """Run the ASO/SNOTEL merge with the hard-coded input locations."""
    fetch_snotel_sites_for_cellids(
        aso_swe_files_folder_path=r"/home/vgindi/Processed_SWE",
        metadata_path=r"/home/vgindi/Provided_Data/grid_cells_meta_idx.csv",
        snotel_data_path=r"/home/vgindi/Provided_Data/ground_measures_train_featuresALLDATES.parquet",
    )

if __name__ == "__main__":
    main()

In [None]:
# Reload the merged ASO/SNOTEL table from disk and preview its first ten rows.
Result = pd.read_csv(r'/home/vgindi/Provided_Data/Merged_aso_snotel_data.csv')
Result.head(10)

In [None]:
"""
A simple implementation of parallel processing using thread-based concurrency; it takes a long time to execute.
See the terrain_daskconcurrency and terrain-processing_cluster Python scripts for more optimized implementations.
"""

def process_single_location(args):
    """Look up elevation, slope and aspect for one latitude/longitude pair.

    Results are memoised in the module-level ``elevation_cache`` dict, which
    ``extract_terrain_data_threaded`` creates before submitting work.

    Args:
        args: Tuple ``(lat, lon, regions, tiles)`` where ``regions`` maps a tile
            id (index) to its ``sliceID`` position in ``tiles``, and ``tiles`` is
            the list of STAC items.

    Returns:
        Tuple ``(elev, slop, asp)`` of rounded values at the nearest DEM pixel.

    Raises:
        KeyError: If the tile covering the location is not in ``regions``.
    """
    lat, lon, regions, tiles = args

    cache_key = (lat, lon)
    if cache_key in elevation_cache:
        return elevation_cache[cache_key]

    # Copernicus tiles are named after their south-west corner: the N/S part
    # comes from the latitude and the E/W part from the longitude. The previous
    # code had the two swapped, so no tile id ever matched.
    lat_band = math.floor(lat)
    lon_band = math.floor(lon)
    tile_id = (f"Copernicus_DSM_COG_30_{'N' if lat_band >= 0 else 'S'}{abs(lat_band):02d}_00_"
               f"{'E' if lon_band >= 0 else 'W'}{abs(lon_band):03d}_00_DEM")
    index_id = regions.loc[tile_id]['sliceID']

    signed_asset = planetary_computer.sign(tiles[index_id].assets["data"])
    elevation = rxr.open_rasterio(signed_asset.href)

    slope = elevation.copy()
    aspect = elevation.copy()

    transformer = Transformer.from_crs("EPSG:4326", elevation.rio.crs, always_xy=True)
    xx, yy = transformer.transform(lon, lat)

    # Rounded elevations as float32. This is what the former in-memory GDAL
    # write/read round trip produced; that round trip had no other effect on
    # the result.
    tilearray_np = np.around(elevation.values[0]).astype(int).astype(np.float32)

    # NOTE(review): `slope_arr` is the raw row-wise elevation difference (not an
    # angle) and the aspect is derived from only the first two rows of the
    # column-wise gradient, then broadcast to every row. Left unchanged because
    # downstream features may have been produced with exactly this computation
    # -- confirm before replacing it with a true slope/aspect calculation.
    slope_arr, aspect_arr = np.gradient(tilearray_np)
    aspect_arr = np.rad2deg(np.arctan2(aspect_arr[0], aspect_arr[1]))

    slope.values[0] = slope_arr
    aspect.values[0] = aspect_arr

    elev = round(elevation.sel(x=xx, y=yy, method="nearest").values[0])
    slop = round(slope.sel(x=xx, y=yy, method="nearest").values[0])
    asp = round(aspect.sel(x=xx, y=yy, method="nearest").values[0])

    elevation_cache[cache_key] = (elev, slop, asp)
    return elev, slop, asp

def extract_terrain_data_threaded(metadata_df, bounding_box, max_workers=10):
    """Add elevation, slope and aspect columns to *metadata_df* using a thread pool.

    Copernicus 90 m DEM tiles intersecting *bounding_box* are looked up through
    the Planetary Computer STAC API, then ``process_single_location`` is run for
    every ``cen_lat`` / ``cen_lon`` pair.

    Args:
        metadata_df: Frame with ``cen_lat`` and ``cen_lon`` columns; it is
            modified in place (``Elevation_m``, ``Slope_Deg``, ``Aspect_L``).
        bounding_box: ``((min_x, min_y), (max_x, max_y))`` in lon/lat.
        max_workers: Number of worker threads.

    Returns:
        The same *metadata_df* object, for convenience.
    """
    global elevation_cache

    # The module-level name `Client` is rebound by `from dask.distributed import
    # Client`, which has no `open`; import the STAC client under its own name.
    from pystac_client import Client as StacClient

    elevation_cache = {}
    (min_x, min_y), (max_x, max_y) = bounding_box

    client = StacClient.open(
            "https://planetarycomputer.microsoft.com/api/stac/v1",
            ignore_conformance=True,
        )

    search = client.search(
                    collections=["cop-dem-glo-90"],
                    intersects = {
                            "type": "Polygon",
                            "coordinates": [[
                            [min_x, min_y],
                            [max_x, min_y],
                            [max_x, max_y],
                            [min_x, max_y],
                            [min_x, min_y]
                        ]]})

    tiles = list(search.items())

    print("Retrieving Copernicus 90m DEM tiles")
    # Lookup table: tile id (index) -> position of that tile in `tiles`
    regions = pd.DataFrame(
        {'sliceID': range(len(tiles))},
        index=pd.Index([tile.id for tile in tiles], name='tileID'))

    print("Interpolating Grid Cell Spatial Features")

    locations = zip(metadata_df['cen_lat'], metadata_df['cen_lon'])
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_single_location, (lat, lon, regions, tiles))
                   for lat, lon in locations]

        # Collect in submission order so result i belongs to row i; iterating
        # as_completed() would assign values to the wrong rows.
        results = [future.result() for future in tqdm(futures, total=len(futures))]

    if results:
        elevations, slopes, aspects = (list(column) for column in zip(*results))
    else:
        elevations, slopes, aspects = [], [], []

    metadata_df['Elevation_m'] = elevations
    metadata_df['Slope_Deg'] = slopes
    metadata_df['Aspect_L'] = aspects
    return metadata_df

# Driver for the terrain extraction: load the merged table and keep only the
# first 20 rows for this run.
metadata_df = pd.read_csv(r"/home/vgindi/Provided_Data/Merged_aso_nearest_sites1.csv")
metadata_df= metadata_df.head(20)
# Bounding box as ((min_x, min_y), (max_x, max_y)), the order unpacked by
# extract_terrain_data_threaded.
bounding_box = ((-120.3763448720203, 36.29256774541929), (-118.292253412863, 38.994985247736324))

# Adds the Elevation_m, Slope_Deg and Aspect_L columns to metadata_df.
extract_terrain_data_threaded(metadata_df, bounding_box)

# Display the results
metadata_df.head(10)

In [None]:
"""
This code block crops the global coverage VIIRS data to south sierras subregion. 
"""

def crop_sierras(input_file_path, output_file_path, shapes):
    """Crop a raster to *shapes* and write the result as a GeoTIFF.

    Args:
        input_file_path: Path of the raster to crop.
        output_file_path: Path of the cropped GeoTIFF to create.
        shapes: Iterable of GeoJSON-like geometries used as the mask.
    """
    with rasterio.open(input_file_path) as src:
        out_image, out_transform = rasterio.mask.mask(src, shapes, crop=True)
        # Datasets expose their creation metadata as `meta`; `out_meta` does not
        # exist and raised AttributeError.
        out_meta = src.meta.copy()

    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform})

    with rasterio.open(output_file_path, "w", **out_meta) as dest:
        dest.write(out_image)

def download_viirs_sca(input_dir, output_dir, shapefile_path):
    """Crop every VIIRS GeoTIFF under *input_dir* to the shapes of a shapefile.

    *input_dir* is expected to hold one sub-folder per year (for example
    ``WY2013``). Cropped files are written to ``<output_dir>/<year>/`` and named
    after the first three underscore-separated tokens of the source file name.

    Args:
        input_dir: Folder containing the per-year sub-folders.
        output_dir: Root folder for the cropped rasters.
        shapefile_path: Shapefile whose geometries define the crop mask.
    """
    # Load shapes from the shapefile
    with fiona.open(shapefile_path, 'r') as shapefile:
        shapes = [feature["geometry"] for feature in shapefile]

    # Iterate through each year directory in the input directory
    for year_folder in os.listdir(input_dir):
        year_folder_path = os.path.join(input_dir, year_folder)
        if not os.path.isdir(year_folder_path):
            continue

        # Extract year from the folder name (assuming folder names like 'WY2013')
        year_match = re.search(r'\d{4}', year_folder)
        if year_match is None:
            # e.g. Jupyter's '.ipynb_checkpoints'; calling .group() on None
            # used to abort the whole run.
            print(f"Skipping '{year_folder}': no four-digit year in the folder name")
            continue

        output_year_folder = os.path.join(output_dir, year_match.group())
        os.makedirs(output_year_folder, exist_ok=True)

        for file_name in os.listdir(year_folder_path):
            if not file_name.endswith('.tif'):
                continue

            parts = file_name.split('_')
            output_file_name = '_'.join(parts[:3]) + '.tif'
            output_file_path = os.path.join(output_year_folder, output_file_name)
            input_file_path = os.path.join(year_folder_path, file_name)
            crop_sierras(input_file_path, output_file_path, shapes)
            print(f"Processed and saved {output_file_path}")

if __name__ == "__main__":
    
    # Hard-coded locations: raw VIIRS rasters (one sub-folder per year), the
    # destination for the cropped rasters, and the shapefile used as crop mask.
    input_directory = r"/home/vgindi/VIIRS_Data"
    output_directory = r"/home/vgindi/VIIRS_Sierras"
    shapefile_path = r"/home/vgindi/Provided_Data/low_sierras_points.shp"
    download_viirs_sca(input_directory, output_directory, shapefile_path)

In [None]:
"""
This code cell transforms the raw VIIRS tiff files to 100m resolution and saves each file in .csv format
"""
def processing_VIIRS(input_file, output_res, output_root="/home/vgindi/Processed_VIIRS"):
    """Warp one VIIRS GeoTIFF to EPSG:4326 and return its pixels as a DataFrame.

    The warped raster is written to ``<output_root>/<parent folder name>/<file name>``.

    Args:
        input_file: Path of the source GeoTIFF.
        output_res: Pixel size handed to ``gdal.Warp`` (``xRes`` / ``-yRes``).
            The target SRS is EPSG:4326, so this is in degrees, not metres.
        output_root: Root folder for the warped rasters.

    Returns:
        DataFrame with the raster coordinates and a ``data`` column, or ``None``
        if the file could not be processed (the error is printed).
    """
    try:
        # Define the output file path for TIFFs using the original file name
        output_folder_tiff = os.path.join(output_root, os.path.basename(os.path.dirname(input_file)))
        os.makedirs(output_folder_tiff, exist_ok=True)
        output_file = os.path.join(output_folder_tiff, os.path.basename(input_file))

        # Reproject and resample
        ds = gdal.Open(input_file)
        if ds is None:
            print(f"Failed to open '{input_file}'. Make sure the file is a valid GeoTIFF file.")
            return None

        warped = gdal.Warp(output_file, ds, dstSRS="EPSG:4326", xRes=output_res,
                           yRes=-output_res, resampleAlg="bilinear")
        # Dropping the references closes the datasets and flushes the output to
        # disk before it is read back.
        warped = None
        ds = None

        # Read the processed TIFF file back and flatten it to a table
        rds = rxr.open_rasterio(output_file)
        rds = rds.squeeze().drop("spatial_ref").drop("band")
        rds.name = "data"
        return rds.to_dataframe().reset_index()
    except Exception as e:
        # Best effort: one unreadable file must not abort a batch conversion.
        print(f"An error occurred: {str(e)}")
        return None

def process_and_convert_viirs(input_dir, output_res):
    """Warp every ``.tif`` in the sub-folders of *input_dir* and save it as CSV.

    Args:
        input_dir: Folder whose sub-folders hold the GeoTIFF files.
        output_res: Pixel size forwarded to ``processing_VIIRS``.
    """
    for entry in os.listdir(input_dir):
        year_dir = os.path.join(input_dir, entry)
        # Only sub-folders are scanned; loose files are ignored
        if not os.path.isdir(year_dir):
            continue

        tif_names = (name for name in os.listdir(year_dir) if name.endswith('.tif'))
        for file_name in tif_names:
            table = processing_VIIRS(os.path.join(year_dir, file_name), output_res)
            if table is None:
                continue

            csv_folder = os.path.join("/home/vgindi/Processed_VIIRS", "VIIRS_csv")
            os.makedirs(csv_folder, exist_ok=True)
            csv_file_path = os.path.join(csv_folder, file_name.replace('.tif', '.csv'))

            table.to_csv(csv_file_path, index=False)
            print(f"Processed and saved {csv_file_path}")

if __name__ == "__main__":
    input_directory = "/home/vgindi/VIIRS_Sierras"
    # processing_VIIRS warps to EPSG:4326, so the resolution is in degrees.
    # The previous value of 100 was meant as metres but was applied as 100
    # degrees per pixel; 0.001 degrees (roughly 100 m) matches the ASO processing.
    output_res = 0.001
    process_and_convert_viirs(input_directory, output_res)

In [None]:
"""
This code cell fetches the cell id using grid_cells_meta_idx metadata for each lat/lon pair for VIIRS csv file
"""
def create_polygon(row):
    """Build the grid-cell polygon from the four corner coordinates in *row*."""
    return Polygon([(row['BL_Coord_Long'], row['BL_Coord_Lat']),
                    (row['BR_Coord_Long'], row['BR_Coord_Lat']),
                    (row['UR_Coord_Long'], row['UR_Coord_Lat']),
                    (row['UL_Coord_Long'], row['UL_Coord_Lat'])])
    
def process_folder(input_folder, metadata_path, output_folder):
    """Assign a grid ``cell_id`` to every pixel of each VIIRS CSV in *input_folder*.

    These are module-level functions, so the stray ``self`` parameter copied
    from the class version has been removed; with it, the three-argument call
    in the driver cell raised TypeError.

    Args:
        input_folder: Folder with CSVs holding ``x``, ``y`` and ``data`` columns.
        metadata_path: Grid-cell metadata CSV with ``cell_id`` and the corner
            coordinate columns.
        output_folder: Destination folder; files already present are skipped.
    """
    # Import the metadata into a pandas DataFrame
    pred_obs_metadata_df = pd.read_csv(metadata_path)

    # Add a column with polygon geometries; the index column only exists when
    # the CSV was saved with its index, so its absence is not an error.
    pred_obs_metadata_df = pred_obs_metadata_df.drop(columns=['Unnamed: 0'], errors='ignore')
    pred_obs_metadata_df['geometry'] = pred_obs_metadata_df.apply(create_polygon, axis=1)

    # Convert the DataFrame to a GeoDataFrame
    metadata = gpd.GeoDataFrame(pred_obs_metadata_df, geometry='geometry')

    # Drop coordinates columns
    metadata = metadata.drop(columns=['BL_Coord_Long', 'BL_Coord_Lat',
                                      'BR_Coord_Long', 'BR_Coord_Lat',
                                      'UR_Coord_Long', 'UR_Coord_Lat',
                                      'UL_Coord_Long', 'UL_Coord_Lat'])

    os.makedirs(output_folder, exist_ok=True)

    # List all CSV files in the input folder
    csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

    for csv_file in csv_files:
        input_path = os.path.join(input_folder, csv_file)
        output_path = os.path.join(output_folder, csv_file)

        # Check if the output file already exists
        if os.path.exists(output_path):
            print(f"CSV file {csv_file} already exists in the output folder.")
            continue

        # Process each CSV file
        viirs_sca_df = pd.read_csv(input_path)

        # Convert the pixels into a GeoDataFrame with point geometries
        viirs_sca_geo = gpd.GeoDataFrame(
            viirs_sca_df,
            geometry=gpd.points_from_xy(viirs_sca_df['x'], viirs_sca_df['y']))

        # The old call passed both predicate='within' and op='intersects';
        # geopandas let `op` win before removing the keyword entirely, so
        # 'intersects' is the predicate that was actually in effect.
        result = gpd.sjoin(viirs_sca_geo, metadata, how='left', predicate='intersects')

        # Select specific columns, rename, and drop pixels outside every cell
        final_df = (result[['y', 'x', 'data', 'cell_id']]
                    .rename(columns={'data': 'VIIRS_SCA'})
                    .dropna(subset=['cell_id']))

        # Save the processed DataFrame to a CSV file
        final_df.to_csv(output_path, index=False)
        print(f"Processed {csv_file}")

if __name__ == "__main__":
    # NOTE(review): all three paths are empty placeholders and must be filled in
    # before running this cell.
    input_folder = r""
    metadata_path = r""
    output_folder = r""
    # Called with three arguments: (input_folder, metadata_path, output_folder).
    process_folder(input_folder, metadata_path, output_folder)