# Data Processing script for the NSM/SWEML v2.0
This notebook uses Python modules to retrieve NASA ASO observations, locate the nearest SNOTEL sites, connect SNOTEL observations with ASO observations, and add geospatial features to the ML training/testing/hindcast dataframes.

# Next steps 
- Revisit other scripts and convert to PyArrow/Parquet Brotli-compressed file storage
- connect precip to DF,
- VIIRS
- add new sites (e.g., regionval) to training DF with all the respective spatial resolution information
- connect regional data together to train model
- connect different regions
- add precipitation phase features (seasonal accumulated rain precip, seasonal accumulated snow precip as a function of temperature)
- explore adding other features stemming from SNOTEL, remote sensing (LULC), Snow Classifications (Sturms), energy balance
- add snotel script to functions

Put all units in SI. Although this should not matter for model training, because the features are normalized, SI units make the data more interpretable.

In [1]:
from ASOget import ASODownload, ASODataProcessing

# Inputs for fetching ASO data for a region
short_name = 'ASO_50M_SWE'
version = '1'
time_start = '2013-04-02T00:00:00Z'
time_end = '2019-07-19T23:59:59Z'
region = 'S_Sierras'
output_res = 100 # desired spatial resolution in meters (m)
directory = "Raw_ASO_Data"
# Relative "<region>/<directory>" path handed to the tif -> parquet converter below.
folder_name = f"{region}/{directory}"

# Get ASO data: resolve the region's bounding box, search for matching
# granules, then download them.
data_tool = ASODownload(short_name, version)
b_box = data_tool.BoundingBox(region)  
# url_list is kept at notebook scope; a later cell iterates over it to print
# the expected local file names.
url_list = data_tool.cmr_search(time_start, time_end, region, b_box)
data_tool.cmr_download(directory, region)

# Convert ASO tifs to parquet
# NOTE(review): the saved output of this cell shows several .tif files failing
# with "No such file or directory" / "not recognized as a supported file
# format" and two dates reported as "Bad file conversion" -- confirm those
# dates ended up with valid parquet files before using them downstream.
data_processor = ASODataProcessing()
data_processor.convert_tiff_to_parquet_multiprocess(folder_name, output_res, region) 

Converting .tif to parquet
Converting 131 ASO tif files to parquet'


100%|██████████| 131/131 [00:00<00:00, 360.55it/s]
ERROR 1: Deleting /home/rjohnson18/SWEMLv2.0/data/ASO/S_Sierras/Processed_100M_SWE/ASO_100M_20170129.tif failed:
No such file or directory


An error occurred: /home/rjohnson18/SWEMLv2.0/data/ASO/S_Sierras/Processed_100M_SWE/ASO_100M_20160426.tif: No such file or directory
An error occurred: '/home/rjohnson18/SWEMLv2.0/data/ASO/S_Sierras/Processed_100M_SWE/ASO_100M_20180601.tif' not recognized as a supported file format.


ERROR 6: Unable to determine files associated with /home/rjohnson18/SWEMLv2.0/data/ASO/S_Sierras/Processed_100M_SWE/ASO_100M_20180601.tif, delete fails.
ERROR 4: Unable to open /home/rjohnson18/SWEMLv2.0/data/ASO/S_Sierras/Processed_100M_SWE/ASO_100M_20180601.tif to obtain file list.


An error occurred: '/home/rjohnson18/SWEMLv2.0/data/ASO/S_Sierras/Processed_100M_SWE/ASO_100M_20190501.tif' not recognized as a supported file format.
Checking to make sure all files successfully converted...


 55%|█████▍    | 54/99 [00:02<00:03, 13.63it/s]

Bad file conversion for ASO_100M_SWE_20180423.parquet, attempting to reprocess
ASO_100M_20180423.tif


 57%|█████▋    | 56/99 [00:03<00:10,  4.13it/s]

Attempt 2
Bad file conversion for ASO_100M_20180423.tif, attempting to reprocess


 81%|████████  | 80/99 [00:05<00:00, 20.05it/s]

Bad file conversion for ASO_100M_SWE_20190324.parquet, attempting to reprocess
ASO_100M_20190324.tif


 84%|████████▍ | 83/99 [00:07<00:03,  4.02it/s]

Attempt 2
Bad file conversion for ASO_100M_20190324.tif, attempting to reprocess


100%|██████████| 99/99 [00:08<00:00, 11.72it/s]


In [2]:
from tqdm import tqdm
import os
import pandas as pd

HOME = os.path.expanduser('~')
region = 'S_Sierras'
output_res = 100

#make cell_id
def make_cell_id(region, output_res, cen_lat, cen_lon):
    """Build a grid-cell identifier of the form ``<region>_<output_res>M_<cen_lat>_<cen_lon>``.

    Each argument is rendered with default f-string formatting, so floating
    point coordinates keep their full printed precision.
    """
    id_parts = [f"{region}", f"{output_res}M", f"{cen_lat}", f"{cen_lon}"]
    return "_".join(id_parts)

aso_swe_files_folder_path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"

print('Loading all Geospatial prediction/observation files and concatenating into one dataframe')
# Try to read every ASO parquet file and report the ones that cannot be read.
# Nothing is accumulated here: after the loop `aso_file` simply holds the last
# file that was read successfully.
for aso_swe_file in tqdm(os.listdir(aso_swe_files_folder_path)):
    try:
        aso_file = pd.read_parquet(os.path.join(aso_swe_files_folder_path, aso_swe_file), engine='fastparquet')
    except Exception as err:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts the
        # loop, and print the reason so unreadable files can be diagnosed.
        print(f"{aso_swe_file}: {err}")


Loading all Geospatial prediction/observation files and concatenating into one dataframe


100%|██████████| 99/99 [00:04<00:00, 21.57it/s]


In [3]:
aso_file

Unnamed: 0,cen_lat,cen_lon,swe_m,cell_id
103986,37.739871,-119.189549,0.0,S_Sierras_100M_37.73987131307046_-119.18954900...
103987,37.739871,-119.188549,0.0,S_Sierras_100M_37.73987131307046_-119.18854900...
103988,37.739871,-119.187549,0.0,S_Sierras_100M_37.73987131307046_-119.18754900...
105174,37.738871,-119.190549,0.0,S_Sierras_100M_37.73887131307046_-119.19054900...
105175,37.738871,-119.189549,0.0,S_Sierras_100M_37.73887131307046_-119.18954900...
...,...,...,...,...
995285,36.989871,-119.640549,0.0,S_Sierras_100M_36.98987131307046_-119.64054900...
995286,36.989871,-119.639549,0.0,S_Sierras_100M_36.98987131307046_-119.63954900...
995287,36.989871,-119.638549,0.0,S_Sierras_100M_36.98987131307046_-119.63854900...
995288,36.989871,-119.637549,0.0,S_Sierras_100M_36.98987131307046_-119.63754900...


In [None]:
import os
import pandas as pd

HOME = os.path.expanduser('~')
region = 'S_Sierras'
output_res = 100
aso_swe_files_folder_path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"
aso_swe_file = "ASO_100M_SWE_20170717.parquet"

aso_file = pd.read_parquet(os.path.join(aso_swe_files_folder_path, aso_swe_file), engine='fastparquet')
aso_file

In [None]:
import pyarrow

pyarrow.__version__

In [None]:
from tqdm import tqdm
import os
import pandas as pd

HOME = os.path.expanduser('~')
region = 'S_Sierras'
output_res = 100

#make cell_id
# NOTE: same definition as the make_cell_id in an earlier cell; repeated here
# so this cell can run on its own.
def make_cell_id(region, output_res, cen_lat, cen_lon):
    """Return the cell identifier ``<region>_<output_res>M_<cen_lat>_<cen_lon>``.

    Values are formatted with default f-string rendering.
    """
    id_parts = (f"{region}", f"{output_res}M", f"{cen_lat}", f"{cen_lon}")
    return "_".join(id_parts)

aso_swe_files_folder_path = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{output_res}M_SWE_parquet"

# Collect the per-date frames in a list and concatenate once at the end;
# calling pd.concat inside the loop re-copies all accumulated rows on every
# iteration.
aso_frames = []
print('Loading all Geospatial prediction/observation files and concatenating into one dataframe')
for aso_swe_file in tqdm(os.listdir(aso_swe_files_folder_path)):
    try:
        aso_file = pd.read_parquet(os.path.join(aso_swe_files_folder_path, aso_swe_file))
    except Exception as err:
        # Report unreadable files (with the reason) and keep going; a bare
        # except would also swallow KeyboardInterrupt.
        print(f"{aso_swe_file}: {err}")
        continue
    aso_frames.append(aso_file)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file could be read.
metadf = pd.concat(aso_frames) if aso_frames else pd.DataFrame()





In [None]:
# Make cell ids: one id string per row, built from the row's centroid
# latitude/longitude plus the notebook-level `region` and `output_res`.
# NOTE(review): ASO_meta_loc_DF is not assigned by any earlier cell in this
# notebook (the only assignment is a pd.read_csv in a later cell), so this
# cell depends on out-of-order execution -- confirm which frame is intended.
print('Identifying unique locations')
# Registers DataFrame.progress_apply so the row-wise apply shows a progress bar.
tqdm.pandas()
ASO_meta_loc_DF['cell_id'] = ASO_meta_loc_DF.progress_apply(lambda row: make_cell_id(region, output_res, row['cen_lat'], row['cen_lon']), axis=1)

# Disabled export. NOTE(review): as written it would save CSV text under a
# ".parquet" file name.
#ASO_meta_loc_DF.to_csv(f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/ASO_meta.parquet")

In [None]:
metadf = ASO_meta_loc_DF.drop_duplicates('cell_id').set_index('cell_id')
metadf.pop('swe_m')
metadf

In [None]:
100/111111

In [None]:
def bounding_box(x_coordinate, y_coordinate, output_res):
    """Return the corner coordinates of a square grid cell centred on a point.

    Parameters
    ----------
    x_coordinate : float
        Longitude of the cell centroid, in degrees.
    y_coordinate : float
        Latitude of the cell centroid, in degrees.
    output_res : float
        Edge length of the cell, in metres.

    Returns
    -------
    tuple
        ``(BL_Coord_Long, BL_Coord_Lat, BR_Coord_Long, BR_Coord_Lat,
        UR_Coord_Long, UR_Coord_Lat, UL_Coord_Long, UL_Coord_Lat)``

    Notes
    -----
    The metre-to-degree conversion uses a flat 111,111 m per degree for both
    latitude and longitude. NOTE(review): a degree of longitude is shorter
    than that away from the equator, so cells built this way are narrower
    than ``output_res`` in the east-west direction -- confirm this
    approximation is acceptable.
    """
    # General rule: there are ~111,111 m in one degree. Divide by two because
    # the given point is the centroid, so each edge sits half a cell away.
    degs = (output_res/111111)/2

    # Bottom left
    BL_Coord_Long = x_coordinate-degs
    BL_Coord_Lat = y_coordinate-degs

    # Upper left
    UL_Coord_Long = x_coordinate-degs
    UL_Coord_Lat = y_coordinate+degs

    # Upper right
    UR_Coord_Long = x_coordinate+degs
    UR_Coord_Lat = y_coordinate+degs

    # Bottom right
    BR_Coord_Long = x_coordinate+degs
    BR_Coord_Lat = y_coordinate-degs

    return BL_Coord_Long, BL_Coord_Lat, BR_Coord_Long, BR_Coord_Lat, UR_Coord_Long, UR_Coord_Lat, UL_Coord_Long, UL_Coord_Lat

In [None]:
test = metadf.head(10)


In [None]:
from shapely.geometry import Point, Polygon

def create_polygon(row):
    """Build the cell footprint Polygon from a row's four corner columns.

    The ring is assembled bottom-left -> bottom-right -> upper-right ->
    upper-left, each vertex given as (longitude, latitude).
    """
    bottom_left = (row['BL_Coord_Long'], row['BL_Coord_Lat'])
    bottom_right = (row['BR_Coord_Long'], row['BR_Coord_Lat'])
    upper_right = (row['UR_Coord_Long'], row['UR_Coord_Lat'])
    upper_left = (row['UL_Coord_Long'], row['UL_Coord_Lat'])
    return Polygon([bottom_left, bottom_right, upper_right, upper_left])

In [None]:
aso_swe_file = 'ASO_100M_SWE_20190324.parquet'

pd.read_parquet(os.path.join(aso_swe_files_folder_path, aso_swe_file))

In [None]:
len(list(set(cell_ids)))

In [None]:
cell_ids

In [None]:
# Order-preserving de-duplication of cell_ids. dict.fromkeys keeps the first
# occurrence of each id and runs in linear time, unlike a list-membership scan
# repeated for every element.
res = list(dict.fromkeys(cell_ids))
len(res)

In [None]:
import numpy as np
import xarray as xr
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

# Resolve the user's home directory (no access key is actually loaded here).
HOME = os.path.expanduser('~')

# Path to one ASO parquet file; the saved output of the first cell reports
# this date (20180423) as a "Bad file conversion".
file = f"{HOME}/SWEMLv2.0/data/ASO/S_Sierras/100M_SWE_parquet/ASO_100M_SWE_20180423.parquet"

# `file` is rebound from the path string to the loaded DataFrame.
file = pd.read_parquet(file)

file

In [None]:
import os
HOME = os.path.expanduser('~')

region = 'S_Sierras'
directory = "Raw_ASO_Data"
folder = f"{HOME}/SWEMLv2.0/data/ASO/{region}/{directory}"


for index, url in enumerate(url_list, start=1):
    filename = os.path.join(folder, url.split('/')[-1])  # Specify the full path to the file
    print(filename)
    #print('{0}/{1}: {2}'.format(str().zfill(len(str(len(url_list)))), len(url_list), filename))

In [None]:
folder_name, output_res, region

# Code for generating ML dataframe using nearest in situ monitoring sites

In [None]:
import GeoDF

# GeoDF is used to create a dataframe for ML model development. Its function is to connect in situ observations to gridded locations.
region = 'S_Sierras' # TODO: should be set once in an earlier code block
output_res = 100

# Load SNOTEL metadata locations for the region's cell ids.
# (presumably matched with a haversine distance inside GeoDF -- TODO confirm)
GeoDF.fetch_snotel_sites_for_cellids(region) # Using known up-to-date sites; can this be threaded?

# Get geophysical attributes for each site; still need to see how to pass the output resolution
gdf = GeoDF.GeoSpatial(region)
#gdf = gdf.head(100)
# Use the geodataframe with lat/long metadata of all sites to determine slope, aspect, and elevation.
# NOTE: this rebinds `metadf`, which earlier cells also assign.
metadf = GeoDF.extract_terrain_data_threaded(gdf, region)




In [None]:
import Obs_to_DF
region = "S_Sierras"
output_res = 100

#Connect nearest snotel observations with ASO data, makes a parquet file for each date
finaldf = Obs_to_DF.Nearest_Snotel_2_obs_MultiProcess(region, output_res) 

In [None]:
import GeoDF

region = 'S_Sierras'
output_res = 100

#Connect cell ids with ASO obs and snotel obs to geospatial features
GeoDF.add_geospatial_threaded(region, output_res)

In [None]:
import get_Precip

#gets precipitation for each location, accumulates it through the water year

#set start/end date for a water year
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019]
region = 'S_Sierras'
output_res = 100
for year in years:
    get_Precip.get_precip_threaded(year, region, output_res)

In [None]:
import os
import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq

HOME = os.path.expanduser('~')

region = 'S_Sierras'
year = 2013
output_res = 100

# Folder holding the NLDAS precipitation files for one region/resolution/year.
Precippath = f"{HOME}/SWEMLv2.0/data/Precipitation/{region}/{output_res}M_NLDAS_Precip/{year}"

# NOTE(review): the file name ends in ".parquet" but it is read with
# pd.read_csv. The following cells re-save `ppt` as Brotli-compressed Parquet
# under a "PYARROW_" name, so this file is presumably CSV text saved under a
# .parquet name -- confirm before switching to pd.read_parquet.
# The date in the file name is hard-coded and is independent of `year` above.
ppt = pd.read_csv(f"{Precippath}/NLDAS_PPT_2013-04-03.parquet")

# Index by cell id (mutates `ppt` in place, so re-running this line alone fails).
ppt.set_index('cell_id', inplace=True)

ppt

In [None]:
#Convert DataFrame to Apache Arrow Table
table = pa.Table.from_pandas(ppt)

# Parquet with Brotli compression
pq.write_table(table, f"{Precippath}/PYARROW_NLDAS_PPT_2013-04-03.parquet", compression='BROTLI')

In [None]:
pptparquet = pd.read_parquet(f"{Precippath}/PYARROW_NLDAS_PPT_2013-04-03.parquet")
pptparquet

In [None]:
ppt

In [None]:
# Merge with metadata
# NOTE(review): `final_df` and `metadata` are not assigned by any cell in this
# notebook (other cells use `finaldf`), so this cell depends on hidden kernel
# state -- confirm where they come from.
req_cols = ['cell_id', 'lat', 'lon', 'BR_Coord_Long', 'BR_Coord_Lat', 'UR_Coord_Long', 'UR_Coord_Lat',
            'UL_Coord_Long', 'UL_Coord_Lat', 'BL_Coord_Long', 'BL_Coord_Lat', 'geometry']
# Left join keeps every row of final_df, matched to its cell's metadata by cell_id.
Result = final_df.merge(metadata[req_cols], how='left', on='cell_id')

# Column renaming and ordering ('geometry' is merged in above but not kept below)
Result.rename(columns={'swe': 'ASO_SWE_in'}, inplace=True)
Result = Result[['cell_id', 'Date', 'ASO_SWE_in', 'lat', 'lon', 'nearest site 1', 'nearest site 2',
                    'nearest site 3', 'nearest site 4', 'nearest site 5', 'nearest site 6',
                    'BR_Coord_Long', 'BR_Coord_Lat', 'UR_Coord_Long', 'UR_Coord_Lat',
                    'UL_Coord_Long', 'UL_Coord_Lat', 'BL_Coord_Long', 'BL_Coord_Lat']]

# Save the merged data to a new file
# NOTE(review): the target name ends in ".parquet" but the data is written
# with to_csv, so the file contains CSV text; readers must use pd.read_csv.
# The path also uses "SWEML/data/NSMv2.0" rather than the "SWEMLv2.0" root
# used by the other cells -- confirm both are intended.
output_filename = f"{HOME}/SWEML/data/NSMv2.0/data/TrainingDFs/Merged_aso_snotel_data.parquet"
Result.to_csv(output_filename, index=False)
display(Result.head(10))
print("Processed and saved data")

In [None]:
region = 'S_Sierras'
# Load the ASO location metadata for the region.
# NOTE(review): the file is named ".parquet" but read with pd.read_csv; this
# matches the (commented-out) to_csv export in an earlier cell, so the file is
# presumably CSV text -- confirm. Earlier cells already use ASO_meta_loc_DF,
# so this cell has to run before them.
ASO_meta_loc_DF = pd.read_csv(f"{HOME}/SWEMLv2.0/data/TrainingDFs/{region}/ASO_meta.parquet")

In [None]:
#Connect nearest snotel with ASO data, this should be last for now, need to add geophysical characteristics to the site first, then this...
finaldf = GeoDF.Nearest_Snotel_2_obs(region, output_res, dropna = True) 

In [None]:
"""
A Simple implementation of parallel processing using concurrency it takes so long to execute,
Explore terrain_daskconcurrency and terrain-processing_cluster python for more optimized implementations.
"""

def process_single_location(args):
    """Look up elevation, slope and aspect for one point from a Copernicus DEM tile.

    Parameters
    ----------
    args : tuple
        ``(lat, lon, regions, tiles)``: the point's latitude and longitude in
        decimal degrees, a DataFrame indexed by tile id with a ``sliceID``
        column, and the list of STAC items that ``sliceID`` indexes into.

    Returns
    -------
    tuple
        ``(elev, slop, asp)``: rounded elevation of the nearest DEM pixel,
        rounded slope in degrees, and rounded aspect in compass degrees
        (0 = north, clockwise; the direction the slope faces).

    Raises
    ------
    KeyError
        If the tile covering the point is not present in ``regions``.

    Notes
    -----
    Results are memoised in the module-level ``elevation_cache`` dict, which
    must exist before this is called (``extract_terrain_data_threaded``
    creates it). Only north-latitude / west-longitude tiles are addressed,
    as in the original tile-id template.
    NOTE(review): the whole tile is opened and differentiated for every
    point; caching per tile would avoid repeating that work.
    """
    lat, lon, regions, tiles = args

    cache_key = (lat, lon)
    if cache_key in elevation_cache:
        return elevation_cache[cache_key]

    # Copernicus tiles are named after their south-west corner: N<lat> uses the
    # latitude (two digits) and W<lon> the longitude (three digits). The
    # original template had the two coordinates swapped.
    tile_id = (
        f"Copernicus_DSM_COG_30_N{math.floor(lat):02d}_00_"
        f"W{math.ceil(abs(lon)):03d}_00_DEM"
    )
    index_id = regions.loc[tile_id]['sliceID']

    signed_asset = planetary_computer.sign(tiles[index_id].assets["data"])
    elevation = rxr.open_rasterio(signed_asset.href)

    slope = elevation.copy()
    aspect = elevation.copy()

    # always_xy=True -> the transformer takes and returns (x=lon, y=lat).
    transformer = Transformer.from_crs("EPSG:4326", elevation.rio.crs, always_xy=True)
    xx, yy = transformer.transform(lon, lat)

    # Whole-metre elevations as float32. The original obtained the same array
    # by writing it to an in-memory GDAL Float32 band and reading it back.
    dem = np.around(elevation.values[0]).astype(int).astype(np.float32)

    # np.gradient on a 2-D array returns the derivative along rows first, then
    # along columns (elevation change per pixel).
    d_row, d_col = np.gradient(dem)

    # Nominal ground size of one pixel of the 90 m DEM collection, used to
    # turn per-pixel rise into a slope angle. NOTE(review): the true east-west
    # spacing shrinks with latitude -- refine if slope accuracy matters.
    cell_size_m = 90.0
    slope_arr = np.degrees(np.arctan(np.hypot(d_row, d_col) / cell_size_m))

    # Downslope direction as a compass bearing. Assumes a north-up raster
    # (row index increases southward, column index eastward) -- TODO confirm.
    aspect_arr = np.degrees(np.arctan2(-d_col, d_row)) % 360.0

    slope.values[0] = slope_arr
    aspect.values[0] = aspect_arr

    elev = round(elevation.sel(x=xx, y=yy, method="nearest").values[0])
    slop = round(slope.sel(x=xx, y=yy, method="nearest").values[0])
    asp = round(aspect.sel(x=xx, y=yy, method="nearest").values[0])

    elevation_cache[cache_key] = (elev, slop, asp)
    return elev, slop, asp

def extract_terrain_data_threaded(metadata_df, bounding_box, max_workers=10):
    """Add elevation, slope and aspect columns to ``metadata_df`` using a thread pool.

    Parameters
    ----------
    metadata_df : pandas.DataFrame
        Must contain ``cen_lat`` and ``cen_lon`` columns. It is modified in
        place: ``Elevation_m``, ``Slope_Deg`` and ``Aspect_L`` are added.
    bounding_box : tuple
        ``((min_x, min_y), (max_x, max_y))`` corner pair used to search the
        Planetary Computer STAC catalogue for DEM tiles.
    max_workers : int, optional
        Number of worker threads (default 10).

    Returns
    -------
    None

    Notes
    -----
    Resets the module-level ``elevation_cache`` used by
    ``process_single_location``.
    """
    global elevation_cache

    elevation_cache = {}
    min_x, min_y, max_x, max_y = *bounding_box[0], *bounding_box[1]

    client = Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        ignore_conformance=True,
    )

    search = client.search(
        collections=["cop-dem-glo-90"],
        intersects={
            "type": "Polygon",
            "coordinates": [[
                [min_x, min_y],
                [max_x, min_y],
                [max_x, max_y],
                [min_x, max_y],
                [min_x, min_y],
            ]],
        },
    )

    tiles = list(search.items())

    # Lookup table: tile id (index) -> position of that tile in `tiles`.
    print("Retrieving Copernicus 90m DEM tiles")
    regions = pd.DataFrame(
        {'sliceID': range(len(tiles))},
        index=pd.Index([tile.id for tile in tqdm(tiles)], name='tileID'),
    )

    print("Interpolating Grid Cell Spatial Features")

    locations = list(zip(metadata_df['cen_lat'], metadata_df['cen_lon']))
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_single_location, (lat, lon, regions, tiles))
                   for lat, lon in tqdm(locations)]

        # as_completed only drives the progress bar. Results are read from
        # `futures` in submission order below, because completion order is
        # arbitrary and would pair terrain values with the wrong rows.
        for _ in tqdm(as_completed(futures), total=len(futures)):
            pass

        results = [future.result() for future in futures]

    # zip(*[]) yields nothing to unpack, so handle an empty frame explicitly.
    if results:
        elevations, slopes, aspects = zip(*results)
    else:
        elevations, slopes, aspects = (), (), ()

    metadata_df['Elevation_m'] = list(elevations)
    metadata_df['Slope_Deg'] = list(slopes)
    metadata_df['Aspect_L'] = list(aspects)

In [None]:
# NOTE(review): absolute path into one user's home directory -- not portable.
metadata_df = pd.read_csv(r"/home/vgindi/Provided_Data/Merged_aso_nearest_sites1.csv")
# Work on the first 20 rows only.
metadata_df= metadata_df.head(20)
# ((min_x, min_y), (max_x, max_y)) search extent in degrees.
# NOTE: this rebinds the name `bounding_box`, shadowing the bounding_box()
# function defined in an earlier cell for the rest of the kernel session.
bounding_box = ((-120.3763448720203, 36.29256774541929), (-118.292253412863, 38.994985247736324))    
    
# Adds the terrain columns to metadata_df in place.
extract_terrain_data_threaded(metadata_df, bounding_box)

# Display the results
metadata_df.head(10)

In [None]:
"""
This code block crops the global coverage VIIRS data to south sierras subregion. 
"""

def crop_sierras(input_file_path, output_file_path, shapes):
    """Crop a raster to the given shapes and write the result as a GeoTIFF.

    Parameters
    ----------
    input_file_path : str
        Raster to read.
    output_file_path : str
        Destination GeoTIFF path.
    shapes : list
        Geometries passed to ``rasterio.mask.mask``; the raster is cropped to
        their extent.

    Notes
    -----
    ``rasterio.mask`` is a submodule and needs its own ``import rasterio.mask``.
    """
    with rasterio.open(input_file_path) as src:
        out_image, out_transform = rasterio.mask.mask(src, shapes, crop=True)
        # Dataset profiles are exposed as `.meta` (there is no `.out_meta`
        # attribute); copy it so the source's metadata is not modified.
        out_meta = src.meta.copy()

    # Cropping changes the raster size and georeferencing, so overwrite those
    # fields. out_image is (bands, rows, cols).
    out_meta.update({"driver": "GTiff",
                     "height": out_image.shape[1],
                     "width": out_image.shape[2],
                     "transform": out_transform})

    with rasterio.open(output_file_path, "w", **out_meta) as dest:
        dest.write(out_image)

def download_viirs_sca(input_dir, output_dir, shapefile_path):
    """Crop every VIIRS ``.tif`` under ``input_dir`` to the shapefile's geometries.

    Despite the name, nothing is downloaded: files already on disk are cropped
    with ``crop_sierras`` and written to ``output_dir/<year>/``.

    Parameters
    ----------
    input_dir : str
        Directory containing one sub-folder per year (names such as 'WY2013').
    output_dir : str
        Root directory for the cropped files.
    shapefile_path : str
        Shapefile whose feature geometries define the crop area.

    Notes
    -----
    Output names keep only the first three underscore-separated parts of the
    input name, so inputs that share those parts overwrite each other.
    """
    # Load shapes from the shapefile
    with fiona.open(shapefile_path, 'r') as shapefile:
        shapes = [feature["geometry"] for feature in shapefile]

    # Iterate through each year directory in the input directory
    for year_folder in os.listdir(input_dir):
        year_folder_path = os.path.join(input_dir, year_folder)
        if not os.path.isdir(year_folder_path):
            continue

        # Extract the year from the folder name (assuming names like 'WY2013').
        # Skip folders without a 4-digit year instead of crashing on `.group()`.
        year_match = re.search(r'\d{4}', year_folder)
        if year_match is None:
            print(f"Skipping {year_folder_path}: no 4-digit year in folder name")
            continue
        output_year_folder = os.path.join(output_dir, year_match.group())
        os.makedirs(output_year_folder, exist_ok=True)

        for file_name in os.listdir(year_folder_path):
            if not file_name.endswith('.tif'):
                continue
            # Strip the extension before splitting so a name with three or
            # fewer parts does not end up as '<name>.tif.tif'.
            stem = os.path.splitext(file_name)[0]
            output_file_name = '_'.join(stem.split('_')[:3]) + '.tif'
            output_file_path = os.path.join(output_year_folder, output_file_name)
            input_file_path = os.path.join(year_folder_path, file_name)
            crop_sierras(input_file_path, output_file_path, shapes)
            print(f"Processed and saved {output_file_path}")

if __name__ == "__main__":
    
    input_directory = r"/home/vgindi/VIIRS_Data"
    output_directory = r"/home/vgindi/VIIRS_Sierras"
    shapefile_path = r"/home/vgindi/Provided_Data/low_sierras_points.shp"
    download_viirs_sca(input_directory, output_directory, shapefile_path)

In [None]:
"""
This code cell transforms the raw VIIRS tiff files to 100m resolution and saves each file in .csv format
"""
def processing_VIIRS(input_file, output_res):
    """Reproject one VIIRS GeoTIFF to EPSG:4326 at ``output_res`` metres and return it as a DataFrame.

    Parameters
    ----------
    input_file : str
        Path to the source GeoTIFF. The name of its parent folder is reused as
        the output sub-folder.
    output_res : float
        Desired pixel size in metres.

    Returns
    -------
    pandas.DataFrame or None
        One row per pixel with the pixel value in a ``data`` column, or
        ``None`` if the file could not be opened or processed (a message is
        printed in that case).
    """
    try:
        # Define the output file path for TIFFs using the original file name
        output_folder_tiff = os.path.join("/home/vgindi/Processed_VIIRS", os.path.basename(os.path.dirname(input_file)))
        os.makedirs(output_folder_tiff, exist_ok=True)
        output_file = os.path.join(output_folder_tiff, os.path.basename(input_file))

        # Reproject and resample
        ds = gdal.Open(input_file)
        if ds is None:
            print(f"Failed to open '{input_file}'. Make sure the file is a valid GeoTIFF file.")
            return None

        # The target CRS (EPSG:4326) is geographic, so the warp resolution is
        # in degrees. Convert metres to degrees with the ~111,111 m per degree
        # rule; passing the metre value directly would request pixels that are
        # `output_res` degrees wide.
        res_deg = output_res / 111111
        warped = gdal.Warp(output_file, ds, dstSRS="EPSG:4326", xRes=res_deg, yRes=-res_deg, resampleAlg="bilinear")
        if warped is None:
            print(f"Failed to warp '{input_file}'.")
            return None
        # Dropping the references closes the GDAL datasets and flushes the
        # warped file to disk before it is reopened below.
        warped = None
        ds = None

        # Read the processed TIFF file and flatten it to one row per pixel
        rds = rxr.open_rasterio(output_file)
        rds = rds.squeeze().drop("spatial_ref").drop("band")
        rds.name = "data"
        df = rds.to_dataframe().reset_index()
        return df
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

def process_and_convert_viirs(input_dir, output_res):
    """Convert every ``.tif`` found in the sub-folders of ``input_dir`` to CSV.

    Each file is passed to ``processing_VIIRS``; when that returns a frame it
    is saved under ``/home/vgindi/Processed_VIIRS/VIIRS_csv`` with the same
    base name and a ``.csv`` extension. Files that fail to process are skipped.
    """
    for year_name in os.listdir(input_dir):
        year_path = os.path.join(input_dir, year_name)
        # Only sub-directories are scanned; loose files are ignored.
        if not os.path.isdir(year_path):
            continue

        for tif_name in os.listdir(year_path):
            if not tif_name.endswith('.tif'):
                continue

            viirs_df = processing_VIIRS(os.path.join(year_path, tif_name), output_res)
            if viirs_df is None:
                continue

            csv_folder = os.path.join("/home/vgindi/Processed_VIIRS", "VIIRS_csv")
            os.makedirs(csv_folder, exist_ok=True)
            csv_file_path = os.path.join(csv_folder, tif_name.replace('.tif', '.csv'))

            viirs_df.to_csv(csv_file_path, index=False)
            print(f"Processed and saved {csv_file_path}")

if __name__ == "__main__":
    input_directory = "/home/vgindi/VIIRS_Sierras"
    output_res = 100  # Desired resolution in meters
    process_and_convert_viirs(input_directory, output_res)

In [None]:
"""
This code cell fetches the cell id using grid_cells_meta_idx metadata for each lat/lon pair for VIIRS csv file
"""
def create_polygon(row):
    """Build the cell footprint Polygon from a row's corner-coordinate columns.

    Defined without ``self`` so it matches the ``create_polygon(row)`` defined
    earlier in the notebook and works directly with ``DataFrame.apply``.
    """
    return Polygon([(row['BL_Coord_Long'], row['BL_Coord_Lat']),
                    (row['BR_Coord_Long'], row['BR_Coord_Lat']),
                    (row['UR_Coord_Long'], row['UR_Coord_Lat']),
                    (row['UL_Coord_Long'], row['UL_Coord_Lat'])])


def process_folder(input_folder, metadata_path, output_folder):
    """Attach a ``cell_id`` to every VIIRS pixel in each CSV of ``input_folder``.

    Parameters
    ----------
    input_folder : str
        Folder of VIIRS CSV files with ``x``, ``y`` and ``data`` columns.
    metadata_path : str
        CSV of grid-cell metadata containing ``cell_id`` and the eight
        ``*_Coord_Long`` / ``*_Coord_Lat`` corner columns.
    output_folder : str
        Folder for the processed CSVs (created if missing). Files that already
        exist there are skipped.

    Notes
    -----
    These functions were written as class methods; the ``self`` parameter is
    removed here because the notebook calls them as plain functions.
    """
    # Import the metadata into a pandas DataFrame
    pred_obs_metadata_df = pd.read_csv(metadata_path)

    # Drop the saved index column if present, then add one polygon per cell.
    pred_obs_metadata_df = pred_obs_metadata_df.drop(columns=['Unnamed: 0'], errors='ignore')
    pred_obs_metadata_df['geometry'] = pred_obs_metadata_df.apply(create_polygon, axis=1)

    # Convert the DataFrame to a GeoDataFrame
    metadata = gpd.GeoDataFrame(pred_obs_metadata_df, geometry='geometry')

    # Drop coordinates columns (now encoded in the geometry)
    metadata = metadata.drop(columns=['BL_Coord_Long', 'BL_Coord_Lat',
                                      'BR_Coord_Long', 'BR_Coord_Lat',
                                      'UR_Coord_Long', 'UR_Coord_Lat',
                                      'UL_Coord_Long', 'UL_Coord_Lat'])

    os.makedirs(output_folder, exist_ok=True)

    # List all CSV files in the input folder
    csv_files = [f for f in os.listdir(input_folder) if f.endswith('.csv')]

    for csv_file in csv_files:
        input_path = os.path.join(input_folder, csv_file)
        output_path = os.path.join(output_folder, csv_file)

        # Check if the output file already exists
        if os.path.exists(output_path):
            print(f"CSV file {csv_file} already exists in the output folder.")
            continue

        # Process each CSV file
        viirs_sca_df = pd.read_csv(input_path)

        # Convert the VIIRS frame into a GeoDataFrame with point geometries
        geometry = [Point(xy) for xy in zip(viirs_sca_df['x'], viirs_sca_df['y'])]
        viirs_sca_geo = gpd.GeoDataFrame(viirs_sca_df, geometry=geometry)

        # The original passed both predicate='within' and the deprecated
        # op='intersects'. Where `op` was still accepted it overrode
        # `predicate`, so 'intersects' is what actually ran; newer geopandas
        # rejects `op` altogether.
        result = gpd.sjoin(viirs_sca_geo, metadata, how='left', predicate='intersects')

        # Select and rename the final columns without mutating a slice in place
        Final_df = result[['y', 'x', 'data', 'cell_id']].rename(columns={'data': 'VIIRS_SCA'})

        # Drop rows where 'cell_id' is NaN (pixels outside every grid cell)
        Final_df = Final_df.dropna(subset=['cell_id'])

        # Save the processed DataFrame to a CSV file
        Final_df.to_csv(output_path, index=False)
        print(f"Processed {csv_file}")


if __name__ == "__main__":
    input_folder = r""
    metadata_path = r""
    output_folder = r""
    # The paths above are placeholders; only run once they have been filled in.
    if input_folder and metadata_path and output_folder:
        process_folder(input_folder, metadata_path, output_folder)
    else:
        print("Set input_folder, metadata_path and output_folder before running process_folder.")

In [None]:
#Applying polygon geometries
# input_folder = f"ASO/{region}/{output_res}M_SWE_parquet/"
# metadata_file = f"grid_cells_meta.csv"
# output_folder = f"ASO/{region}/Processed_SWE"
# data_processor = ASODataProcessing()
# data_processor.process_folder(input_folder, metadata_file, output_folder) 