In [None]:
import xarray as xr
import pandas as pd
import numpy as np
from pathlib import Path
import gc
import os


def convert_nc_to_parquet(nc_filepath, output_filepath=None, chunk_size=None):
    """
    Convert NetCDF file to Parquet format

    NOTE: a later cell in this notebook defines a function with the same name;
    once that cell has run, this version is shadowed.

    Parameters:
    nc_filepath (str): Path to the .nc file
    output_filepath (str): Path for output .parquet file (optional; defaults to
        nc_filepath with its suffix replaced by .parquet)
    chunk_size (int): Number of rows per output chunk file for large files
        (optional)

    Returns:
    The output path (a Path when derived from nc_filepath, otherwise the value
    passed in). When chunk_size is given, nothing is written to that exact
    path; the data goes to sibling files named <stem>_chunk_<k><suffix>.

    Raises:
    ValueError: if chunk_size is given but is not a positive number
    """
    if chunk_size is not None and chunk_size <= 0:
        # A negative step would make the chunk loop silently write nothing.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    print(f"Loading NetCDF file: {nc_filepath}")
    # The context manager releases the file handle even if the conversion fails.
    with xr.open_dataset(nc_filepath) as ds:
        print(ds)

        print("\nConverting to DataFrame...")
        df = ds.to_dataframe()

    print(f"\nDataFrame shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")
    print("\nDataFrame info:")
    df.info()

    if output_filepath is None:
        output_filepath = Path(nc_filepath).with_suffix('.parquet')

    print(f"\nSaving to Parquet: {output_filepath}")

    if chunk_size is not None:
        print(f"Processing in chunks of {chunk_size:,} rows...")
        out_path = Path(output_filepath)

        for chunk_no, start in enumerate(range(0, len(df), chunk_size)):
            # Reset the index per slice so the full frame is never copied at once.
            chunk = df.iloc[start:start + chunk_size].reset_index()
            # Build the name from stem/suffix so only the file name is altered,
            # whatever the directory is called or the extension happens to be.
            chunk_filepath = out_path.with_name(
                f"{out_path.stem}_chunk_{chunk_no}{out_path.suffix}"
            )
            chunk.to_parquet(chunk_filepath, index=False)
            print(f"Saved chunk {chunk_no + 1}: {chunk_filepath}")
    else:
        df.to_parquet(output_filepath)
        print(f"Successfully saved to: {output_filepath}")

        # The size comparison only makes sense for the single-file output.
        nc_size = Path(nc_filepath).stat().st_size / 1024**2
        parquet_size = Path(output_filepath).stat().st_size / 1024**2
        print(f"\nFile size comparison:")
        print(f"Original .nc file: {nc_size:.2f} MB")
        print(f"Parquet file: {parquet_size:.2f} MB")
        print(f"Compression ratio: {nc_size/parquet_size:.2f}x")

    return output_filepath

nc_file = "Weather_data_bi_2003.nc"
# Remember which path produced the output: a chunked run writes
# <stem>_chunk_<k>.parquet files instead of the single file it returns.
used_chunked_fallback = False

try:
    parquet_file = convert_nc_to_parquet(nc_file)
    print("\nConversion completed successfully!")

except MemoryError:
    print("Memory error - trying chunked approach...")
    parquet_file = convert_nc_to_parquet(nc_file, chunk_size=1000000)  # 1M rows per chunk
    used_chunked_fallback = True


# Verification: read the output back and show its structure.
try:
    verify_file = Path(parquet_file)
    if used_chunked_fallback:
        # Only the first chunk is loaded; reading every chunk back would
        # recreate the memory pressure that triggered the fallback.
        verify_file = verify_file.with_name(
            f"{verify_file.stem}_chunk_0{verify_file.suffix}"
        )

    df_parquet = pd.read_parquet(verify_file)
    print(f"Parquet file shape: {df_parquet.shape}")
    print(f"Columns: {list(df_parquet.columns)}")
    print("\nFirst few rows:")
    print(df_parquet.head())

    print("\nData types:")
    print(df_parquet.dtypes)

except Exception as e:
    print(f"Error loading parquet file: {e}")


In [None]:
# Batch conversion: loop over every weather variable for one year

In [None]:
def convert_nc_to_parquet(nc_filepath, output_filepath=None, chunk_size=None):
    """
    Convert NetCDF file to Parquet format

    Parameters:
    nc_filepath (str): Path to the .nc file
    output_filepath (str): Path for output .parquet file (optional; defaults to
        nc_filepath with its suffix replaced by .parquet)
    chunk_size (int): Number of rows per output chunk file for large files
        (optional)

    Returns:
    The output path on success, or None if the conversion failed (the error is
    printed). When chunk_size is given, nothing is written to that exact path;
    the data goes to sibling files named <stem>_chunk_<k><suffix>.

    Raises:
    MemoryError: re-raised so that a caller can retry with a chunk_size
    """
    df = None
    try:
        if chunk_size is not None and chunk_size <= 0:
            # A negative step would make the chunk loop silently write nothing.
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

        print(f"Loading NetCDF file: {nc_filepath}")
        # The context manager closes the file handle on every exit path,
        # so no manual ds.close() bookkeeping is needed in the handlers.
        with xr.open_dataset(nc_filepath) as ds:
            print(f"Dataset structure for {Path(nc_filepath).name}:")
            print(f"Dimensions: {dict(ds.sizes)}")
            print(f"Variables: {list(ds.data_vars)}")

            print("Converting to DataFrame")
            df = ds.to_dataframe()

        print(f"DataFrame shape: {df.shape}")
        print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

        if output_filepath is None:
            output_filepath = Path(nc_filepath).with_suffix('.parquet')

        print(f"Saving to Parquet: {output_filepath}")
        if chunk_size is not None:
            print(f"Processing in chunks of {chunk_size:,} rows...")
            out_path = Path(output_filepath)

            for chunk_no, start in enumerate(range(0, len(df), chunk_size)):
                # Reset the index per slice so the full frame is never copied at once.
                chunk = df.iloc[start:start + chunk_size].reset_index()
                # Build the name from stem/suffix so only the file name is altered,
                # whatever the directory is called or the extension happens to be.
                chunk_filepath = out_path.with_name(
                    f"{out_path.stem}_chunk_{chunk_no}{out_path.suffix}"
                )
                chunk.to_parquet(chunk_filepath, index=False)
                print(f"Saved chunk {chunk_no + 1}: {chunk_filepath}")

                del chunk
                gc.collect()
        else:
            df.to_parquet(output_filepath)
            print(f"Successfully saved to: {output_filepath}")

            # The size comparison only makes sense for the single-file output.
            nc_size = Path(nc_filepath).stat().st_size / 1024**2
            parquet_size = Path(output_filepath).stat().st_size / 1024**2
            print(f"File size comparison:")
            print(f"Original .nc file: {nc_size:.2f} MB")
            print(f"Parquet file: {parquet_size:.2f} MB")
            print(f"Compression ratio: {nc_size/parquet_size:.2f}x")

        return output_filepath

    except MemoryError:
        # MemoryError is a subclass of Exception; handle it first and re-raise
        # so it is not reported as a generic failure (None).
        print(f"Out of memory while processing {nc_filepath}")
        raise

    except Exception as e:
        print(f"Error processing {nc_filepath}: {str(e)}")
        return None

    finally:
        # Drop the (possibly huge) frame before control returns to the caller;
        # a propagating traceback would otherwise keep it alive via this frame.
        df = None
        gc.collect()

def process_all_weather_files(year=2015, chunk_size=None, data_dir=None):
    """
    Process all weather data files for a given year

    Parameters:
    year (int): Year to process (default 2015)
    chunk_size (int): Chunk size for large files (optional)
    data_dir (str or Path): Folder holding the Weather_data_<var>_<year>.nc
        files (optional; defaults to the current working directory)

    Returns:
    tuple: (successful_conversions, failed_conversions). The first item is a
        list of variable names; the second is a list of (variable, reason)
        tuples.
    """
    fallback_chunk_rows = 1000000  # 1M rows per chunk

    weather_vars = [
        'bi', 'etr', 'fm100', 'fm1000', 'metdata_elevationdata',
        'pet', 'pr', 'rmax', 'rmin', 'sph', 'srad', 'th',
        'tmmn', 'tmmx', 'vpd', 'vs'
    ]

    # psutil is optional and only used for the progress report; resolve it
    # once here instead of re-attempting the import on every iteration.
    try:
        import psutil
    except ImportError:
        psutil = None

    successful_conversions = []
    failed_conversions = []

    print(f"Starting batch conversion for {len(weather_vars)} files from year {year}")
    print("=" * 60)

    for i, var in enumerate(weather_vars, 1):
        print(f"\n[{i}/{len(weather_vars)}] Processing variable: {var}")
        print("-" * 40)

        nc_name = f"Weather_data_{var}_{year}.nc"
        # Without data_dir the bare file name is used, exactly as before.
        nc_file = nc_name if data_dir is None else str(Path(data_dir) / nc_name)
        if not Path(nc_file).exists():
            print(f"File not found: {nc_file}")
            failed_conversions.append((var, "File not found"))
            continue

        try:
            parquet_file = convert_nc_to_parquet(nc_file, chunk_size=chunk_size)

            if parquet_file:
                print(f"Successfully converted {var}")
                successful_conversions.append(var)
            else:
                print(f"Failed to convert {var}")
                failed_conversions.append((var, "Conversion returned None"))

        except MemoryError:
            # Only reachable when convert_nc_to_parquet lets MemoryError
            # propagate instead of reporting the failure as None.
            print(f"Memory error for {var} - trying chunked approach...")
            try:
                parquet_file = convert_nc_to_parquet(nc_file, chunk_size=fallback_chunk_rows)
                if parquet_file:
                    print(f"Successfully converted {var} (chunked)")
                    successful_conversions.append(var)
                else:
                    print(f"Failed to convert {var} even with chunking")
                    failed_conversions.append((var, "Failed even with chunking"))
            except Exception as e:
                print(f"Error converting {var}: {str(e)}")
                failed_conversions.append((var, str(e)))

        except Exception as e:
            print(f"Error converting {var}: {str(e)}")
            failed_conversions.append((var, str(e)))

        # Reclaim memory between files before reporting usage.
        gc.collect()

        if psutil is not None:
            memory_percent = psutil.virtual_memory().percent
            print(f"Current memory usage: {memory_percent:.1f}%")

    print(f"Successfully converted: {len(successful_conversions)}/{len(weather_vars)} files")

    if successful_conversions:
        print(f"\nSuccessful conversions:")
        for var in successful_conversions:
            print(f"   - {var}")

    if failed_conversions:
        print(f"\nFailed conversions:")
        for var, error in failed_conversions:
            print(f"   - {var}: {error}")

    return successful_conversions, failed_conversions


if __name__ == "__main__":
    # Single source of truth for the year, so the verification step below
    # looks at the files that were actually just produced.
    target_year = 2015
    successful, failed = process_all_weather_files(year=target_year)

    if successful:
        print(f"\n" + "="*50)
        print("VERIFICATION - Loading sample Parquet file:")
        print("="*50)

        sample_var = successful[0]
        sample_path = Path(f"Weather_data_{sample_var}_{target_year}.parquet")
        if not sample_path.exists():
            # A chunked conversion writes <stem>_chunk_<k>.parquet files only;
            # fall back to the first chunk when the single file is absent.
            first_chunk = sample_path.with_name(
                f"{sample_path.stem}_chunk_0{sample_path.suffix}"
            )
            if first_chunk.exists():
                sample_path = first_chunk
        sample_file = str(sample_path)

        try:
            df_sample = pd.read_parquet(sample_file)
            print(f"Sample file ({sample_var}) shape: {df_sample.shape}")
            print(f"Columns: {list(df_sample.columns)}")
            print("\nFirst few rows:")
            print(df_sample.head())

            # Free the sample frame; the kernel keeps it alive otherwise.
            del df_sample
            gc.collect()

        except Exception as e:
            print(f"Error loading sample parquet file: {e}")

    print("\nBatch processing completed")

In [None]:
# You'll have to change this to your local folder containing the .parquet files
folder_path = r"C:\Users\zscho\OneDrive\Documents\Capstone\Weather"

# Strip the "Weather_data_" prefix from every matching Parquet file name.
if not os.path.isdir(folder_path):
    # Report a wrong/missing folder instead of crashing the cell.
    print(f"Folder not found: {folder_path}")
else:
    for filename in sorted(os.listdir(folder_path)):
        if filename.startswith("Weather_data_") and filename.endswith(".parquet"):
            new_name = filename.replace("Weather_data_", "", 1)
            old_path = os.path.join(folder_path, filename)
            new_path = os.path.join(folder_path, new_name)
            if os.path.exists(new_path):
                # os.rename overwrites on POSIX and raises on Windows when the
                # target exists; skip so no existing file is lost either way.
                print(f"Skipped (target already exists): {filename} → {new_name}")
                continue
            os.rename(old_path, new_path)
            print(f"Renamed: {filename} → {new_name}")
