# Weather Master File Testing

This notebook contains functions to test and explore the weather master parquet file in our wildfire prediction project, focusing on efficient methods that avoid loading the entire file at once.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
import geopandas as gpd
import pyarrow
import pyarrow.parquet as pq
import dask.dataframe as dd

# Set up paths
project_root = Path().resolve().parents[0]
data_path = project_root / 'data' / 'interim'
print(f"Project root: {project_root}")

Project root: D:\FCAI\data_science\project\wildfire_prediction\wildfire_prediction


In [2]:
def inspect_parquet_metadata():
    """
    Inspect the metadata of the weather master parquet file without loading the entire dataset
    """
    file_path = data_path / 'weather_master.parquet'
    print(f"Examining metadata for: {file_path}")
    
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Weather master file not found at {file_path}")
    
    # Read metadata using pyarrow
    parquet_file = pq.ParquetFile(file_path)
    metadata = parquet_file.metadata
    
    # Get basic file info
    num_rows = metadata.num_rows
    num_columns = len(metadata.schema.names)
    num_row_groups = metadata.num_row_groups
    file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
    
    print(f"File size: {file_size_mb:.2f} MB")
    print(f"Number of rows: {num_rows:,}")
    print(f"Number of columns: {num_columns}")
    print(f"Number of row groups: {num_row_groups}")
    print(f"Avg rows per group: {num_rows/num_row_groups:,.0f}")
    
    # Show schema (column names and types)
    print("\nSchema:")
    for i, column in enumerate(metadata.schema):
        print(f"{i}: {column.name} ({column.physical_type})")
        
    # Display detailed stats for each row group
    print("\nRow group statistics:")
    for i in range(min(10, num_row_groups)):  # Show stats for first 3 row groups
        print(f"\nRow Group {i}:")
        for j in range(num_columns):
            col_stats = parquet_file.metadata.row_group(i).column(j).statistics
            if col_stats is not None and col_stats.has_min_max:
                print(f"  {metadata.schema.names[j]}: min={col_stats.min}, max={col_stats.max}")
                
    return metadata

In [3]:
# Inspect parquet metadata
try:
    metadata = inspect_parquet_metadata()
except Exception as e:
    print(f"Error inspecting parquet metadata: {e}")

Examining metadata for: D:\FCAI\data_science\project\wildfire_prediction\wildfire_prediction\data\interim\weather_master.parquet
File size: 1028.18 MB
Number of rows: 103,304,160
Number of columns: 10
Number of row groups: 213
Avg rows per group: 484,996

Schema:
0: longitude (DOUBLE)
1: latitude (DOUBLE)
2: ppt (DOUBLE)
3: tmax (DOUBLE)
4: vbdmax (DOUBLE)
5: date (INT64)
6: year (INT32)
7: month (INT32)
8: day_of_year (INT32)
9: grid_id (BYTE_ARRAY)

Row group statistics:

Row Group 0:
  longitude: min=-124.374999999995, max=-114.124999999913
  latitude: min=32.541666666528, max=41.95833333327
  ppt: min=-0.0, max=60.251
  tmax: min=-21.002, max=28.311
  vbdmax: min=-0.0, max=36.821
  date: min=2013-01-01 00:00:00, max=2013-01-21 00:00:00
  year: min=2013, max=2013
  month: min=1, max=1
  day_of_year: min=1, max=21
  grid_id: min=10S_414_4428, max=11S_764_3798

Row Group 1:
  longitude: min=-124.374999999995, max=-114.124999999913
  latitude: min=32.541666666528, max=41.95833333327
  

In [6]:
def load_sample_data(sample_size=10000, random=False):
    """
    Load a sample of the weather master file rather than the full dataset
    
    Parameters:
    -----------
    sample_size : int, optional
        Number of rows to sample
    random : bool, optional
        Whether to take a random sample or the first n rows
        
    Returns:
    --------
    pandas.DataFrame : Sample of weather master data
    """
    file_path = data_path / 'weather_master.parquet'
    print(f"Loading sample from: {file_path}")
    
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Weather master file not found at {file_path}")
    
    if random:
        # Load with Dask and take a random sample
        ddf = dd.read_parquet(file_path)
        # Get total row count to calculate fraction
        total_rows = len(ddf)
        fraction = min(1.0, sample_size / total_rows)
        sample_df = ddf.sample(frac=fraction).compute()
        print(f"Randomly sampled {len(sample_df):,} rows from {total_rows:,} total rows")
    else:
        # Read just the first n rows
        parquet_file = pq.ParquetFile(file_path)  
        sample_df = parquet_file.read_row_group(55).to_pandas().head(sample_size)  
        print(f"Loaded first {len(sample_df):,} rows")
    
    return sample_df

In [7]:
# Load a sample of data for testing
try:
    sample_df = load_sample_data(sample_size=5000)
    display(sample_df.head())
except Exception as e:
    print(f"Error loading sample data: {e}")

Loading sample from: D:\FCAI\data_science\project\wildfire_prediction\wildfire_prediction\data\interim\weather_master.parquet
Loaded first 5,000 rows


Unnamed: 0,longitude,latitude,ppt,tmax,vbdmax,date,year,month,day_of_year,grid_id
0,-115.791667,33.875,0.0,19.072,13.275,2016-01-21,2016,1,21,11S_611_3748
1,-115.75,33.875,0.0,19.135,13.415,2016-01-21,2016,1,21,11S_615_3748
2,-115.708333,33.875,0.0,19.35,13.384,2016-01-21,2016,1,21,11S_619_3749
3,-115.666667,33.875,0.0,19.244,13.173,2016-01-21,2016,1,21,11S_623_3749
4,-115.625,33.875,0.0,19.118,13.128,2016-01-21,2016,1,21,11S_627_3749
