# Weather Features Data Exploration

This notebook explores the `weather_features.parquet` dataset to understand its structure, contents, and basic statistics. We'll analyze sample data, check for missing values, and visualize key distributions.

In [14]:
import pyarrow.parquet as pq
import pandas as pd
from pathlib import Path


def show_full_sample(file_path, sample_rows=5, random_sample=False):
    """
    Display sample data with ALL columns without full file load.
    
    Args:
        file_path (str): Path to Parquet file
        sample_rows (int): Number of rows to display
        random_sample (bool): If True, takes random rows (slower)
        
    Returns:
        pandas.DataFrame: The sample data
    """
    # Open file and read metadata only
    parquet_file = pq.ParquetFile(file_path)
    total_rows = parquet_file.metadata.num_rows
    columns = parquet_file.schema.names
    
    print(f"=== Parquet File ===")
    print(f"Path: {file_path}")
    print(f"Total rows: {total_rows:,}")
    print(f"Columns ({len(columns)}): {columns}")
    
    if random_sample:
        # Random sampling (more representative but slightly slower)
        import random
        row_indices = random.sample(range(total_rows), min(sample_rows, total_rows))
        sample = pq.read_table(
            file_path,
            filters=[('index', 'in', row_indices)]
        ).to_pandas()
    else:
        # Fast sequential sampling (first N rows)
        sample = parquet_file.read_row_group(0).to_pandas().head(sample_rows)
    
    print(f"\nSample data ({'random' if random_sample else 'first'} {len(sample)} rows):")
    pd.set_option('display.max_columns', None)  # Show all columns

    
    return sample  # Return the sample dataframe



In [15]:
root = project_root = Path().resolve().parents[0]
path_file = root / "data/interim/weather_features.parquet"
print(path_file)
data = show_full_sample(
        file_path=path_file,
        sample_rows=5,
        random_sample=False  # Set True for random sampling
)

display(data)




D:\FCAI\data_science\project\wildfire_prediction\wildfire_prediction\data\interim\weather_features.parquet
=== Parquet File ===
Path: D:\FCAI\data_science\project\wildfire_prediction\wildfire_prediction\data\interim\weather_features.parquet
Total rows: 103,304,160
Columns (34): ['longitude', 'latitude', 'ppt', 'tmax', 'vbdmax', 'date', 'year', 'month', 'day_of_year', 'grid_id', 'week', 'hot_dry_index', 'high_temp_day', 'low_rain_day', 'hot_dry_day', 'spi_7day', 'drought_category', 'vpd_extreme', 'vpd_anomaly', 'vpd_risk_category', 'is_fire_season', 'is_santa_ana_season', 'season', 'week_sin', 'week_cos', 'month_sin', 'month_cos', 'tmax_7day_mean', 'ppt_7day_mean', 'vbd_7day_mean', 'fire_weather_index', 'drought_weather_index', 'fire_risk_index', 'date_bin']

Sample data (first 5 rows):


Unnamed: 0,longitude,latitude,ppt,tmax,vbdmax,date,year,month,day_of_year,grid_id,week,hot_dry_index,high_temp_day,low_rain_day,hot_dry_day,spi_7day,drought_category,vpd_extreme,vpd_anomaly,vpd_risk_category,is_fire_season,is_santa_ana_season,season,week_sin,week_cos,month_sin,month_cos,tmax_7day_mean,ppt_7day_mean,vbd_7day_mean,fire_weather_index,drought_weather_index,fire_risk_index,date_bin
0,-124.166667,41.958333,4.73,9.725,1.458,2013-01-08,2013,1,8,10T_403_4645,2013-01-07/2013-01-13,1.564292,0,0,0,0.0,1,0,-1.261411,1,0,1,0,0.120537,0.992709,0.5,0.866025,,,,2.277418,1.766451,1.843096,2013-01-08_2013-01-14
1,-123.291667,41.958333,0.0,4.278,1.337,2013-01-08,2013,1,8,10T_475_4645,2013-01-07/2013-01-13,1.528848,0,1,0,1.130235,0,0,-1.287608,1,0,1,0,0.120537,0.992709,0.5,0.866025,,,,2.218637,1.031182,1.464301,2013-01-08_2013-01-14
2,-123.333333,41.958333,0.0,5.405,1.973,2013-01-08,2013,1,8,10T_472_4645,2013-01-07/2013-01-13,1.552425,0,1,0,1.434517,0,0,-1.149912,1,0,1,0,0.120537,0.992709,0.5,0.866025,6.469333,1.576667,1.589333,2.257737,1.054642,1.490106,2013-01-08_2013-01-14
3,-118.291667,36.958333,0.0,9.133,8.773,2013-01-08,2013,1,8,11S_385_4091,2013-01-07/2013-01-13,1.630413,0,1,0,1.684421,0,0,0.322312,4,0,1,0,0.120537,0.992709,0.5,0.866025,7.13525,1.1825,3.38525,3.887073,1.132244,2.565468,2013-01-08_2013-01-14
4,-124.083333,41.958333,5.099,9.407,1.187,2013-01-08,2013,1,8,10T_410_4645,2013-01-07/2013-01-13,1.551516,0,0,0,1.744296,0,0,-1.320083,1,0,1,0,0.120537,0.992709,0.5,0.866025,7.5896,1.9658,2.9456,2.256229,1.053738,1.489111,2013-01-08_2013-01-14


## Metadata and Statistics Analysis

Below we extract more detailed information from the parquet file metadata and calculate key statistics for our weather features dataset. This helps us understand the data structure, storage efficiency, and distribution of values.

In [16]:
def analyze_parquet_metadata(file_path):
    """Analyze and display detailed metadata statistics from a parquet file"""
    # Open the parquet file
    parquet_file = pq.ParquetFile(file_path)
    metadata = parquet_file.metadata
    schema = parquet_file.schema
    
    # Basic file information
    print(f"=== Parquet File Metadata Analysis ===")
    print(f"File size: {Path(file_path).stat().st_size / (1024*1024):.2f} MB")
    print(f"Number of row groups: {metadata.num_row_groups}")
    print(f"Total rows: {metadata.num_rows:,}")
    print(f"Created by: {metadata.created_by if metadata.created_by else 'Unknown'}")
    
    # Row group details
    print(f"\n=== Row Group Information ===")
    for i in range(metadata.num_row_groups):
        row_group = metadata.row_group(i)
        print(f"Row Group #{i}: {row_group.num_rows:,} rows, "
              f"{row_group.total_byte_size / (1024*1024):.2f} MB compressed")
    
    # Column details
    print(f"\n=== Column Details ===")
    all_cols = []
    for i in range(metadata.num_row_groups):
        row_group = metadata.row_group(i)
        for j in range(row_group.num_columns):
            col = row_group.column(j)
            all_cols.append({
                'name': col.path_in_schema,
                'type': schema.field_by_name(col.path_in_schema).type,
                'compression': col.compression,
                'size_mb': col.total_compressed_size / (1024*1024),
                'encodings': col.encodings
            })
    
    # Convert to dataframe for easy display
    cols_df = pd.DataFrame(all_cols)
    cols_df = cols_df.drop_duplicates(subset='name')
    
    # Display as styled table
    from IPython.display import display, HTML
    style = """
    <style>
        .stats-table {width: 100%; border-collapse: collapse;}
        .stats-table th {background-color: #e6f2ff; color: black; text-align: left; padding: 8px; border: 1px solid #ddd;}
        .stats-table td {padding: 8px; border: 1px solid #ddd;}
        .stats-table tr:nth-child(even) {background-color: #f9f9f9;}
        .stats-table tr:hover {background-color: #f2f2f2;}
    </style>
    """
    display(HTML(style + cols_df.to_html(classes='stats-table')))
    
    return cols_df

# Call the function with our parquet file
metadata_stats = analyze_parquet_metadata(path_file)

=== Parquet File Metadata Analysis ===
File size: 6345.03 MB
Number of row groups: 618
Total rows: 103,304,160
Created by: parquet-cpp-arrow version 20.0.0

=== Row Group Information ===
Row Group #0: 167,391 rows, 10.05 MB compressed
Row Group #1: 167,391 rows, 7.30 MB compressed
Row Group #2: 167,391 rows, 7.46 MB compressed
Row Group #3: 167,391 rows, 7.59 MB compressed
Row Group #4: 167,391 rows, 9.07 MB compressed
Row Group #5: 167,391 rows, 10.29 MB compressed
Row Group #6: 167,391 rows, 12.44 MB compressed
Row Group #7: 167,391 rows, 10.26 MB compressed
Row Group #8: 167,391 rows, 8.80 MB compressed
Row Group #9: 167,391 rows, 7.77 MB compressed
Row Group #10: 167,391 rows, 9.16 MB compressed
Row Group #11: 167,391 rows, 11.48 MB compressed
Row Group #12: 167,391 rows, 10.71 MB compressed
Row Group #13: 167,391 rows, 11.24 MB compressed
Row Group #14: 167,391 rows, 8.92 MB compressed
Row Group #15: 167,391 rows, 8.99 MB compressed
Row Group #16: 167,391 rows, 7.96 MB compressed


AttributeError: 'pyarrow._parquet.ParquetSchema' object has no attribute 'field_by_name'

In [None]:
# Calculate and display basic statistics for numerical columns
def display_basic_statistics(data_sample):
    """Calculate and display basic statistics for the dataset"""
    # Read more data for better statistical analysis (still using efficient loading)
    parquet_file = pq.ParquetFile(path_file)
    # Try to get a larger sample for statistics (not too large to avoid memory issues)
    stats_sample_size = min(10000, parquet_file.metadata.num_rows)
    stats_sample = parquet_file.read_row_group(0).to_pandas().head(stats_sample_size)
    
    # Get numerical columns only
    numeric_cols = stats_sample.select_dtypes(include=['number']).columns.tolist()
    
    if not numeric_cols:
        print("No numerical columns found in the dataset.")
        return
    
    # Calculate statistics
    print(f"=== Basic Statistics (based on {stats_sample_size:,} rows) ===")
    stats_df = stats_sample[numeric_cols].describe().T
    
    # Add more metrics
    stats_df['missing'] = stats_sample[numeric_cols].isna().sum()
    stats_df['missing_pct'] = (stats_sample[numeric_cols].isna().sum() / len(stats_sample)) * 100
    
    # Round for better display
    stats_df = stats_df.round(2)
    
    # Display as styled table
    from IPython.display import display, HTML
    style = """
    <style>
        .stats-desc-table {width: 100%; border-collapse: collapse;}
        .stats-desc-table th {background-color: #e6ffe6; color: black; text-align: left; padding: 8px; border: 1px solid #ddd;}
        .stats-desc-table td {padding: 8px; border: 1px solid #ddd;}
        .stats-desc-table tr:nth-child(even) {background-color: #f9f9f9;}
        .stats-desc-table tr:hover {background-color: #f2f2f2;}
    </style>
    """
    display(HTML(style + stats_df.to_html(classes='stats-desc-table')))
    
    return stats_df

# Call the function with our data sample
stats_df = display_basic_statistics(data)

## Data Visualizations

Below are some visualizations to help understand the distribution of key variables in the weather features dataset.

In [None]:
# Visualize distributions of key variables
import matplotlib.pyplot as plt
import seaborn as sns

# Try to get a sample size that works for visualization but isn't too large
def plot_key_distributions(file_path, sample_size=5000):
    """Create distribution plots for key numerical variables"""
    # Get a reasonable sample for visualization
    parquet_file = pq.ParquetFile(file_path)
    sample_size = min(sample_size, parquet_file.metadata.num_rows)
    vis_sample = parquet_file.read_row_group(0).to_pandas().head(sample_size)
    
    # Get numerical columns only
    numeric_cols = vis_sample.select_dtypes(include=['number']).columns.tolist()
    
    # Select a subset of interesting columns (up to 6) to visualize
    if len(numeric_cols) > 6:
        # Prioritize columns with names related to temperature, precipitation, wind, etc.
        priority_keywords = ['temp', 'prcp', 'wind', 'humid', 'fire', 'pressure']
        priority_cols = []
        for keyword in priority_keywords:
            priority_cols.extend([c for c in numeric_cols if keyword.lower() in c.lower()])
        
        # If we found priority columns, use them, otherwise use first 6
        cols_to_plot = priority_cols[:6] if priority_cols else numeric_cols[:6]
    else:
        cols_to_plot = numeric_cols
    
    # Create distribution plots
    fig, axes = plt.subplots(len(cols_to_plot), 1, figsize=(10, 3*len(cols_to_plot)))
    if len(cols_to_plot) == 1:
        axes = [axes]  # Make sure axes is always iterable
    
    for i, col in enumerate(cols_to_plot):
        sns.histplot(vis_sample[col].dropna(), ax=axes[i], kde=True)
        axes[i].set_title(f'Distribution of {col}')
        axes[i].grid(True, linestyle='--', alpha=0.7)
    
    plt.tight_layout()
    plt.show()

# Call the visualization function
plot_key_distributions(path_file)