In [1]:
import pandas as pd
# Read the weather CSV into `df`; the path is relative, so the file must sit in
# the notebook's working directory.
df = pd.read_csv("weatherstats_vancouver_hourly.csv")

In [2]:
# Discard the columns this notebook does not use.
# `df` is rebound instead of mutated with `inplace=True`, and `errors="ignore"`
# skips columns that are already gone, so re-running this cell no longer fails
# with a KeyError after the first execution.
unused_columns = [
    "wind_dir", "wind_gust", "windchill", "cloud_cover_4",
    "cloud_cover_10", "solar_radiation", "health_index", "humidex",
]
df = df.drop(columns=unused_columns, errors="ignore")

# Per-column count of missing values in the remaining columns (cell output).
df.isnull().sum()

date_time_local        0
unixtime               0
pressure_station       0
pressure_sea           0
wind_dir_10s           8
wind_speed             0
relative_humidity      0
dew_point              0
temperature            0
visibility             3
cloud_cover_8          3
max_air_temp_pst1hr    0
min_air_temp_pst1hr    0
dtype: int64

# Missing Values Analysis and Treatment

Let's analyze the pattern of missing values and apply a filling strategy suited to each weather metric.

In [3]:
# Missing-value count per column, and the same counts as a percentage of all rows
missing_stats = df.isnull().sum()
missing_pct = missing_stats.div(len(df)).mul(100)

print("Missing Values Analysis:")
print("-" * 40)
for col in ('wind_dir_10s', 'visibility', 'cloud_cover_8'):
    if col not in missing_stats:
        continue  # column is not in the frame, so there is nothing to report
    print(f"{col}: {missing_stats[col]} missing ({missing_pct[col]:.1f}%)")

# Do the gaps coincide? Count the rows where all three columns are empty at once
print("\nPattern Analysis:")
all_three_missing = df[['wind_dir_10s', 'visibility', 'cloud_cover_8']].isna().all(axis=1)
print("Rows with all three metrics missing:", int(all_three_missing.sum()))
print("Total rows:", len(df))

Missing Values Analysis:
----------------------------------------
wind_dir_10s: 8 missing (2.1%)
visibility: 3 missing (0.8%)
cloud_cover_8: 3 missing (0.8%)

Pattern Analysis:
Rows with all three metrics missing: 0
Total rows: 387


In [4]:
import numpy as np

def fill_with_week_ahead_data(df, col='relative_humidity', limit=7):
    """Forward-fill gaps in ``col`` from the nearest valid row above.

    Despite the function name, no value is fetched from a week away and
    nothing is interpolated: each NaN takes the last valid value that
    precedes it in row order. Whether that value is an earlier or a later
    observation depends on how the frame is sorted.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is returned.
    col : str, default 'relative_humidity'
        Name of the column to fill.
    limit : int or None, default 7
        Maximum number of consecutive NaN rows filled after a valid value.
        Rows beyond the cap, and NaNs at the very top of the column, stay
        NaN. ``None`` removes the cap.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``col`` forward-filled.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """
    df = df.copy()

    # Propagate the last valid value downwards, at most `limit` rows per gap
    df[col] = df[col].ffill(limit=limit)

    return df

def fill_wind_direction_circular(df, col='wind_dir_10s'):
    """Replace missing wind directions with the circular mean of the column.

    Every NaN in ``col`` receives the same value: the circular mean (in
    degrees, shifted to be non-negative) of all non-missing entries. The
    entries are converted with ``np.radians``, i.e. treated as degrees.
    The input frame is left untouched; a filled copy is returned. If the
    column holds no valid value at all, the copy is returned unchanged.
    """
    result = df.copy()

    # Work in radians and keep only the observed directions
    angles = np.radians(result[col])
    observed = angles[angles.notna()]
    if len(observed) == 0:
        return result

    # Mean direction = angle of the averaged unit vectors
    mean_sin = np.nanmean(np.sin(observed))
    mean_cos = np.nanmean(np.cos(observed))
    mean_deg = np.degrees(np.arctan2(mean_sin, mean_cos))

    # arctan2 yields a signed angle; shift negative results up by a full turn
    fill_value = mean_deg + 360 if mean_deg < 0 else mean_deg

    result[col] = result[col].fillna(fill_value)
    return result

def fill_visibility_interpolation(df, col='visibility'):
    """Fill gaps in ``col`` by linear interpolation, then pad the edges.

    Interior gaps are filled linearly between their neighbouring rows.
    Whatever is still missing afterwards (for example NaNs before the first
    valid value) is closed with a forward fill followed by a backward fill.
    The input frame is not modified; a filled copy is returned.
    """
    result = df.copy()

    # Linear interpolation first, then ffill/bfill for anything left over
    interpolated = result[col].interpolate(method='linear')
    result[col] = interpolated.ffill().bfill()

    return result

def fill_cloud_cover_mode(df, col='cloud_cover_8'):
    """Fill missing entries of ``col`` with its most frequent value.

    When several values tie for most frequent, the first entry returned by
    ``Series.mode()`` is used. If the column has no valid value, there is
    no mode and the column is left as is. The input frame is not modified;
    a copy is returned.
    """
    result = df.copy()

    candidates = result[col].mode()
    if candidates.empty:
        # Nothing to derive a fill value from
        return result

    result[col] = result[col].fillna(candidates.iloc[0])
    return result

def calc_humidex(row):
    """Compute the humidex for one row from its temperature and dew point.

    ``row`` is indexed with the keys ``'temperature'`` and ``'dew_point'``.
    The dew point is shifted by 273.15 inside the formula, i.e. it is
    treated as degrees Celsius (temperature presumably too; confirm against
    the data source). Returns ``None`` when either input is missing,
    otherwise

        T + 0.5555 * (6.11 * exp(5417.7530 * (1/273.15 - 1/(DP + 273.15))) - 10)

    rounded to two decimals.
    """
    import math as m

    # Bail out on missing inputs, checking temperature before dew point
    if pd.isna(row['temperature']):
        return None
    if pd.isna(row['dew_point']):
        return None

    air_temp = row['temperature']
    dew = row['dew_point']

    # Exponential term built from the dew point (shifted to kelvin)
    exponent = 5417.7530 * (1/273.15 - 1/(dew + 273.15))
    vapour_term = 6.11 * m.exp(exponent)

    return round(air_temp + (0.5555 * (vapour_term - 10)), 2)

In [5]:
# Columns whose gap counts are reported before and after the fill
report_columns = ['wind_dir_10s', 'visibility', 'cloud_cover_8', 'relative_humidity', 'dew_point']

print("BEFORE filling:")
for name in report_columns:
    print(f"{name} missing:", df[name].isnull().sum())

# Run the fills as one pipeline on a copy of `df`:
#   1. wind direction      -> circular statistics
#   2. visibility          -> interpolation
#   3. cloud cover         -> mode
#   4. relative humidity and dew point -> fill_with_week_ahead_data
df_filled = (
    df.copy()
    .pipe(fill_wind_direction_circular, 'wind_dir_10s')
    .pipe(fill_visibility_interpolation, 'visibility')
    .pipe(fill_cloud_cover_mode, 'cloud_cover_8')
    .pipe(fill_with_week_ahead_data, 'relative_humidity')
    .pipe(fill_with_week_ahead_data, 'dew_point')
)

# 5. Derive the humidex row by row from the filled temperature / dew point
df_filled['humidex_v'] = df_filled.apply(calc_humidex, axis=1)

print("\nAFTER filling:")
for name in report_columns:
    print(f"{name} missing:", df_filled[name].isnull().sum())

BEFORE filling:
wind_dir_10s missing: 8
visibility missing: 3
cloud_cover_8 missing: 3
relative_humidity missing: 0
dew_point missing: 0

AFTER filling:
wind_dir_10s missing: 0
visibility missing: 0
cloud_cover_8 missing: 0
relative_humidity missing: 0
dew_point missing: 0


In [6]:
# Show some examples of filled values
print("\nExamples of filled values:")
print("=" * 50)

# Show wind direction examples
wind_filled_mask = df['wind_dir_10s'].isnull() & df_filled['wind_dir_10s'].notnull()
if wind_filled_mask.sum() > 0:
    print("Wind Direction (wind_dir_10s) - filled with circular mean:")
    examples = df_filled[wind_filled_mask][['date_time_local', 'wind_dir_10s']].head(3)
    for _, row in examples.iterrows():
        print(f"  {row['date_time_local']}: {row['wind_dir_10s']:.1f}°")

# Show visibility examples  
vis_filled_mask = df['visibility'].isnull() & df_filled['visibility'].notnull()
if vis_filled_mask.sum() > 0:
    print("\nVisibility - filled with interpolation:")
    examples = df_filled[vis_filled_mask][['date_time_local', 'visibility']].head(3)
    for _, row in examples.iterrows():
        print(f"  {row['date_time_local']}: {row['visibility']:.0f}m")

# Show cloud cover examples
cloud_filled_mask = df['cloud_cover_8'].isnull() & df_filled['cloud_cover_8'].notnull()
if cloud_filled_mask.sum() > 0:
    print("\nCloud Cover (cloud_cover_8) - filled with mode:")
    examples = df_filled[cloud_filled_mask][['date_time_local', 'cloud_cover_8']].head(3)
    for _, row in examples.iterrows():
        print(f"  {row['date_time_local']}: {row['cloud_cover_8']}")

# Update the main dataframe
df = df_filled.copy()
print(f"\n✓ Updated df with filled values. Total missing values now: {df.isnull().sum().sum()}")


Examples of filled values:
Wind Direction (wind_dir_10s) - filled with circular mean:
  2025-09-05 20:00:00 PDT: 16.6°
  2025-09-05 06:00:00 PDT: 16.6°
  2025-09-05 05:00:00 PDT: 16.6°

Visibility - filled with interpolation:
  2025-09-17 11:00:00 PDT: 44250m
  2025-09-10 13:00:00 PDT: 48300m
  2025-09-08 13:00:00 PDT: 48300m

Cloud Cover (cloud_cover_8) - filled with mode:
  2025-09-17 11:00:00 PDT: 8.0
  2025-09-10 13:00:00 PDT: 8.0
  2025-09-08 13:00:00 PDT: 8.0

✓ Updated df with filled values. Total missing values now: 0


In [7]:
# Write the current `df` (the filled frame assigned in the previous cell) to a
# new CSV; index=False keeps the row index out of the file.
df.to_csv("weatherstats_vancouver_hourly_filled.csv", index=False)

## Filling Strategy Explanation

### 1. **Wind Direction (`wind_dir_10s`)** - Circular Mean
- **Why**: Wind direction is circular (0° = 360°)
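- **Method**: Average the sine and cosine of every observed direction, take `arctan2` of the two means to get the circular mean, and fill every gap with that single value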
- **Advantage**: Properly handles the circular nature (e.g., average of 350° and 10° = 0°, not 180°)

### 2. **Visibility** - Linear Interpolation
- **Why**: Visibility changes gradually over time
- **Method**: Linear interpolation between known values, with forward/backward fill for edges
- **Advantage**: Maintains realistic temporal trends

### 3. **Cloud Cover (`cloud_cover_8`)** - Mode (Most Frequent)
- **Why**: Cloud cover is categorical (0-8 oktas scale)
- **Method**: Fill with the most frequently occurring value
- **Advantage**: Preserves the discrete nature and typical weather patterns

### Alternative Methods (if needed):
- **Seasonal/Time-based**: Use historical patterns for the same time of day/season
- **Weather-state correlation**: Use other weather variables to predict missing values
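- **Forward/Backward fill**: Simple temporal propagation for short gaps (this notebook already applies it to `relative_humidity` and `dew_point`, capped at 7 consecutive rows, and to the edges of `visibility`)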