In [None]:
# temporal validation
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

# Folder that is scanned for the selected year's parquet files.
folder_path = r"C:\Users\zscho\OneDrive\Documents\Capstone\Weather"
year = 2015


def days_in_year(year):
    """Return the number of days in a Gregorian calendar year.

    A year is a leap year when it is divisible by 4, except century years,
    which must also be divisible by 400 (so 2000 is leap, 1900 and 2100 are not).
    """
    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    return 366 if is_leap else 365


# Number of distinct days a complete file for `year` should contain.
expected_days = days_in_year(year)

# Parquet files for the selected year. os.listdir returns names in arbitrary
# order, so sort them to keep the order of every per-file report reproducible.
parquet_files = sorted(
    f for f in os.listdir(folder_path)
    if f.endswith('.parquet') and f"{year}.parquet" in f
)

In [None]:
# coverage and gaps
# For each file, compare the number of distinct 'day' values and the span
# between the first and last day against the expected number of days.
for file in parquet_files:
    df = pd.read_parquet(os.path.join(folder_path, file))
    if 'day' in df.columns:
        unique_days = df['day'].nunique()
        if unique_days != expected_days:
            print(f"{file}: Has {unique_days} days, expected {expected_days}")

        # Numeric day indices subtract to a number; datetime-like days subtract
        # to a timedelta, which cannot be added to an int, so use its whole-day
        # count instead.
        span = df['day'].max() - df['day'].min()
        day_range = getattr(span, 'days', span) + 1
        if day_range != expected_days:
            print(f"{file}: Day range spans {day_range}, expected {expected_days}")
    else:
        # Without this message a file lacking the column would look like it passed.
        print(f"{file}: No 'day' column; temporal coverage not checked")
    del df

In [None]:
# coordinate validation
# Maps file name -> {'lat_range': (min, max), 'lon_range': (min, max)}; each key
# is present only when a matching column was found in that file.
coord_ranges = {}
for file in parquet_files:
    df = pd.read_parquet(os.path.join(folder_path, file))

    # Columns are matched by substring; the first match of each kind is used.
    lat_col = [col for col in df.columns if 'lat' in col.lower()]
    lon_col = [col for col in df.columns if 'lon' in col.lower()]

    if lat_col:
        lat_min, lat_max = df[lat_col[0]].min(), df[lat_col[0]].max()
        if lat_min < -90 or lat_max > 90:
            print(f"{file}: Invalid latitude range {lat_min} to {lat_max}")
        coord_ranges.setdefault(file, {})['lat_range'] = (lat_min, lat_max)

    if lon_col:
        lon_min, lon_max = df[lon_col[0]].min(), df[lon_col[0]].max()
        if lon_min < -180 or lon_max > 180:
            print(f"{file}: Invalid longitude range {lon_min} to {lon_max}")
        # setdefault keeps the longitude range even when the file has no
        # latitude column (previously it was silently discarded in that case).
        coord_ranges.setdefault(file, {})['lon_range'] = (lon_min, lon_max)
    del df

In [None]:
# value validation
# Plausible (lower, upper) bounds for each weather variable; values outside
# these bounds are reported per file.
weather_limits = dict([
    ('tmmn', (-80, 60)),    # min temp celsius
    ('tmmx', (-60, 80)),    # max temp celsius
    ('pr', (0, 1000)),      # precipitation mm
    ('rmin', (0, 100)),     # min humidity %
    ('rmax', (0, 100)),     # max humidity %
    ('vs', (0, 200)),       # wind speed m/s
    ('srad', (0, 50)),      # solar radiation MJ/m2/day
    ('vpd', (0, 10000)),    # vapor pressure deficit Pa
])

for file in parquet_files:
    df = pd.read_parquet(os.path.join(folder_path, file))

    for var in weather_limits:
        if var not in df.columns:
            continue
        min_val, max_val = weather_limits[var]
        column = df[var]
        # Missing values compare False on both sides, so they are not counted.
        too_low = column < min_val
        too_high = column > max_val
        out_of_range = (too_low | too_high).sum()
        if out_of_range > 0:
            actual_min = column.min()
            actual_max = column.max()
            print(f"{file} - {var}: {out_of_range} values outside range ({actual_min:.2f} to {actual_max:.2f})")
    del df


In [None]:
# temp relationship validation
# Report rows where the daily maximum temperature does not exceed the minimum.
for file in parquet_files:
    df = pd.read_parquet(os.path.join(folder_path, file))

    has_both_temps = {'tmmn', 'tmmx'}.issubset(df.columns)
    if has_both_temps:
        min_not_below_max = df['tmmn'] >= df['tmmx']
        invalid_temp = min_not_below_max.sum()
        if invalid_temp > 0:
            print(f"{file}: {invalid_temp} records where max temp <= min temp")
    del df

In [None]:
# constant values (no variance)
# Report numeric columns whose non-null values are all identical.
for file in parquet_files:
    df = pd.read_parquet(os.path.join(folder_path, file))
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    for col in numeric_cols:
        non_null = df[col].dropna()
        # nunique() ignores nulls, so exactly one distinct value already implies
        # at least one non-null entry.
        if non_null.nunique() == 1:
            # Show the constant itself; the first row of the raw column may be null.
            print(f"{file} - {col}: All values are constant ({non_null.iloc[0]})")
    del df

In [None]:
# spikes in time series
# A "spike" is a day-to-day absolute change larger than 10x the mean absolute
# change for that column. When the file has coordinate columns, changes are
# computed within each location so that rows from different grid points that
# share a day are never differenced against each other.
for file in parquet_files:
    df = pd.read_parquet(os.path.join(folder_path, file))
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    if 'day' in df.columns:
        # Same substring matching (first match of each kind) as the coordinate
        # validation cell; dict.fromkeys drops a column that matches both.
        lat_matches = [col for col in df.columns if 'lat' in col.lower()]
        lon_matches = [col for col in df.columns if 'lon' in col.lower()]
        coord_cols = list(dict.fromkeys(lat_matches[:1] + lon_matches[:1]))

        # Stable sort so rows sharing a day keep a reproducible relative order.
        df_sorted = df.sort_values('day', kind='mergesort')

        for col in numeric_cols:
            # Coordinates are grouping keys, not time series to be scanned.
            if col != 'day' and col not in coord_cols:
                values = df_sorted[coord_cols + [col]].dropna(subset=[col])
                if len(values) > 1:
                    if coord_cols:
                        diff = values.groupby(coord_cols, sort=False)[col].diff().abs()
                    else:
                        diff = values[col].diff().abs()
                    mean_diff = diff.mean()
                    spike_threshold = mean_diff * 10
                    spikes = (diff > spike_threshold).sum()
                    if spikes > 0:
                        print(f"{file} - {col}: {spikes} potential spikes detected")
    else:
        # Without this message a file lacking the column would look like it passed.
        print(f"{file}: No 'day' column; spike check skipped")
    del df

In [None]:
# cross-file coordinate consistency
# Every file is expected to report the same (min, max) latitude and longitude;
# when more than one distinct range exists, list the range of each file.
if coord_ranges:
    lat_ranges = [entry['lat_range'] for entry in coord_ranges.values() if 'lat_range' in entry]
    lon_ranges = [entry['lon_range'] for entry in coord_ranges.values() if 'lon_range' in entry]

    checks = (
        ('lat_range', lat_ranges, "Inconsistent latitude ranges across files:"),
        ('lon_range', lon_ranges, "Inconsistent longitude ranges across files:"),
    )
    for range_key, observed, header in checks:
        if not observed or len(set(observed)) <= 1:
            continue
        print(header)
        for file, ranges in coord_ranges.items():
            if range_key in ranges:
                print(f"  {file}: {ranges[range_key]}")

In [None]:
# file size check
# Flag files whose size is far from the median using the modified z-score,
# 0.6745 * |size - median| / MAD > 3.5. A 3-sigma test on the mean and the
# population standard deviation cannot fire with 10 or fewer files, because no
# value can lie more than sqrt(n - 1) standard deviations from the mean.
file_sizes = {}
for file in parquet_files:
    file_path = os.path.join(folder_path, file)
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    file_sizes[file] = size_mb

sizes = np.array(list(file_sizes.values()), dtype=float)
if sizes.size == 0:
    # Avoid the NaN result and RuntimeWarning from averaging an empty list.
    mean_size = std_size = float('nan')
    print("No parquet files found; file size check skipped")
else:
    mean_size = sizes.mean()
    std_size = sizes.std()
    median_size = np.median(sizes)
    # Median absolute deviation: a spread estimate that one outlier cannot inflate.
    mad_size = np.median(np.abs(sizes - median_size))

    for file, size in file_sizes.items():
        deviation = abs(size - median_size)
        if mad_size > 0:
            is_unusual = 0.6745 * deviation / mad_size > 3.5
        else:
            # More than half of the files share one exact size; any other size stands out.
            is_unusual = deviation > 0
        if is_unusual:
            print(f"{file}: Unusual file size {size:.1f}MB (median: {median_size:.1f}MB)")

In [None]:
# record count consistency
# Every file is expected to hold the same number of rows; list the row count of
# each file when they disagree.
record_counts = {}
for file in parquet_files:
    df = pd.read_parquet(os.path.join(folder_path, file))
    record_counts[file] = df.shape[0]
    del df

unique_counts = set(record_counts.values())
counts_disagree = len(unique_counts) >= 2
if counts_disagree:
    print("Inconsistent record counts across files:")
    for file in record_counts:
        count = record_counts[file]
        print(f"  {file}: {count:,} records")