In [None]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Directory holding the weather parquet files. Set the WEATHER_DATA_DIR
# environment variable to point at another location; when it is unset the
# original local path is used, so existing runs behave as before.
folder_path = os.environ.get(
    "WEATHER_DATA_DIR",
    r"C:\Users\zscho\OneDrive\Documents\Capstone\Weather",
)
# Year used to select which parquet files are inspected.
year = 2015

In [None]:
# Parquet files in folder_path whose name ends with ".parquet" and contains
# "<year>.parquet". os.listdir returns names in arbitrary order, so the
# result is sorted to keep every per-file report reproducible between runs.
parquet_files = sorted(
    f for f in os.listdir(folder_path)
    if f.endswith('.parquet') and f"{year}.parquet" in f
)
print(f"Found {len(parquet_files)} files for {year}")


In [None]:
# Shape overview: print the row and column count of every selected file.
for fname in parquet_files:
    frame = pd.read_parquet(os.path.join(folder_path, fname))
    n_rows, n_cols = frame.shape
    print(f"{fname}: {n_rows} rows, {n_cols} columns")
    del frame  # drop the reference before the next file is loaded


In [None]:
# Missing values: per file, count the null cells and their share of all cells.
missing_summary = {}
for file in parquet_files:
    df = pd.read_parquet(os.path.join(folder_path, file))
    missing_count = int(df.isnull().sum().sum())
    # df.size is rows * columns. A frame with no rows or no columns has no
    # cells, so report 0% instead of dividing by zero.
    missing_pct = (missing_count / df.size) * 100 if df.size else 0.0
    missing_summary[file] = {'count': missing_count, 'percent': missing_pct}
    del df

# Only files that actually contain missing values are reported.
for file, stats in missing_summary.items():
    if stats['count'] > 0:
        print(f"{file}: {stats['count']} missing ({stats['percent']:.2f}%)")

In [None]:
# Infinite values (yes we did find one): report every file whose numeric
# columns contain at least one infinite entry.
for fname in parquet_files:
    frame = pd.read_parquet(os.path.join(folder_path, fname))
    numeric_names = frame.select_dtypes(include=[np.number]).columns
    # Per-column counts first, then the total over all numeric columns.
    n_inf = np.isinf(frame[numeric_names]).sum().sum()
    del frame
    if n_inf > 0:
        print(f"{fname}: {n_inf} infinite values")

In [None]:
# Duplicates: report every file in which DataFrame.duplicated() flags rows.
for fname in parquet_files:
    frame = pd.read_parquet(os.path.join(folder_path, fname))
    n_dupes = frame.duplicated().sum()
    del frame
    if n_dupes > 0:
        print(f"{fname}: {n_dupes} duplicate rows")

In [None]:
# Outliers: for every numeric column, count the values lying more than
# 3 * IQR below the first quartile or above the third quartile, and print
# that count together with the column's observed min/max.
for fname in parquet_files:
    frame = pd.read_parquet(os.path.join(folder_path, fname))

    for column in frame.select_dtypes(include=[np.number]).columns:
        observed = frame[column].dropna()
        if len(observed) == 0:
            continue  # no non-null values to assess in this column

        lowest, highest = observed.min(), observed.max()
        lower_q, upper_q = observed.quantile([0.25, 0.75])
        spread = upper_q - lower_q
        fence_low = lower_q - 3 * spread
        fence_high = upper_q + 3 * spread

        is_extreme = (observed < fence_low) | (observed > fence_high)
        n_extreme = is_extreme.sum()
        if n_extreme > 0:
            print(f"{fname} - {column}: {n_extreme} extreme outliers (range: {lowest:.2f} to {highest:.2f})")
    del frame

In [None]:
# Consistency check: collect each file's column -> dtype mapping, then report
# every column shared by all files whose dtype differs between files.
dtypes_summary = {}
for file in parquet_files:
    df = pd.read_parquet(os.path.join(folder_path, file))
    dtypes_summary[file] = df.dtypes.to_dict()
    del df

# set.intersection() called with no arguments raises TypeError, which is what
# happens when no files matched; fall back to an empty set in that case.
column_sets = [set(dtypes) for dtypes in dtypes_summary.values()]
common_cols = set.intersection(*column_sets) if column_sets else set()

# Iterate in sorted order (by string form, so non-string labels also work)
# to keep the report order stable between runs.
for col in sorted(common_cols, key=str):
    unique_types = {dtypes[col] for dtypes in dtypes_summary.values()}
    if len(unique_types) > 1:
        print(f"Column {col} has inconsistent types across files: {unique_types}")