In [23]:
from datetime import datetime
import pandas as pd
from meteostat import Point, Hourly

# Parameters
start = datetime(1995, 10, 1)
end   = datetime(2025, 6, 1)
location = Point(35.4333, -82.0333, 660)

# Fetch hourly data
df = Hourly(location, start, end).fetch()
print("Available columns:", df.columns.tolist())

# Create model inputs
df_clean = df[['pres', 'temp', 'rhum', 'wspd']].copy()

# Verify no missing values
print("Missing proportion per column:\n", df_clean.isna().mean())

# Interpolate time gaps (there should be none, but good practice)
df_clean = df_clean.interpolate(method='time')

print("Final data snapshot:")
print(df_clean.head())

# Convert hourly data to daily aggregations
df_daily = df_clean.resample('D').agg({
    'pres': 'mean',      # Average daily pressure
    'temp': 'mean',      # Average daily temperature
    'rhum': 'mean',      # Average daily relative humidity
    'wspd': 'mean'       # Average daily wind speed
}).round(2)

print(f"\nHourly data shape: {df_clean.shape}")
print(f"Daily data shape: {df_daily.shape}")
print("\nDaily data sample:")
print(df_daily.head())

# save the dataframe to a csv file named "weather_dataset.csv" with datetime as index
df_daily.to_csv("weather_dataset.csv", index=True)


Available columns: ['temp', 'dwpt', 'rhum', 'prcp', 'snow', 'wdir', 'wspd', 'wpgt', 'pres', 'tsun', 'coco']
Missing proportion per column:
 pres    0.006971
temp    0.000531
rhum    0.001019
wspd    0.001876
dtype: float64
Final data snapshot:
                       pres  temp  rhum  wspd
time                                         
1995-10-01 00:00:00  1022.8  16.1  90.0   7.6
1995-10-01 01:00:00  1023.1  15.0  93.0   7.6
1995-10-01 02:00:00  1023.5  15.6  93.0  11.2
1995-10-01 03:00:00  1023.1  15.6  90.0  11.2
1995-10-01 04:00:00  1022.7  15.6  90.0   7.6

Hourly data shape: (260065, 4)
Daily data shape: (10837, 4)

Daily data sample:
               pres   temp   rhum   wspd
time                                    
1995-10-01  1021.18  17.63  82.83   8.01
1995-10-02  1018.78  17.85  83.46   6.87
1995-10-03  1015.09  17.94  93.92    8.4
1995-10-04   1011.3  16.96  98.79   5.57
1995-10-05  1002.01  20.36  91.67  25.63


In [24]:
# Load both datasets
print("Loading datasets...")

# Load weather data
weather_df = pd.read_csv("weather_dataset.csv", index_col=0, parse_dates=True)
print(f"Weather dataset shape: {weather_df.shape}")
print("Weather data sample:")
print(weather_df.head())

# Load water level data  
water_df = pd.read_csv("dataset.csv", parse_dates=['datetime'])
print(f"\nWater level dataset shape: {water_df.shape}")
print("Water level data sample:")
print(water_df.head())

# Convert datetime to date for alignment (remove time component)
water_df['date'] = water_df['datetime'].dt.date
weather_df.index = pd.to_datetime(weather_df.index).date

# Set date as index for water level data
water_df = water_df.set_index('date').drop('datetime', axis=1)

print(f"\nAfter processing:")
print(f"Weather dates range: {weather_df.index.min()} to {weather_df.index.max()}")
print(f"Water level dates range: {water_df.index.min()} to {water_df.index.max()}")

# Combine datasets using inner join (only dates that exist in both)
combined_df = weather_df.join(water_df, how='inner')

# Reorder columns to put datetime first and stage_m second
combined_df = combined_df.reset_index()
combined_df = combined_df[['index', 'stage_m'] + [col for col in combined_df.columns if col not in ['index', 'stage_m']]]

print(f"\nCombined dataset shape: {combined_df.shape}")
print("Combined data sample:")
print(combined_df.head())

# Check for any missing values
print(f"\nMissing values in combined dataset:")
print(combined_df.isnull().sum())

# Save the combined dataset
combined_df.to_csv("combined_dataset.csv", index=False)
print(f"\nCombined dataset saved as 'combined_dataset.csv'")
print(f"Final dataset has {len(combined_df)} rows and {len(combined_df.columns)} columns")
print(f"Columns: {combined_df.columns.tolist()}")

Loading datasets...
Weather dataset shape: (10837, 4)
Weather data sample:
               pres   temp   rhum   wspd
time                                    
1995-10-01  1021.18  17.63  82.83   8.01
1995-10-02  1018.78  17.85  83.46   6.87
1995-10-03  1015.09  17.94  93.92   8.40
1995-10-04  1011.30  16.96  98.79   5.57
1995-10-05  1002.01  20.36  91.67  25.63

Water level dataset shape: (10837, 2)
Water level data sample:
                   datetime  stage_m
0 1995-10-01 00:00:00+00:00     1.96
1 1995-10-02 00:00:00+00:00     1.93
2 1995-10-03 00:00:00+00:00     1.91
3 1995-10-04 00:00:00+00:00     3.79
4 1995-10-05 00:00:00+00:00     5.41

After processing:
Weather dates range: 1995-10-01 to 2025-06-01
Water level dates range: 1995-10-01 to 2025-06-01

Combined dataset shape: (10837, 6)
Combined data sample:
        index  stage_m     pres   temp   rhum   wspd
0  1995-10-01     1.96  1021.18  17.63  82.83   8.01
1  1995-10-02     1.93  1018.78  17.85  83.46   6.87
2  1995-10-03     1.