In [6]:
# import libraries needed
from seebuoy import NDBC
from IPython.display import display
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm, skew, probplot
from scipy.special import boxcox1p
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import sys
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from datetime import datetime, timedelta
from prophet import Prophet

In [7]:
def rmse(y_true, y_pred):
    """
    Compute Root Mean Squared Error (RMSE).
    
    Parameters:
    y_true : array-like of shape (n_samples,)
        Ground truth (correct) target values.
        
    y_pred : array-like of shape (n_samples,)
        Estimated target values.

    Returns:
    float
        The RMSE value.
    """
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

In [8]:
def buoySetUp(buoyNum):  
    ndbc = NDBC(timeframe="historical")
    df_avail = ndbc.available_data(station_id=buoyNum)
    df_data = ndbc.get_data(buoyNum)

    df_data.dropna(axis=1, how='all', inplace=True)
    df_data = df_data.reset_index()

    # lets limit the df to 2 columns: date and wave height
    buoy_df = df_data[["date","wave_height", "average_period"]]

    # Set 'date' column as the index
    buoy_df = buoy_df.set_index("date")

    buoy_df['wave_height_interpolated'] = buoy_df['wave_height'].interpolate(method='time') # interpolate missing values based on time
    buoy_df['average_period_interpolated'] = buoy_df['average_period'].interpolate(method='time') # interpolate missing values based on time

    return buoy_df


In [11]:
def doit(buoy_df, f, c, target, buoyNum):
    """
    Parameters:
    f : floor value, how many days back to train the model on

    c : ceiling value, how many days we want to predict for (want 10-15 days)

    target : either "wave_height" or "average_period". Variable we want to traiun and predict on.
    """
    if target != "wave_height" and target != "average_period":
        print("Not a valid target variable")
        return

    #Sets up date objects and floor and ceiling
    today_date = datetime.today().date()
    floor = f + c
    ceiling = c

    # Calculate the floordate and the ceiling date
    floorDate = today_date - timedelta(days=floor)
    ceilingDate = today_date - timedelta(days=ceiling)

    # Splits up the df into recent and past
    # Recent holds the 15 most recent days of data
    # Past holds all data from the floor to the ceiling
    recent_df = buoy_df[buoy_df['date'] > pd.Timestamp(ceilingDate)]
    past_df = buoy_df[(buoy_df['date'] > pd.Timestamp(floorDate)) & (buoy_df['date'] < pd.Timestamp(ceilingDate))]

    #Sets up the modeling df with the date and target variable column as well as the cap for logistic growth prophet algo
    modeling_df = past_df[["date",f'{target}_interpolated']]
    modeling_df = modeling_df.rename(columns={"date": "ds", f'{target}_interpolated': "y"})
    cap = modeling_df['y'].max() + 1

    # Initialize Prophet model with the cap
    model = Prophet(growth="logistic")
    modeling_df['cap'] = cap
    model.fit(modeling_df)

    #Makes prediction
    future = model.make_future_dataframe(periods=ceiling)
    future['cap'] = cap
    forecast = model.predict(future)

    forecast['date'] = forecast['ds']
    merged_df = pd.merge(forecast.tail(15), recent_df, on='date', how='left')

    print(f"RMSE for {buoyNum} for {ceiling} days using {floor - 15} days worth of training data for {target}")
    print(rmse(merged_df[f'{target}_interpolated'], merged_df["yhat"]))
    print()

In [12]:
buoy_list = ["46239", "46258", "46232", "42001", "41002", "44037", "44097", "44009", "41064"]


for buoyNum in buoy_list:
    buoy_df = buoySetUp(buoyNum)
    buoy_df = buoy_df.reset_index()
    doit(buoy_df, 365, 15, "wave_height", buoyNum)

14:44:53 - cmdstanpy - INFO - Chain [1] start processing
14:44:58 - cmdstanpy - INFO - Chain [1] done processing


RMSE for 46239 for 15 days using 365 days worth of training data for wave_height
0.7381412655699661



14:45:13 - cmdstanpy - INFO - Chain [1] start processing
14:45:18 - cmdstanpy - INFO - Chain [1] done processing


RMSE for 46258 for 15 days using 365 days worth of training data for wave_height
0.3796250863542164



14:45:39 - cmdstanpy - INFO - Chain [1] start processing
14:45:41 - cmdstanpy - INFO - Chain [1] done processing


RMSE for 46232 for 15 days using 365 days worth of training data for wave_height
0.4713763422654524



14:46:28 - cmdstanpy - INFO - Chain [1] start processing
14:46:53 - cmdstanpy - INFO - Chain [1] done processing


RMSE for 42001 for 15 days using 365 days worth of training data for wave_height
0.8409574676069215



14:47:35 - cmdstanpy - INFO - Chain [1] start processing
14:48:05 - cmdstanpy - INFO - Chain [1] done processing


RMSE for 41002 for 15 days using 365 days worth of training data for wave_height
0.9279168916864616



KeyError: "['average_period'] not in index"