## Setup

Conda environment (from [darts GitHub](https://github.com/unit8co/darts/blob/master/INSTALL.md)):
- `conda create --name darts_models python=3.9`
- `conda activate darts_models`
- `conda install -c conda-forge -c pytorch u8darts-all`
- in a cell in a Jupyter notebook: `%pip install darts`
- when you have finished using the environment: `conda deactivate`

In [None]:
import numpy as np
import pandas as pd
import darts
from sklearn.metrics import mean_squared_error

from darts_model_utils import prep_station_data
from darts_model_utils import get_station_data
from darts_model_utils import darts_station_data

## Trip Start Station

In [None]:
# Local CSV of trips keyed by start station (the file name suggests 2017-2022 coverage).
start_file = "../model_trips_start_station_2017_2022.csv"

# Alternative S3 source, kept for reference; it needs `bucket` to be defined first.
#s3_start_location = f"s3://{bucket}/{start_file}*"
#trips_start = pd.read_csv(s3_start_location, parse_dates = ["starttime"])

# `parse_dates = True` only asks pandas to parse the *index*, which leaves "starttime"
# as plain strings. Naming the column makes the later date filters real datetime
# comparisons rather than lexicographic string comparisons.
trips_start = pd.read_csv(start_file, parse_dates = ["starttime"])
trips_start.shape

### Data Processing

In [None]:
def _select_period(trips, start, end, time_col = "starttime"):
    """Return the rows of `trips` whose `time_col` lies in the half-open window [start, end).

    The lower bound is inclusive and the upper bound exclusive, so consecutive
    windows that share a boundary neither overlap nor drop a row stamped exactly
    on that boundary.
    """
    in_window = (trips[time_col] >= start) & (trips[time_col] < end)
    return trips[in_window]


# Chronological split: three years of training data, then one year each for
# validation and test.
trips_start_train = _select_period(trips_start, "2017-09-01", "2020-09-01")
print(trips_start_train.shape)
trips_start_val = _select_period(trips_start, "2020-09-01", "2021-09-01")
print(trips_start_val.shape)
trips_start_test = _select_period(trips_start, "2021-09-01", "2022-09-01")
print(trips_start_test.shape)

In [None]:
# Run each split through the project helper `prep_station_data`, passing the station-id
# and timestamp column names. The result exposes a "size" column (presumably a trip
# count per group -- confirm in darts_model_utils); its total is printed as a sanity check.
# `Series.sum()` is used instead of the built-in `sum()` so the total is computed in
# vectorised code rather than by iterating the column element by element in Python.
trips_start_train_group = prep_station_data(trips_start_train, "start station id", "starttime")
print(trips_start_train_group["size"].sum())
trips_start_val_group = prep_station_data(trips_start_val, "start station id", "starttime")
print(trips_start_val_group["size"].sum())
trips_start_test_group = prep_station_data(trips_start_test, "start station id", "starttime")
print(trips_start_test_group["size"].sum())

In [None]:
# Width of the time bins: trips are grouped and summed per 15-minute increment.
freq = "15min" # passed to darts_station_data as the resampling frequency
# Common end timestamp for every station's series within a split, so that all series
# in a split end at the same time. 23:45 is the last 15-minute step of the final day
# of each split window.
train_max_date = "2020-08-31 23:45:00" # last step of the training window
val_max_date = "2021-08-31 23:45:00" # last step of the validation window
test_max_date = "2022-08-31 23:45:00" # last step of the test window

In [None]:
# Convert each grouped split into the darts inputs returned by the project helper
# `darts_station_data` (indexed like a list further down). Every split shares the
# same column names and bin width and differs only in its common end timestamp.
trips_start_train_list = darts_station_data(
    trips_start_train_group,
    "start station id",
    "starttime",
    freq,
    train_max_date,
)
trips_start_val_list = darts_station_data(
    trips_start_val_group,
    "start station id",
    "starttime",
    freq,
    val_max_date,
)
trips_start_test_list = darts_station_data(
    trips_start_test_group,
    "start station id",
    "starttime",
    freq,
    test_max_date,
)

### Random Forest Model

In [None]:
from darts.models import RandomForest

In [None]:
# Baseline random forest that uses a single lag (the previous time step) of the target.
# `random_state` is forwarded to the underlying scikit-learn forest so that repeated
# runs of the notebook build the same trees and report the same scores.
rf_mod1 = RandomForest(lags = 1, random_state = 42)

In [None]:
# Fit one model on all of the training series at once.
rf_mod1.fit(series = trips_start_train_list)

In [None]:
# Show where station 177 sits among the distinct station ids of the validation
# split and then of the training split (in that order).
station_id = 177
for grouped in (trips_start_val_group, trips_start_train_group):
    station_ids = grouped["start station id"].unique()
    print(np.where(station_ids == station_id))

In [None]:
# Forecast the first 1000 validation steps for station 177.
# `predict(n, series)` returns the n steps that FOLLOW the end of `series`. The history
# therefore has to be the station's *training* series: forecasting from the validation
# series would yield values that lie after the validation window and cannot be scored
# against it.
# NOTE(review): assumes the series lists follow the `.unique()` order of station ids,
# the same assumption behind the hard-coded list position 247 -- confirm in
# darts_model_utils.
station_train_pos = np.flatnonzero(trips_start_train_group["start station id"].unique() == 177)
if station_train_pos.size == 0:
    raise ValueError("station 177 is not present in the training split")
rf_mod1_pred = rf_mod1.predict(1000, trips_start_train_list[int(station_train_pos[0])])

In [None]:
# RMSE of the forecast against the leading validation values of the series at list
# position 247.
# `predict` returns a single TimeSeries when it is given a single series; indexing that
# with `[0]` would keep just its first time step and make the two arrays differ in
# length. Only unwrap when a list of forecasts was actually returned.
rf_mod1_forecast = rf_mod1_pred[0] if isinstance(rf_mod1_pred, (list, tuple)) else rf_mod1_pred
rf_mod1_forecast_values = rf_mod1_forecast.values()
rf_mod1_actual_values = trips_start_val_list[247].values()[:len(rf_mod1_forecast_values)]
# Take the square root explicitly: the `squared = False` flag is deprecated in recent
# scikit-learn releases, while this form works across versions.
np.sqrt(mean_squared_error(rf_mod1_actual_values, rf_mod1_forecast_values))

In [None]:
# Overlay the forecast on the validation series at list position 247.
# `predict` returns a single TimeSeries for a single input series, so `[0]` would
# reduce the forecast to one point; only unwrap when a list was returned.
rf_mod1_forecast = rf_mod1_pred[0] if isinstance(rf_mod1_pred, (list, tuple)) else rf_mod1_pred
trips_start_val_list[247].plot(label = "validation")
rf_mod1_forecast.plot(label = "random forest forecast (lags = 1)")