# Step 1: Install and Import Libraries

In [None]:
# Get time series data
#import yfinance as yf

# Prophet model for time series forecast
from prophet import Prophet

# Data processing
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Model performance evaluation
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Step 2: Pull Data

In [None]:
# import pandas as pd
import requests

# Raw GitHub URL of the JSON file used as training data.
# Previous source, kept for reference:
# github_url = "https://raw.githubusercontent.com/cfwiecha/Fall2024Capstone/refs/heads/main/frontend_shipping_source_300m_30s_pruned.json"

github_url = "https://raw.githubusercontent.com/yuyunfrancis/aiops/refs/heads/master/Lab6/Lab5/boutique_training.json"

try:
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    # A timeout is raised as a RequestException and reported by the first handler below.
    response = requests.get(github_url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

    data = response.json()
    # Only the first entry of data['data']['result'] is used; its 'values' rows
    # are loaded as two columns and named ds / y, the names Prophet expects.
    df_train = pd.DataFrame( data['data']['result'][0]['values'] )
    df_train.columns = ['ds', 'y']
    print( df_train.head() )

# NOTE: on failure the error is only printed; df_train is then left undefined
# (or stale from an earlier run), so the following cells will not work.
except requests.exceptions.RequestException as e:
    print(f"Error fetching data from GitHub: {e}")
except ValueError as e:
    print(f"Error decoding JSON response: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

In [None]:
import json

In [None]:
# Disabled alternative: load the training data from a local JSON file instead of GitHub.
# (The parsed JSON is stored in `prom`, so the DataFrame must be built from `prom`.)
#f = open("frontend_shipping_source_300m_30s_pruned.json")
#prom = json.load(f)
#df_train = pd.DataFrame( prom['data']['result'][0]['values'] )
#df_train.columns = ['ds', 'y']
#df_train

In [None]:
# Raw GitHub URL of the JSON file used as test data.
# Previous source, kept for reference:
# github_url = "https://raw.githubusercontent.com/cfwiecha/Fall2024Capstone/refs/heads/main/20min.json"

# NOTE(review): this is the same URL the training cell uses (boutique_training.json),
# so the model is evaluated on its own training data -- confirm this is intended.
github_url = "https://raw.githubusercontent.com/yuyunfrancis/aiops/refs/heads/master/Lab6/Lab5/boutique_training.json"


try:
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    # A timeout is raised as a RequestException and reported by the first handler below.
    response = requests.get(github_url, timeout=30)
    response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

    data = response.json()
    # Only the first entry of data['data']['result'] is used; its 'values' rows
    # are loaded as two columns and named ds / y, the names Prophet expects.
    df_test = pd.DataFrame( data['data']['result'][0]['values'] )
    df_test.columns = ['ds', 'y']
    print( df_test.head() )

# NOTE: on failure the error is only printed; df_test is then left undefined
# (or stale from an earlier run), so the following cells will not work.
except requests.exceptions.RequestException as e:
    print(f"Error fetching data from GitHub: {e}")
except ValueError as e:
    print(f"Error decoding JSON response: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")



In [None]:
# Inspect the freshly loaded test frame (columns: ds, y)
df_test

In [None]:
# Re-base the training timestamps so that the first sample sits at time 0.
# The test data gets the same treatment, which puts both series on a common
# "time since start" axis.

train_start_ds = df_train['ds'].iat[0]
print(train_start_ds)

df_train['ds'] = df_train['ds'].sub(train_start_ds)

df_train.head()

In [None]:
# Re-base the test timestamps so that the first sample sits at time 0,
# matching the training data (assumes the test data starts from 0 cycle time
# like the training data).

test_start_ds = df_test['ds'].iat[0]
print(test_start_ds)

df_test['ds'] = df_test['ds'].sub(test_start_ds)

df_test.head()

In [None]:
from datetime import datetime

In [None]:
# Turn the re-based second offsets into datetime objects for Prophet.
# NOTE(review): datetime.fromtimestamp converts using the machine's local time
# zone, so the resulting dates depend on where the notebook runs.
df_train['ds'] = df_train['ds'].map(datetime.fromtimestamp)
df_train

In [None]:
# Turn the re-based second offsets into datetime objects for Prophet.
# NOTE(review): datetime.fromtimestamp converts using the machine's local time
# zone, so the resulting dates depend on where the notebook runs.
df_test['ds'] = df_test['ds'].map(datetime.fromtimestamp)
df_test

In [None]:
# Cast the metric column to float, then summarize the training frame
df_train['y'] = df_train['y'].astype('float64')
df_train.info()

In [None]:
# Cast the metric column to float, then summarize the test frame
df_test['y'] = df_test['y'].astype('float64')
df_test.info()

Next, let's visualize the training metric over time using `seaborn`, and add the legend to the plot using `matplotlib`. Because the timestamps were shifted to start at zero before being converted to datetimes, the x-axis shows times relative to the start of the series rather than the original collection time.

In [None]:
# Plot the training metric against time with seaborn on a 12x8 figure
sns.set(rc={'figure.figsize': (12, 8)})
sns.lineplot(data=df_train, x='ds', y='y')
plt.legend(['Training metric'])

# Step 3: Build Time Series Model Using Prophet in Python

In [None]:
# Flat-growth Prophet model with a 99% uncertainty interval; the built-in
# yearly, weekly and daily seasonalities are all switched off.
model = Prophet(
    growth='flat',
    interval_width=0.99,
    yearly_seasonality=False,
    weekly_seasonality=False,
    daily_seasonality=False,
)

# Custom seasonality: Prophet measures `period` in days, so 1/24 is one hour
model.add_seasonality(name='hourly', period=1/24, fourier_order=5)

# Train on the training frame (columns ds, y)
model.fit(df_train)

# Step 4: Make Predictions Using Prophet in Python

After building the model, in step 4, we use the model to make predictions on the dataset. The forecast plot shows that the predictions are in general aligned with the actual values.

In [None]:
# Predict for every timestamp in the test frame
forecast = model.predict(df_test)

# Plot the forecast; keeping the figure in a variable stops the notebook
# from rendering the chart a second time
forecast_fig = model.plot(forecast)

We can also check the components plot for the trend and the custom hourly seasonality (the built-in yearly, weekly, and daily seasonalities are disabled in this model).

In [None]:
# Plot the individual forecast components; keeping the figure in a variable
# stops the notebook from rendering the chart a second time
components_fig = model.plot_components(forecast)

# Step 5: Check Time Series Model Performance

In [None]:
# Inspect the forecast frame returned by model.predict; its ds, yhat,
# yhat_lower and yhat_upper columns are used in the following cells
forecast

In [None]:
# Join actuals with the prediction and its interval bounds on the timestamp;
# the inner join keeps only timestamps present in both frames
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
performance = df_test.merge(forecast[forecast_columns], how='inner', on='ds')

In [None]:
# Inspect the merged frame of actual values (y) and predictions (yhat and its bounds)
performance

In [None]:
# Keep only rows that have both an actual and a predicted value
performance = performance.dropna(subset=['y', 'yhat'])

# Mean absolute error between actuals and predictions
performance_MAE = mean_absolute_error(y_true=performance['y'], y_pred=performance['yhat'])
print(f'The MAE for the model is {performance_MAE}')

# Mean absolute percentage error between actuals and predictions
performance_MAPE = mean_absolute_percentage_error(y_true=performance['y'], y_pred=performance['yhat'])
print(f'The MAPE for the model is {performance_MAPE}')

# Step 6: Identify Anomalies

In step 6, we will identify the time series anomalies by checking if the actual value is outside of the uncertainty interval. If the actual value is smaller than the lower bound or larger than the upper bound of the uncertainty interval, the anomaly indicator is set to 1, otherwise, it's set to 0.

Using `value_counts()`, we can see how many data points were flagged as outliers (1) and how many were not (0).

In [None]:
# Create an anomaly indicator: 1 when the actual value lies below yhat_lower or
# above yhat_upper, otherwise 0.
# The comparison is vectorized instead of a row-wise apply(axis=1): it yields the
# same flags, but also works when `performance` has no rows (the row-wise apply
# returns a DataFrame for an empty frame, which cannot be assigned to one column).
performance['anomaly'] = (
    performance['y'].astype(float).lt(performance['yhat_lower'])
    | performance['y'].astype(float).gt(performance['yhat_upper'])
).astype(int)


In [None]:
# Summarize the performance frame, which now includes the 'anomaly' column
performance.info()

In [None]:
# Count how many rows are flagged as anomalies (1) versus not flagged (0)
performance['anomaly'].value_counts()

After printing out the anomalies, we can check whether each outlier falls below the lower bound or above the upper bound of the uncertainty interval.

In [None]:
# Collect the flagged rows in chronological order
anomalies = performance.loc[performance['anomaly'].eq(1)].sort_values('ds')
anomalies

In the visualization, all the dots are actual values and the black line represents the predicted values. The orange dots are the outliers.

In [None]:
# Actual values as dots (colored by the anomaly flag), with the predicted
# values drawn as a black line on the same axes
ax = sns.scatterplot(data=performance, x='ds', y='y', hue='anomaly')
sns.lineplot(data=performance, x='ds', y='yhat', color='black', ax=ax)