In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
import plotly.graph_objects as go

# Load the cleaned order data and derive each order's calendar month number
# from its timestamp; 'Month' is the only model feature used further down.
csv_path = r"C:\Users\Admin\Documents\Data Intelligence Engineer project\Task Files\cleaned-dataset.csv"
df = pd.read_csv(csv_path)
order_timestamps = pd.to_datetime(df['Order Timestamp'])
df['Order Timestamp'] = order_timestamps
df['Month'] = order_timestamps.dt.month

# Feature matrix (month number only) and regression target (order total).
X = df[['Month']]
y = df['Total amount']

# Hold out 20% of the rows with a fixed seed so the split is reproducible.
# NOTE(review): X_test / y_test are not used anywhere in this cell, so the
# model is never scored against the hold-out set here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a gradient-boosted regressor with default hyper-parameters;
# fit() returns the estimator itself, so the call can be chained.
regressor = GradientBoostingRegressor().fit(X_train, y_train)
# Ask the fitted model for a predicted sale amount for each month number 1-12.
months = pd.DataFrame({'Month': range(1, 13)})
predictions = regressor.predict(months)

# Observed mean sale amount per month. The result is reindexed onto 1-12 so
# every value stays paired with its own month on the x-axis: without this, a
# month that has no rows in the data is simply absent from the groupby result
# and all later months would be plotted one position too early. Months with no
# data become NaN here.
actual = df.groupby('Month')['Total amount'].mean().reindex(range(1, 13))

# Replace the month numbers with display names for the x-axis. This must
# happen after predict(), which needs the numeric month values.
month_names = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
months['Month'] = [month_names[number - 1] for number in months['Month']]

# Currency-formatted point labels; a month without data gets an empty label
# rather than "$nan".
actual_formatted = ["" if pd.isna(a) else f"${a:,.2f}" for a in actual]
predictions_formatted = [f"${p:,.2f}" for p in predictions]

# 3% of the largest plotted value (Series.max() skips NaN); used further down
# when building each trace's customdata.
offset = max(actual.max(), predictions.max()) * 0.03

# Both traces show only their formatted label on hover; the empty <extra> tag
# suppresses the secondary hover box.
# NOTE(review): customdata is attached to each trace below but is not
# referenced by this hover template.
hover_template = '%{text}<extra></extra>'

# Observed monthly averages, labelled above each marker.
actual_trace = go.Scatter(
    x=months['Month'],
    y=actual,
    mode='lines+markers+text',
    name='Actual Average Sales',
    text=actual_formatted,
    textposition='top center',
    customdata=actual + offset,
    hovertemplate=hover_template,
)

# Model predictions, labelled below each marker so the two label sets
# sit on opposite sides of their points.
prediction_trace = go.Scatter(
    x=months['Month'],
    y=predictions,
    mode='lines+markers+text',
    name='Model Prediction',
    text=predictions_formatted,
    textposition='bottom center',
    customdata=predictions - offset,
    hovertemplate=hover_template,
)

fig = go.Figure()
fig.add_trace(actual_trace)
fig.add_trace(prediction_trace)

layout_options = {
    'title': 'Predicted vs Actual Average Sales Amount by Month of Year',
    'xaxis_title': 'Month of Year',
    'yaxis_title': 'Sales Amount',
}
fig.update_layout(**layout_options)
fig.show()
