In [38]:
!pip install prophet xgboost



In [40]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np
import matplotlib.dates as mdates

In [42]:
# Sales columns that are modelled one at a time further down the notebook.
categories = ['M01AB', 'M01AE', 'N02BA', 'N02BE', 'N05B', 'N05C', 'R03', 'R06']

# Read the weekly sales CSV, parse the 'datum' column as dates and use it as
# the index so the date accessors (.year, .month, ...) are available on it.
# NOTE(review): this is an absolute path on one machine; confirm where the
# file should live before sharing the notebook.
data_df = pd.read_csv(
    "/Users/vamsikeshwaranm/Downloads/CTS/salesweekly_corrected_no_outliers.csv",
    parse_dates=['datum'],
).set_index('datum')

In [44]:
def prepare_data(df, lags=4, columns=None):
    """Build the modelling frame: calendar features plus lagged target columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a DatetimeIndex that contains the target columns.
    lags : int, default 4
        Number of lag features per target column; ``<col>_lag1`` holds the
        previous row's value, ``<col>_lag2`` the one before, and so on.
    columns : sequence of str, optional
        Target columns to keep and lag. Defaults to the notebook-level
        ``categories`` list, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected columns with ``year``, ``month`` and ``week``
        (ISO week number) added, followed by the lag columns. Rows containing
        any missing value (which includes the first ``lags`` rows) are dropped.
    """
    # Only fall back to the global list when the caller did not pass one, so
    # the function can be reused without relying on notebook state.
    cols = list(categories) if columns is None else list(columns)

    df_prep = df[cols].copy()
    df_prep['year'] = df_prep.index.year
    df_prep['month'] = df_prep.index.month
    df_prep['week'] = df_prep.index.to_series().dt.isocalendar().week

    # Column order is per target column, then by increasing lag.
    lagged = {
        f'{col}_lag{k}': df_prep[col].shift(k)
        for col in cols
        for k in range(1, lags + 1)
    }
    return df_prep.assign(**lagged).dropna()

def train_model(X, y, prev_model=None):
    """Train an XGBoost regressor for 100 boosting rounds and return the booster.

    X and y are wrapped in an ``xgb.DMatrix`` (y as the label). When
    ``prev_model`` is truthy it is passed to ``xgb.train`` as ``xgb_model``;
    otherwise no starting model is supplied.
    """
    booster_params = dict(
        objective='reg:squarederror',
        eta=0.1,
        max_depth=3,
        eval_metric='rmse',
    )
    n_rounds = 100
    training_matrix = xgb.DMatrix(X, label=y)
    # xgb_model=None is xgb.train's default, so passing None is the same as
    # leaving the argument out.
    starting_model = prev_model if prev_model else None
    return xgb.train(booster_params, training_matrix, n_rounds, xgb_model=starting_model)

# Feature frame for the whole history (calendar columns + lagged sales).
df_prep = prepare_data(data_df)

# One booster per sales column, each fitted on every row of df_prep using the
# calendar columns and that column's own four lags.
models = {}
for col in categories:
    features = ['year', 'month', 'week']
    features.extend(f'{col}_lag{i}' for i in range(1, 5))
    X, y = df_prep[features], df_prep[col]
    models[col] = train_model(X, y)

In [46]:
# Hold out the most recent rows of df_prep for evaluation.
test_size = 12
train_df = df_prep.iloc[:-test_size]
test_df = df_prep.iloc[-test_size:]

y_test = test_df[categories]
y_pred = pd.DataFrame(index=test_df.index, columns=categories)

# `models` was fitted on all of df_prep, test rows included, so scoring it on
# test_df would be an in-sample evaluation. Fit separate models on train_df
# only and use those for the hold-out predictions; `models` is left untouched
# for forecasting.
eval_models = {}
for col in categories:
    features = ['year', 'month', 'week'] + [f'{col}_lag{i}' for i in range(1,5)]
    eval_models[col] = train_model(train_df[features], train_df[col])
    X_test = test_df[features]
    y_pred[col] = eval_models[col].predict(xgb.DMatrix(X_test))

In [48]:
def forecast_col(col, n, model, df, lags=4):
    """Recursively forecast ``n`` weekly steps ahead for one column.

    Each step builds a single-row feature frame (``year``, ``month``, ISO
    ``week`` and ``<col>_lag1..lagN``) for the next date, asks ``model`` for a
    prediction and feeds that prediction back in as the newest lag.

    Parameters
    ----------
    col : str
        Column of ``df`` to forecast.
    n : int
        Number of weekly steps to forecast. ``n <= 0`` yields an empty Series.
    model : object
        Fitted model whose ``predict`` accepts an ``xgb.DMatrix``.
    df : pandas.DataFrame
        History with a DatetimeIndex; the last ``lags`` values of ``df[col]``
        seed the lag features.
    lags : int, default 4
        Number of lag features the model was trained with.

    Returns
    -------
    pandas.Series
        Predictions indexed by the dates used to build their features, i.e.
        ``df.index.max()`` plus 1, 2, ... weeks.
    """
    if n <= 0:
        return pd.Series([], index=pd.DatetimeIndex([]), dtype='float64')

    history = list(df[col].tail(lags))
    one_week = pd.Timedelta(weeks=1)
    current_date = df.index.max()
    preds, dates = [], []
    for _ in range(n):
        current_date = current_date + one_week
        feat_dict = {
            'year': current_date.year,
            'month': current_date.month,
            'week': current_date.isocalendar()[1],
        }
        # lag1 is the most recent value, lag2 the one before it, and so on.
        for k in range(1, lags + 1):
            feat_dict[f'{col}_lag{k}'] = history[-k]
        pred = model.predict(xgb.DMatrix(pd.DataFrame([feat_dict])))[0]
        preds.append(pred)
        dates.append(current_date)
        # Slide the window: drop the oldest value, append the new prediction.
        history = history[1:] + [pred]

    # Index by the dates the features were built from. Rebuilding the index
    # with freq='W' would snap to Sundays and disagree with those dates
    # whenever the history is not Sunday-based.
    return pd.Series(preds, index=pd.DatetimeIndex(dates))

In [None]:
# Forecast horizon in weekly steps, entered interactively.
weeks = int(input("Enter number of weeks to forecast: "))

future_forecasts = {}

for col in categories:
    features = ['year', 'month', 'week'] + [f'{col}_lag{i}' for i in range(1, 5)]
    X_all = df_prep[features]
    y_all = df_prep[col]

    # Predictions over the full prepared history (the same rows `models` was
    # fitted on in this notebook, so these are in-sample).
    y_pred_all = models[col].predict(xgb.DMatrix(X_all))

    forecast_series = forecast_col(col, weeks, models[col], df_prep)

    # Prepend the last historical prediction so the forecast line starts where
    # the historical prediction line ends. Note that the series stored in
    # future_forecasts therefore begins with this one historical point.
    last_date = df_prep.index[-1]
    last_pred = y_pred_all[-1]
    forecast_series = pd.concat([
        pd.Series([last_pred], index=[last_date], name=col),
        forecast_series
    ])

    future_forecasts[col] = forecast_series

    plt.figure(figsize=(14, 5))
    plt.plot(df_prep.index, y_all.values, label="Actual", marker="o")
    plt.plot(df_prep.index, y_pred_all, label="Predicted (Historical)", marker="x", alpha=0.7)
    # The horizon is given in weeks; the previous label referenced an
    # undefined `months` variable and raised NameError.
    plt.plot(forecast_series.index, forecast_series.values,
             label=f"Forecast (Next {weeks} Weeks)", marker="x", color="red")

    plt.title(f"{col} - Actual vs Predicted & Forecast")
    plt.xlabel("Date")
    plt.ylabel("Sales")
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

    rmse = np.sqrt(mean_squared_error(y_all, y_pred_all))
    r2 = r2_score(y_all, y_pred_all)

    # MAPE is undefined where the actual value is 0, so those rows are excluded.
    nonzero_idx = y_all != 0
    mape = np.mean(np.abs((y_all[nonzero_idx] - y_pred_all[nonzero_idx]) / y_all[nonzero_idx])) * 100
    accuracy = 100 - mape

    print(f"\nAccuracy metrics for {col} (whole data):")
    print(f"RMSE: {rmse:.4f}")
    print(f"R-squared: {r2:.4f}")
    print(f"MAPE: {mape:.2f}%")
    print(f"Accuracy (100 - MAPE): {accuracy:.2f}%")
