In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate
import plotly.express as px
from statsmodels.tsa.seasonal import seasonal_decompose
from pandas.plotting import autocorrelation_plot
from sklearn.preprocessing import StandardScaler
from prophet import Prophet
from datetime import datetime
# import xgbregressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, make_scorer
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor, DMatrix, train


In [29]:
train_df = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

sub_file = pd.read_csv('data/sample_submission.csv')


In [30]:
def transform_data(data):
    if 'num_sold' in data.columns: 
        data.fillna({'num_sold':0}, inplace=True)

    data['month'] = pd.to_datetime(data['date']).dt.month
    data['year'] = pd.to_datetime(data['date']).dt.year
    data['season'] = data['month'].map({1: 'Winter', 2: 'Winter', 3: 'Spring', 4: 'Spring', 5: 'Spring', 6: 'Summer', 7: 'Summer', 8: 'Summer', 9: 'Autumn', 10: 'Autumn', 11: 'Autumn', 12: 'Winter'})

    # change season to categorical
    data['season'] = data['season'].astype('category')
    data['season'] = data['season'].cat.codes

    # change product to categorical
    data['product'] = data['product'].astype('category')
    data['product'] = data['product'].cat.codes

    # change country to categorical
    data['country'] = data['country'].astype('category')
    data['country'] = data['country'].cat.codes

    # change store to categorical
    data['store'] = data['store'].astype('category')
    data['store'] = data['store'].cat.codes

    # change store to categorical
    data['year'] = data['year'].astype('category')
    data['year'] = data['year'].cat.codes

    data[['country', 'store', 'product',  'month', 'year', 'season']] = data[['country', 'store', 'product',  'month', 'year', 'season']].astype(int)

    return data

train_df = transform_data(train_df)
test = transform_data(test)

In [31]:
train_df

Unnamed: 0,id,date,country,store,product,num_sold,month,year,season
0,0,2010-01-01,0,0,0,0.0,1,0,3
1,1,2010-01-01,0,0,1,973.0,1,0,3
2,2,2010-01-01,0,0,2,906.0,1,0,3
3,3,2010-01-01,0,0,3,423.0,1,0,3
4,4,2010-01-01,0,0,4,491.0,1,0,3
...,...,...,...,...,...,...,...,...,...
230125,230125,2016-12-31,5,1,0,466.0,12,6,3
230126,230126,2016-12-31,5,1,1,2907.0,12,6,3
230127,230127,2016-12-31,5,1,2,2299.0,12,6,3
230128,230128,2016-12-31,5,1,3,1242.0,12,6,3


# Scaling

In [32]:
# # log scale train and test
# train_log = train.copy()
# test_log = test.copy()

# train_log[['country', 'store', 'product',  'month', 'year', 'season','num_sold']] = train_log[['country', 'store', 'product',  'month', 'year', 'season', 'num_sold']] + 1
# test_log[['country', 'store', 'product',  'month', 'year', 'season']] = test_log[['country', 'store', 'product',  'month', 'year', 'season']] + 1

# train_log[['country', 'store', 'product',  'month', 'year', 'season','num_sold']] = np.log(train_log[['country', 'store', 'product',  'month', 'year', 'season','num_sold']])
# test_log[['country', 'store', 'product',  'month', 'year', 'season']] = np.log(test_log[['country', 'store', 'product',  'month', 'year', 'season']])


In [33]:
# # start scaling 
# scaler = StandardScaler()

# train_log[['country', 'store', 'product',  'month', 'year', 'season']] = scaler.fit_transform(train_log[['country', 'store', 'product',  'month', 'year', 'season']])
# test_log[['country', 'store', 'product',  'month', 'year', 'season']] = scaler.transform(test_log[['country', 'store', 'product',  'month', 'year', 'season']])

# train[['country', 'store', 'product',  'month', 'year', 'season']] = scaler.fit_transform(train[['country', 'store', 'product',  'month', 'year', 'season']])
# test[['country', 'store', 'product',  'month', 'year', 'season']] = scaler.transform(test[['country', 'store', 'product',  'month', 'year', 'season']])

In [None]:
X = train_df[['country', 'store', 'product',  'month', 'year', 'season']]
y = train_df['num_sold']

# XGBRegressor

In [35]:
def mape_objective(y_pred, dtrain):
    y_true = dtrain.get_label()  # Extract true labels from DMatrix
    grad = np.sign(y_pred - y_true) / (np.abs(y_true) + 1e-6)  # Gradient
    hess = 1 / (np.abs(y_true) + 1e-6)                        # Hessian
    return grad, hess

# Corrected MAPE evaluation metric
def mape_metric(y_pred, dtrain):
    y_true = dtrain.get_label()  # Extract true labels from DMatrix
    mape = np.mean(np.abs((y_true - y_pred) / (y_true + 1e-6))) * 100
    return 'mape', mape

In [36]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to DMatrix for custom training
dtrain = DMatrix(X_train, label=y_train)
dval = DMatrix(X_val, label=y_val)

# Define XGBoost parameters
params = {
    'max_depth': 6,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:squarederror',  # Objective set to regressor for compatibility
    'eval_metric': 'mae'             # Default evaluation metric
}

# Train with the custom MAPE objective and evaluation metric
evals = [(dtrain, 'train'), (dval, 'validation')]

model = train(
    params,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
    obj=mape_objective,       # Custom MAPE objective function
    feval=mape_metric,        # Custom MAPE evaluation metric
    evals=evals,
    verbose_eval=True
)

# Predict on validation set
y_val_pred = model.predict(dval)

# Calculate MAPE on the validation set
mape = mean_absolute_percentage_error(y_val, y_val_pred)
print(f"Validation MAPE: {mape * 100:.2f}%")


[0]	train-mae:722.69487	train-mape:1739194.50000	validation-mae:724.73965	validation-mape:1716951.50000
[1]	train-mae:722.70662	train-mape:1545961.25000	validation-mae:724.75128	validation-mape:1526190.00000
[2]	train-mae:722.68643	train-mape:1352728.37500	validation-mae:724.73098	validation-mape:1335428.37500
[3]	train-mae:722.71305	train-mape:1159495.00000	validation-mae:724.75763	validation-mape:1144666.50000
[4]	train-mae:722.70909	train-mape:966316.18750	validation-mae:724.75341	validation-mape:953905.00000
[5]	train-mae:722.75522	train-mape:773083.37500	validation-mae:724.79959	validation-mape:763143.37500
[6]	train-mae:722.80136	train-mape:579850.31250	validation-mae:724.84577	validation-mape:572381.81250




[7]	train-mae:722.84749	train-mape:386617.34375	validation-mae:724.89195	validation-mape:381620.25000
[8]	train-mae:722.81842	train-mape:193384.20312	validation-mae:724.86274	validation-mape:190858.57812
[9]	train-mae:722.86456	train-mape:151.22472	validation-mae:724.90893	validation-mape:97.00474
[10]	train-mae:722.84823	train-mape:193327.98438	validation-mae:724.89244	validation-mape:190856.67188
[11]	train-mae:722.79853	train-mape:97.12917	validation-mae:724.84272	validation-mape:97.09620
[12]	train-mae:722.84820	train-mape:193265.35938	validation-mae:724.89240	validation-mape:190790.25000
[13]	train-mae:722.79820	train-mape:159.73828	validation-mae:724.84240	validation-mape:163.51363
[14]	train-mae:722.78187	train-mape:193265.25000	validation-mae:724.82592	validation-mape:190790.12500
[15]	train-mae:722.73187	train-mape:159.83391	validation-mae:724.77591	validation-mape:163.60661
[16]	train-mae:722.73300	train-mape:193265.09375	validation-mae:724.77693	validation-mape:190790.00000


In [None]:
# Define the MAPE scorer for GridSearchCV
def mape_scorer(y_true, y_pred):
    return mean_absolute_percentage_error(y_true, y_pred)

mape_scorer = make_scorer(mape_scorer, greater_is_better=False)  # Negative for minimization

# Initialize XGBRegressor
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Define parameter grid for tuning
param_grid = {
    'max_depth': [4, 6, 8],                  # Depth of trees
    'learning_rate': [0.01, 0.05, 0.1],     # Step size for gradient descent
    'n_estimators': [100, 300, 500],        # Number of boosting rounds
    'subsample': [.70,.8],                # Fraction of samples used for training each tree
    'colsample_bytree': [0.8, 1.0],         # Fraction of features used for training each tree
}

# Use GridSearchCV for parameter tuning
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=mape_scorer,   # Use custom MAPE scorer
    cv=3,                  # 3-fold cross-validation
    verbose=2,
    n_jobs=-1              # Use all available cores
)

# Perform the grid search
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Validation MAPE:", -grid_search.best_score_ * 100)  # Negative due to scorer inversion

# Train the best model on the full training set
best_model = grid_search.best_estimator_

# Predict on the validation set
y_val_pred = best_model.predict(X_val)

# Calculate final MAPE on validation set
final_mape = mean_absolute_percentage_error(y_val, y_val_pred)
print(f"Final Validation MAPE: {final_mape * 100:.2f}%")

In [70]:
train = train.drop(columns=['id'])
test = test.drop(columns=['id'])

In [71]:
# Initialize and fit the Prophet model
model = Prophet()

for each in ['country', 'store', 'product',  'month', 'year', 'season']:
    model.add_regressor(each)

model.fit(train)



11:53:09 - cmdstanpy - INFO - Chain [1] start processing
11:53:29 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x2880a8bf0>

In [72]:

# predict y given a df of x values
test_pred = model.predict(test)
test_pred

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,country,...,year,year_lower,year_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2017-01-01,775.063851,-207.989887,1530.479869,775.063851,775.063851,-122.136828,-122.136828,-122.136828,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,652.927023
1,2017-01-01,775.063851,-178.881652,1576.192522,775.063851,775.063851,-91.495778,-91.495778,-91.495778,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,683.568072
2,2017-01-01,775.063851,-129.568713,1559.728066,775.063851,775.063851,-60.854729,-60.854729,-60.854729,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,714.209122
3,2017-01-01,775.063851,-126.828562,1618.499773,775.063851,775.063851,-30.213679,-30.213679,-30.213679,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,744.850172
4,2017-01-01,775.063851,-123.431293,1636.022053,775.063851,775.063851,0.427371,0.427371,0.427371,-148.478579,...,112.630461,112.630461,112.630461,88.452206,88.452206,88.452206,0.0,0.0,0.0,775.491221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98545,2019-12-31,829.942270,170.182935,2040.892710,413.006966,1249.362726,211.032918,211.032918,211.032918,148.478579,...,37.563059,37.563059,37.563059,77.842639,77.842639,77.842639,0.0,0.0,0.0,1040.975188
98546,2019-12-31,829.942270,181.038838,2022.020384,413.000536,1249.376932,241.673967,241.673967,241.673967,148.478579,...,37.563059,37.563059,37.563059,77.842639,77.842639,77.842639,0.0,0.0,0.0,1071.616238
98547,2019-12-31,829.942270,177.204362,2079.265893,412.994107,1249.391138,272.315017,272.315017,272.315017,148.478579,...,37.563059,37.563059,37.563059,77.842639,77.842639,77.842639,0.0,0.0,0.0,1102.257287
98548,2019-12-31,829.942270,165.588790,2053.544006,412.987677,1249.405344,302.956067,302.956067,302.956067,148.478579,...,37.563059,37.563059,37.563059,77.842639,77.842639,77.842639,0.0,0.0,0.0,1132.898337


In [73]:
# non log submission 1 
sub_file['num_sold'] = test_pred['yhat']
# add timestamp to file name 
file_name = datetime.now().strftime('submissions/submission_%Y%m%d_%H%M.csv')
sub_file.to_csv(file_name, index=False)