In [1]:
from data_pipeline import data_transform_pipeline, update_time_features, cap_sales
import pandas as pd

# Load the raw training table from disk.
data = pd.read_csv('data/train.csv')

# Fit the imported preprocessing pipeline on the training data and keep the
# transformed result; later cells derive features and the target from it.
X = data_transform_pipeline.fit_transform(data)

In [2]:
# Derive the time-dependent features from the transformed training frame.
# NOTE(review): presumably this is where the lagged_sales_* / rolling_mean_*
# columns seen in the preview come from -- confirm in data_pipeline.
X_update = update_time_features(X)

In [3]:
# Preview the last four rows to sanity-check the engineered columns.
X_update.tail(4)

Unnamed: 0,date,store_nbr,sales,onpromotion,month,day,year,day_of_week,is_weekend,cluster,...,state_Tungurahua,type_B,type_C,type_D,type_E,family,lagged_sales_1,lagged_sales_2,rolling_mean_14_days,rolling_mean_28_days
3000884,2017-08-15,9,154.553,1,8,15,2017,1,0,6,...,False,True,False,False,False,PREPARED FOODS,114.12,105.169,120.867428,104.266357
3000885,2017-08-15,9,2419.729,148,8,15,2017,1,0,6,...,False,True,False,False,False,PRODUCE,1348.425,1693.607,1606.1055,1586.302607
3000886,2017-08-15,9,121.0,8,8,15,2017,1,0,6,...,False,True,False,False,False,SCHOOL AND OFFICE SUPPLIES,182.0,200.0,150.071429,87.464286
3000887,2017-08-15,9,16.0,0,8,15,2017,1,0,6,...,False,True,False,False,False,SEAFOOD,17.0,20.0,19.001929,17.715036


In [4]:
# Keep only rows dated after 2013-01-28, then build the training target by
# passing the remaining sales values through cap_sales.
# NOTE(review): the cutoff presumably skips the warm-up period of the 28-day
# rolling mean -- confirm.
X_update = X_update.loc[X_update["date"] > "2013-01-28"]
y_update = cap_sales(X_update.sales)

In [5]:
# Feature matrix: everything except the identifier columns and the target.
X_dropped = X_update.drop(['date', 'store_nbr', 'family', 'sales'], axis='columns')
# Print the column/dtype summary of the feature matrix.
X_dropped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2950992 entries, 924 to 3000887
Data columns (total 85 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   onpromotion                           int64  
 1   month                                 int32  
 2   day                                   int32  
 3   year                                  int32  
 4   day_of_week                           int32  
 5   is_weekend                            int64  
 6   cluster                               int64  
 7   is_holiday                            bool   
 8   oil_price                             float64
 9   family_BABY CARE                      bool   
 10  family_BEAUTY                         bool   
 11  family_BEVERAGES                      bool   
 12  family_BOOKS                          bool   
 13  family_BREAD/BAKERY                   bool   
 14  family_CELEBRATION                    bool   
 15  family_CLEANING   

In [6]:
import mlflow

# Send all runs to the MLflow tracking server at this address.
mlflow.set_tracking_uri("http://localhost:8080")

# Select the experiment that subsequent runs are recorded under; as the last
# expression of the cell, the returned Experiment is displayed.
mlflow.set_experiment("Time Series Sales")

<Experiment: artifact_location='mlflow-artifacts:/463103012251617584', creation_time=1711893278784, experiment_id='463103012251617584', last_update_time=1711893278784, lifecycle_stage='active', name='Time Series Sales', tags={}>

In [12]:
import mlflow
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from xgboost import XGBRegressor
from datetime import datetime
import mlflow
import lightgbm as lgb

# Ordered, forward-chaining folds so validation data always follows training data.
tscv = TimeSeriesSplit(n_splits=3)

# Timestamp shared by the MLflow run name and the saved model's file name.
date = datetime.now().strftime("%Y%m%d-%H%M%S")

with mlflow.start_run(run_name=f"Train {date}"):

    # Candidate models: display name -> (estimator, hyper-parameter grid).
    models = {
        # 'XGBoost' : (XGBRegressor(), {'n_estimators': [50, 100, 200], 'max_depth': [2, 3, 4], 'learning_rate': [0.15, 0.1, 0.01, 0.001]}),
        'XGBoost' : (XGBRegressor(), {'n_estimators': [50], 'max_depth': [3], 'learning_rate': [0.15]}),
        # 'XGBoost2' : (XGBRegressor(), {'n_estimators': [50, 100], 'max_depth': [3], 'learning_rate': [0.1]}),
        # 'LightGBM': (lgb.LGBMRegressor(), {'n_estimators': [50, 100, 150, 300], 'max_depth': [2, 3, 4], 'learning_rate': [0.1, 0.01, 0.001]})
    }

    # Perform GridSearchCV for each model
    results = {}
    for model_name, (model, param_grid) in models.items():
        with mlflow.start_run(run_name=model_name, nested=True):  # Start a separate MLflow run for each model
            grid_search = GridSearchCV(model, param_grid, cv=tscv, scoring='neg_mean_squared_error')
            grid_search.fit(X_dropped, y_update)

            # The scorer is *negative* MSE; flip the sign so the logged MSE is
            # a true (non-negative) error and the RMSE is its square root.
            cv_mse = -grid_search.best_score_

            # Log parameters and metrics to MLflow
            mlflow.log_params(grid_search.best_params_)
            mlflow.log_metric("cross_validation_mse", cv_mse)
            mlflow.log_metric("cross_validation_rmse", cv_mse ** 0.5)
            mlflow.log_param("Model Name", model_name)
            mlflow.set_tag("Model Name", model_name)

            results[model_name] = {
                'Best Parameters': grid_search.best_params_,
                # Kept in sklearn's "higher is better" convention (negative MSE).
                'Best Cross-Validation Score': grid_search.best_score_,
                # With the default refit=True this estimator carries the best
                # parameters and has already been refit on the full data.
                'Best Estimator': grid_search.best_estimator_,
            }

    for model_name, result in results.items():
        print(f"Model: {model_name}")
        print(f"Best Parameters: {result['Best Parameters']}")
        print(f"Best Cross-Validation Score: {result['Best Cross-Validation Score']}\n")

    # Select the best model based on cross-validation score
    best_model_name = max(results, key=lambda x: results[x]['Best Cross-Validation Score'])

    # Use the tuned, already-refit estimator. Taking models[best_model_name][0]
    # and fitting it again would train the estimator with its *default*
    # hyper-parameters and silently discard the grid-search result.
    best_model = results[best_model_name]['Best Estimator']

    # Persist the model locally and attach it to the parent MLflow run.
    model_path = f"models/{best_model_name}-{date}.pkl"
    mlflow.sklearn.save_model(best_model, model_path)
    mlflow.sklearn.log_model(best_model, "model")

    # # Evaluate the best model on the test data
    # test_score = best_model.score(X_test, y_test)
    # print(f"Test Accuracy of the Best Model ({best_model_name}): {test_score}")

    # # Log the test accuracy to MLflow
    # mlflow.log_metric("test_accuracy", test_score)


Model: XGBoost
Best Parameters: {'learning_rate': 0.15, 'max_depth': 3, 'n_estimators': 50}
Best Cross-Validation Score: -63598.84540178825





In [55]:
X_test = pd.read_csv('data/test.csv')
# Apply the pipeline as it was fitted on the training data. Calling
# fit_transform here would re-fit it on the test set, so any learned state
# (encodings, scalers, ...) could differ from what the model was trained on.
# NOTE(review): assumes data_transform_pipeline exposes transform() like a
# scikit-learn Pipeline -- confirm in data_pipeline.
X_test = data_transform_pipeline.transform(X_test)
# Placeholder target column: the prediction loop drops 'sales' together with
# the identifier columns, so it has to exist on the test frame.
X_test['sales'] = 0

In [56]:
from datetime import date, timedelta, datetime

def daterange(start_date, end_date):
    """Yield consecutive days from ``start_date`` up to, but excluding, ``end_date``.

    Nothing is yielded when ``end_date`` is not after ``start_date`` by at
    least one whole day.
    """
    day_count = (end_date - start_date).days
    yield from (start_date + timedelta(days=offset) for offset in range(day_count))


# First and last calendar day of the prediction window.
start_date, end_date = datetime(2017, 8, 16), datetime(2017, 8, 31)

In [11]:
# Load a previously saved scikit-learn-flavour MLflow model from a fixed path.
# NOTE(review): the path is hard-coded to one timestamped artifact rather than
# the model_path written by the training cell, and this cell's execution count
# (11) precedes the training cell's (12) -- on a fresh "Run All" this loads the
# older artifact, not the model just trained. Confirm that is intended.
model = mlflow.sklearn.load_model(r'models/XGBoost-20240331-123334.pkl')

In [57]:
# Training rows for the day immediately before the prediction window starts.
last_day = X_update.loc[X_update.date == start_date - timedelta(days=1)]

In [97]:
import numpy as np
# Lag / rolling-window columns whose values are copied from the last training day.
lag_columns = ['lagged_sales_1', 'lagged_sales_2', 'rolling_mean_14_days', 'rolling_mean_28_days']
# These values do not change between iterations, so extract them once.
# NOTE(review): every forecast day reuses the *same* last-training-day lag
# values; they are never rolled forward with the new predictions, and the
# positional assignment below assumes last_day and each day's test rows share
# the same row order and row count -- confirm both.
last_day_lags = last_day[lag_columns].values

daily_predictions = []
# end_date is extended by one day because daterange excludes its end point.
for day in daterange(start_date, end_date + timedelta(1)):
    # Explicit copy: assigning into a boolean-filtered slice of X_test
    # triggers pandas' SettingWithCopyWarning and has ambiguous semantics.
    X_temp = X_test[X_test.date == day].copy()
    X_temp.loc[:, lag_columns] = last_day_lags
    X_temp_dropped = X_temp.drop(columns=['date', 'store_nbr', 'family', 'sales'])
    y_pred = model.predict(X_temp_dropped)
    daily_predictions.append(y_pred)

# Concatenate once instead of re-allocating the array every iteration; the
# empty float seed keeps the float64 dtype and the empty-window result.
predictions = np.concatenate([np.array([], dtype=float), *daily_predictions])

In [99]:
from datetime import datetime

# Re-read the raw test file to recover the competition row ids.
X_competition = pd.read_csv('data/test.csv')
competition_indexes = X_competition['id']
# Timestamped file name so successive submissions do not overwrite each other.
path = fr'submissions/{datetime.now().strftime("submission_%Y-%m-%d_%H-%M-%S")}.csv'
# Two-column frame (id, sales) written without the pandas index.
submission = competition_indexes.to_frame(name='id').assign(sales=predictions)
submission.to_csv(path, index=False)