### Sticker Sales Prediction - Log Transformed Target (XGBoost)

#### 1. Importing thge necessary libraries and reading the dataset

In [12]:
# Importing necessary libraries for data manipulation, visualization, and modeling
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb

# Importing additional libraries for preprocessing, model evaluation, and hyperparameter tuning
from itertools import product
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from xgboost import XGBRegressor

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Set inline display for matplotlib
%matplotlib inline

In [13]:
# Read datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sample_submission_df = pd.read_csv('sample_submission.csv')

# Display the first few rows
train_df.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [14]:
test_df.head()

Unnamed: 0,id,date,country,store,product
0,230130,2017-01-01,Canada,Discount Stickers,Holographic Goose
1,230131,2017-01-01,Canada,Discount Stickers,Kaggle
2,230132,2017-01-01,Canada,Discount Stickers,Kaggle Tiers
3,230133,2017-01-01,Canada,Discount Stickers,Kerneler
4,230134,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode


In [15]:
sample_submission_df.head()

Unnamed: 0,id,num_sold
0,230130,100
1,230131,100
2,230132,100
3,230133,100
4,230134,100


#### 3. Preprocessing - Evaluating Various Techniques

In [5]:
def preprocess_and_evaluate_with_hyperparameter_tuning(train, test):
    """
    Preprocess data using various strategies, perform hyperparameter tuning with XGBoost,
    and select the best preprocessing pipeline based on MAPE score on validation data.

    Parameters:
    - train (pd.DataFrame): The training dataset with features and target variable.
    - test (pd.DataFrame): The test dataset for which predictions are made.

    Returns:
    - best_X_train: Processed training features of the best combination.
    - best_X_val: Processed validation features of the best combination.
    - best_y_train: Training labels.
    - best_y_val: Validation labels.
    - best_X_test: Processed test features.
    - best_model: The best tuned XGBoost model.
    - best_preprocessing: The best preprocessing strategy combination.
    """

    # Options for preprocessing
    imputation_strategies = ['mean', 'median', None]  # Added None for dropping missing rows
    encoding_strategies = ['label', 'onehot']
    scaling_strategies = ['standard', 'minmax', None]

    # Hyperparameter grid for XGBoost
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }

    # Keep track of the best combination
    best_mape = float('inf')
    best_preprocessing = None
    best_X_train, best_X_val, best_y_train, best_y_val = None, None, None, None
    best_X_test = None
    best_model = None
    best_log_transformed = None

    # Split train into training and validation sets
    train, val = train_test_split(train, test_size=0.2, random_state=42)

    # Drop 'id' column from all datasets
    train = train.drop(columns=['id'])
    val = val.drop(columns=['id'])
    test = test.drop(columns=['id'])

    # Iterate over preprocessing options
    for imputation in imputation_strategies:
        for encoding in encoding_strategies:
            for scaling in scaling_strategies:
                for log_transform in [True, False]:
                    print(f"Trying combination: Imputation={imputation}, Encoding={encoding}, Scaling={scaling}, Log_Transform={log_transform}")

                    # Create copies of the datasets
                    train_copy = train.copy()
                    val_copy = val.copy()
                    test_copy = test.copy()

                    # Handle missing values in `num_sold`
                    if imputation is None:
                        # Drop rows with missing target values
                        train_copy = train_copy.dropna(subset=['num_sold'])
                        val_copy = val_copy.dropna(subset=['num_sold'])
                    else:
                        # Impute missing values
                        imputer = SimpleImputer(strategy=imputation)
                        train_copy['num_sold'] = imputer.fit_transform(train_copy[['num_sold']])
                        val_copy['num_sold'] = imputer.transform(val_copy[['num_sold']])

                    # Apply log transformation to the target variable if required
                    if log_transform:
                        train_copy['num_sold'] = np.log1p(train_copy['num_sold'])
                        val_copy['num_sold'] = np.log1p(val_copy['num_sold'])

                    # Convert date to datetime and extract features
                    for df in [train_copy, val_copy, test_copy]:
                        df['date'] = pd.to_datetime(df['date'])
                        df['year'] = df['date'].dt.year
                        df['month'] = df['date'].dt.month
                        df['day'] = df['date'].dt.day
                        df['weekday'] = df['date'].dt.weekday
                        df['is_weekend'] = df['weekday'].apply(lambda x: 1 if x >= 5 else 0)
                        df.drop(columns=['date'], inplace=True)

                    # Separate features and target
                    feature_cols = train_copy.columns.difference(['num_sold'])
                    X_train = train_copy[feature_cols]
                    y_train = train_copy['num_sold']
                    X_val = val_copy[feature_cols]
                    y_val = val_copy['num_sold']
                    X_test = test_copy[feature_cols]

                    # Encoding categorical variables
                    cat_cols = ['country', 'store', 'product']
                    if encoding == 'label':
                        encoder = LabelEncoder()
                        for col in cat_cols:
                            X_train[col] = encoder.fit_transform(X_train[col])
                            X_val[col] = encoder.transform(X_val[col])
                            X_test[col] = encoder.transform(X_test[col])
                    elif encoding == 'onehot':
                        transformer = ColumnTransformer(
                            transformers=[('onehot', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
                            remainder='passthrough'
                        )
                        X_train = pd.DataFrame(
                            transformer.fit_transform(X_train),
                            columns=transformer.get_feature_names_out()
                        )
                        X_val = pd.DataFrame(
                            transformer.transform(X_val),
                            columns=transformer.get_feature_names_out()
                        )
                        X_test = pd.DataFrame(
                            transformer.transform(X_test),
                            columns=transformer.get_feature_names_out()
                        )

                    # Feature scaling
                    if scaling == 'standard':
                        scaler = StandardScaler()
                        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
                        X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
                        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
                    elif scaling == 'minmax':
                        scaler = MinMaxScaler()
                        X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
                        X_val = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
                        X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

                    # Hyperparameter tuning using RandomizedSearchCV
                    xgb_model = XGBRegressor(
                        tree_method='gpu_hist',
                        predictor='gpu_predictor',
                        gpu_id=0,
                        random_state=42
                    )

                    random_search = RandomizedSearchCV(
                        estimator=xgb_model,
                        param_distributions=param_grid,
                        n_iter=20,
                        scoring='neg_mean_absolute_percentage_error',
                        cv=3,
                        verbose=2,
                        random_state=42
                    )

                    random_search.fit(X_train, y_train)
                    tuned_model = random_search.best_estimator_

                    # Evaluate the model on the validation set
                    y_pred = tuned_model.predict(X_val)

                    # If log transformation was applied, reverse it for MAPE calculation
                    if log_transform:
                        y_val = np.expm1(y_val)
                        y_pred = np.expm1(y_pred)

                    mape = mean_absolute_percentage_error(y_val, y_pred)
                    print(f"Combination: {mape:.4f} MAPE")

                    # Check if this is the best preprocessing combination
                    if mape < best_mape:
                        best_mape = mape
                        best_preprocessing = (imputation, encoding, scaling, log_transform)
                        best_X_train, best_X_val, best_y_train, best_y_val = X_train, X_val, y_train, y_val
                        best_X_test = X_test
                        best_model = tuned_model
                        best_log_transformed = log_transform

    print(f"\nBest preprocessing: {best_preprocessing} with MAPE = {best_mape:.4f}")
    return best_X_train, best_X_val, best_y_train, best_y_val, best_X_test, best_model, best_preprocessing

In [6]:
# Perform preprocessing optimization and hyperparameter tuning
best_X_train, best_X_val, best_y_train, best_y_val, best_X_test, best_model, best_preprocessing = preprocess_and_evaluate_with_hyperparameter_tuning(train_df, test_df)

Trying combination: Imputation=mean, Encoding=label, Scaling=standard, Log_Transform=True
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, n_estimators=300, subsample=0.6; total time=   1.4s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, n_estimators=300, subsample=0.6; total time=   1.1s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=7, n_estimators=300, subsample=0.6; total time=   1.0s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.6; total time=   0.4s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.6; total time=   0.4s
[CV] END colsample_bytree=0.6, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.6; total time=   0.4s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=300, subsample=1.0; total time=   0.6s
[CV] END colsample_bytree=0.8, learnin

#### 4. Building Model and Prediction

In [7]:
# Evaluate the final tuned model on the validation set
y_pred = best_model.predict(best_X_val)
final_mape = mean_absolute_percentage_error(best_y_val, y_pred)
print(f"Final Model MAPE: {final_mape}")

Final Model MAPE: 0.9614962972922296


In [8]:
# Predict on the test set
test_df['num_sold'] = best_model.predict(best_X_test)

In [9]:
# Create submission file
submission = test_df[['id', 'num_sold']]
submission.to_csv('submission.csv', index=False)