In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv
/kaggle/input/tabular-playground-series-jan-2022/train.csv
/kaggle/input/tabular-playground-series-jan-2022/test.csv


In [2]:
import gc
import random

from IPython import display as ipd
from tqdm import tqdm
import xgboost as xgb

from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold

from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

import optuna 
from optuna.visualization.matplotlib import plot_optimization_history
from optuna.visualization.matplotlib import plot_param_importances

### Utils

In [3]:
def seeding(SEED, use_tf=False):
    np.random.seed(SEED)
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    os.environ['TF_CUDNN_DETERMINISTIC'] = str(SEED)
    if use_tf:
        tf.random.set_seed(SEED)
    print('seeding done!!!')
    
## https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298201
def SMAPE(y_true, y_pred):
    denominator = (y_true + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)    

### Data Load

In [4]:
RANDOM_SEED = 42
TUNING = False

seeding(RANDOM_SEED)

train = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/test.csv')
submission = pd.read_csv('/kaggle/input/tabular-playground-series-jan-2022/sample_submission.csv')

seeding done!!!


In [5]:
train.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


## Targets distribution display

Check my other notebooks: 

https://www.kaggle.com/vladlee/tps-jan2022-lgbm-optuna

https://www.kaggle.com/vladlee/tps-jan-2022-eda-baseline


### Date-based FE

In [6]:
def process_dates(df):
    df.date = pd.to_datetime(df.date)
    df['month'] = df.date.dt.month
    df['week'] = df.date.dt.week
    df['weekday'] = df.date.dt.weekday
    df['dayofweek'] = df.date.dt.dayofweek
    df['dayofyear'] = df.date.dt.dayofyear
    df['day'] = df.date.dt.day
    return df

train = process_dates(train)
test = process_dates(test)

  after removing the cwd from sys.path.


In [7]:
print(f'Train unique days: {train.day.unique().size}, test: {test.day.unique().size}')
print(f'Train unique weeks: {train.week.unique().size}, test: {test.week.unique().size}')
print(f'Train unique dayofweeks: {train.dayofweek.unique().size}, test: {train.dayofweek.unique().size}')
print(f'Train unique months: {train.month.unique().size}, test: {train.month.unique().size}')
print(f'Train unique dayofyear: {train.dayofyear.unique().size}, test: {train.dayofyear.unique().size}')

Train unique days: 31, test: 31
Train unique weeks: 53, test: 52
Train unique dayofweeks: 7, test: 7
Train unique months: 12, test: 12
Train unique dayofyear: 366, test: 366


In [8]:
target = train.num_sold
train.drop(['row_id','num_sold','date'], axis=1, inplace=True)
test.drop(['row_id', 'date'], axis=1, inplace=True)

### Encode category columns 

In [9]:
country_encoder = LabelEncoder()
train['country_enc'] = country_encoder.fit_transform(train['country'])
test['country_enc'] = country_encoder.transform(test['country'])

store_encoder = LabelEncoder()
train['store_enc'] = store_encoder.fit_transform(train['store'])
test['store_enc'] = store_encoder.transform(test['store'])

product_encoder = LabelEncoder()
train['product_enc'] = product_encoder.fit_transform(train['product'])
test['product_enc'] = product_encoder.transform(test['product'])

train.drop(['country','store','product'], axis=1, inplace=True)
test.drop(['country','store','product'], axis=1, inplace=True)

In [10]:
#for col in train.columns:
#    train[col] = pd.Categorical(train[col])
#for col in test.columns:
#    test[col] = pd.Categorical(test[col])

### Tune

In [11]:
NUM_BOOST_ROUND = 1000
EARLY_STOPPING_ROUNDS = 20
VERBOSE_EVAL = 100
    
def objective(trial, X, y):
    
    param_grid = {
        'verbosity': 1,
        'objective': 'reg:squarederror', 
        'eval_metric': 'rmse',
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1),
        'eta': trial.suggest_float('eta', 0.1, 0.9),
        'max_depth': trial.suggest_int('max_depth', 50, 500),     
        'min_child_weight': trial.suggest_float('min_child_weight', 10, 100),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
        'gamma': trial.suggest_float('gamma', 0, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'lambda': trial.suggest_float('lambda', 1, 10),
        'alpha': trial.suggest_float('alpha', 0, 9),
    }    
        
    X_train, X_valid, y_train, y_valid = train_test_split( X, y, test_size=0.25, random_state=RANDOM_SEED, shuffle=False)
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    model = xgb.train( param_grid, dtrain,
        num_boost_round = NUM_BOOST_ROUND,
        evals=[(dvalid, 'evals')], 
        verbose_eval = VERBOSE_EVAL,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS
    )   
    
    oof_pred = model.predict(dvalid)
    oof_score = SMAPE(y_valid, oof_pred) 
    print(f"OOF SMAPE: {oof_score}")
    return oof_score

In [12]:
N_TRIALS = 100

if TUNING:
    study = optuna.create_study(direction='minimize')
    objective_func = lambda trial: objective(trial, train, target)
    study.optimize(objective_func, n_trials=N_TRIALS)  # number of iterations

    print("Number of finished trials: {}".format(len(study.trials)))
    print("Best trial:")
    trial = study.best_trial
    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

### Model and train

In [13]:
def run_train(X, y, run_params, splits, num_boost_round, verbose_eval, early_stopping_rounds ):
    scores = []
    models = []
    folds = StratifiedKFold(n_splits=splits)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print(f'Fold {fold_n+1} started')
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)
        model = xgb.train( run_params, dtrain,
            num_boost_round = num_boost_round,
            evals=[(dvalid, 'evals')], 
            verbose_eval = verbose_eval,
            early_stopping_rounds=early_stopping_rounds
        )   

        oof_pred = model.predict(dvalid)
        oof_score = SMAPE(y_valid, oof_pred) 
        print(f"OOF SMAPE: {oof_score}")        
        
        models.append(model)
        scores.append(oof_score)
    return scores, models


NUM_BOOST_ROUND = 2000
EARLY_STOPPING_ROUNDS = 100
VERBOSE_EVAL = 100
TOTAL_SPLITS = 5
    
run_params = {
    'verbosity': 1,
    'objective': 'reg:squarederror', 
    'eval_metric': 'rmse',
    'learning_rate': 0.01729433116660487,
    'eta': 0.4954283685809021,
    'max_depth': 476,
    'min_child_weight': 12.875223150484498,
    'colsample_bytree': 0.7890238951483045,
    'gamma': 96.89423371529557,
    'subsample': 0.8862703289885544,
    'lambda': 8.869246442053491,
    'alpha': 4.132837689865073,
}

FEATURES = [col for col in train.columns if col.endswith('enc')]
scores, models = run_train(train, target, run_params, TOTAL_SPLITS, NUM_BOOST_ROUND, 
                                          VERBOSE_EVAL, EARLY_STOPPING_ROUNDS)

print('----------------------')
print(f'CV SMAPE mean score: {np.mean(scores)}, std: {np.std(scores)}.')
print('----------------------')

Fold 1 started
[0]	evals-rmse:461.54083




[100]	evals-rmse:102.70433
[200]	evals-rmse:56.21441
[300]	evals-rmse:54.81675
[379]	evals-rmse:54.69295
OOF SMAPE: 9.071375381058626
Fold 2 started
[0]	evals-rmse:462.46298
[100]	evals-rmse:107.77515
[200]	evals-rmse:55.45177
[300]	evals-rmse:50.82830
[400]	evals-rmse:48.48094
[500]	evals-rmse:46.64149
[600]	evals-rmse:44.34565
[700]	evals-rmse:42.07762
[800]	evals-rmse:40.43451
[900]	evals-rmse:39.42577
[1000]	evals-rmse:38.77135
[1100]	evals-rmse:38.27812
[1200]	evals-rmse:37.96509
[1300]	evals-rmse:37.68391
[1400]	evals-rmse:37.48998
[1500]	evals-rmse:37.32317
[1600]	evals-rmse:37.21749
[1700]	evals-rmse:37.13277
[1800]	evals-rmse:37.05945
[1900]	evals-rmse:36.98235
[1999]	evals-rmse:36.95383
OOF SMAPE: 5.963151857276151
Fold 3 started
[0]	evals-rmse:463.10703
[100]	evals-rmse:117.47646
[200]	evals-rmse:58.05267
[300]	evals-rmse:49.39125
[400]	evals-rmse:46.96753
[500]	evals-rmse:45.56939
[600]	evals-rmse:44.07444
[700]	evals-rmse:42.42603
[800]	evals-rmse:40.99860
[900]	evals-rmse

In [14]:
y_pred = np.zeros(len(test))
for model in models:
    y_pred += model.predict(xgb.DMatrix(test)).reshape(-1)
    
y_pred = y_pred / len(models)

In [15]:
submission['num_sold'] = np.round(y_pred).astype(int)
submission.to_csv('submission.csv', index=False, float_format='%.6f')
submission.head(20)

Unnamed: 0,row_id,num_sold
0,26298,392
1,26299,598
2,26300,182
3,26301,673
4,26302,1007
5,26303,328
6,26304,598
7,26305,898
8,26306,297
9,26307,1004
