In [1]:
import sys
print(sys.version)

3.10.12 (main, Sep 11 2024, 15:47:36) [GCC 11.4.0]


In [2]:
# Completely remove existing packages
!pip uninstall -y scikit-learn autogluon.tabular autogluon.core autogluon.features autogluon.common autogluon mxnet torch torchvision

# Install specific versions known to work together
!pip install --no-cache-dir scikit-learn==1.2.2
!pip install --no-cache-dir mxnet==1.9.1 torch==1.13.1 torchvision==0.14.1
!pip install --no-cache-dir autogluon==0.7.0

Found existing installation: scikit-learn 1.2.2
Uninstalling scikit-learn-1.2.2:
  Successfully uninstalled scikit-learn-1.2.2
[0mFound existing installation: torch 2.4.1+cu121
Uninstalling torch-2.4.1+cu121:
  Successfully uninstalled torch-2.4.1+cu121
Found existing installation: torchvision 0.19.1+cu121
Uninstalling torchvision-0.19.1+cu121:
  Successfully uninstalled torchvision-0.19.1+cu121
Collecting scikit-learn==1.2.2
  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m152.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

In [3]:
# --------------------------------------------------------
# Import Libraries
# --------------------------------------------------------
import pandas as pd
import numpy as np
import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_percentage_error
from autogluon.tabular import TabularPredictor
import joblib

# --------------------------------------------------------
# Options
# --------------------------------------------------------

TIME_LIMIT_FOLD = 3600 * 0.25
TIME_LIMIT = 3600 * 5

# --------------------------------------------------------
# Load Data
# --------------------------------------------------------
train_data = pd.read_csv('/kaggle/input/playground-series-s5e1/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s5e1/test.csv')
sample_submission = pd.read_csv('/kaggle/input/playground-series-s5e1/sample_submission.csv')

# --------------------------------------------------------
# Feature Engineering
# --------------------------------------------------------
train_data['date'] = pd.to_datetime(train_data['date'])
test_data['date'] = pd.to_datetime(test_data['date'])

# Drop rows with missing target
train_data = train_data.dropna(subset=['num_sold'])
print("Train shape after dropping missing target:", train_data.shape)

# Create date-based features
for df in [train_data, test_data]:
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['dayofweek'] = df['date'].dt.dayofweek

# --------------------------------------------------------
# Sort Training Data by Date
# --------------------------------------------------------
train_data_sorted = train_data.sort_values(by='date').reset_index(drop=True)

# --------------------------------------------------------
# Split Features & Target
# --------------------------------------------------------
# Define feature columns (excluding 'id', 'date', 'num_sold')
feature_cols = [col for col in train_data_sorted.columns if col not in ['id', 'date', 'num_sold']]
X_sorted = train_data_sorted[feature_cols]
y_sorted = train_data_sorted['num_sold']

# Prepare Test Features
X_test = test_data.drop(columns=['id', 'date'])

# --------------------------------------------------------
# Generate Timestamp
# --------------------------------------------------------
timestamp_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# --------------------------------------------------------
# Time-based Cross Validation with OOF Predictions
# --------------------------------------------------------
# Initialize OOF predictions array
oof_predictions = np.zeros(len(train_data_sorted))
tscv = TimeSeriesSplit(n_splits=5)
scores = []

for fold, (train_index, valid_index) in enumerate(tscv.split(X_sorted), 1):
    # Split data
    X_train_cv, X_valid_cv = X_sorted.iloc[train_index], X_sorted.iloc[valid_index]
    y_train_cv, y_valid_cv = y_sorted.iloc[train_index], y_sorted.iloc[valid_index]
    
    # Combine X and y for AutoGluon
    train_cv = X_train_cv.copy()
    train_cv['num_sold'] = y_train_cv
    valid_cv = X_valid_cv.copy()
    valid_cv['num_sold'] = y_valid_cv
    
    # Initialize AutoGluon Predictor
    predictor = TabularPredictor(label='num_sold', problem_type='regression').fit(
        train_data=train_cv,
        presets='best_quality',  # You can choose 'medium_quality' or other presets
        verbosity=0,
        time_limit=TIME_LIMIT_FOLD
    )
    
    # Predict on validation set
    preds = predictor.predict(valid_cv)
    
    # Store OOF predictions
    oof_predictions[valid_index] = preds
    
    # Compute MAPE
    mape = mean_absolute_percentage_error(y_valid_cv, preds)
    scores.append(mape)
    print(f"Fold {fold} MAPE: {mape:.2%}")

print("TimeSeriesSplit MAPE Scores:", scores)
print("Average MAPE:", np.mean(scores))

# --------------------------------------------------------
# Save OOF Predictions
# --------------------------------------------------------
oof_df = pd.DataFrame({
    'id': train_data_sorted['id'],
    'oof_num_sold': oof_predictions
})

oof_filename = f"oof_predictions_m05_{timestamp_str}.csv"
oof_df.to_csv(oof_filename, index=False)
print(f"OOF predictions saved as {oof_filename}")

# --------------------------------------------------------
# Train on Full Dataset & Predict on Test
# --------------------------------------------------------
# Combine X and y for full training
full_train = X_sorted.copy()
full_train['num_sold'] = y_sorted

# Initialize and train the predictor on the full dataset
final_predictor = TabularPredictor(label='num_sold', problem_type='regression').fit(
    train_data=full_train,
    presets='best_quality',
    verbosity=0,
    time_limit=TIME_LIMIT
)

# Predict on test data
test_preds = final_predictor.predict(X_test)

# --------------------------------------------------------
# Save Trained Model
# --------------------------------------------------------
model_filename = f"model_05_{timestamp_str}.pkl"
final_predictor.save(model_filename)
print(f"Trained model saved as {model_filename}")

# --------------------------------------------------------
# Submission
# --------------------------------------------------------
submission = pd.DataFrame({
    'id': test_data['id'],
    'num_sold': test_preds
})

submission_filename = f"sub_m05_{timestamp_str}.csv"
submission.to_csv(submission_filename, index=False)
print(f"Submission saved as {submission_filename}")

No path specified. Models will be saved in: "AutogluonModels/ag-20250102_172429/"


Train shape after dropping missing target: (221259, 6)


  bool = onp.bool
Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x7a3b64fcae60>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.10/dist-packages/threadpoolctl.py", line 1175, in _make_controller_from_path
    lib_controller = controller_class(
  File "/usr/local/lib/python3.10/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
  File "/usr/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /usr/local/lib/python3.10/dist-packages/numpy.libs/libopenblas64_p-r0-0cf96a72.3.23.dev.so: cannot open shared object file: No such file or directory
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor

Fold 1 MAPE: 10.56%


  bool = onp.bool
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.

Fold 2 MAPE: 8.67%


  bool = onp.bool
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.

Fold 3 MAPE: 7.49%


  bool = onp.bool
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.

Fold 4 MAPE: 12.55%


  bool = onp.bool
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.

Fold 5 MAPE: 10.59%
TimeSeriesSplit MAPE Scores: [0.10557342739937137, 0.08670459396708857, 0.0748667714046865, 0.12548351798558097, 0.10587422935662702]
Average MAPE: 0.0997005080226709


No path specified. Models will be saved in: "AutogluonModels/ag-20250102_185900/"


OOF predictions saved as oof_predictions_m05_20250102_172429.csv


  bool = onp.bool
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.fit: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.

Trained model saved as model_05_20250102_172429.pkl
Submission saved as sub_m05_20250102_172429.csv
