In [1]:
import pandas as pd
import numpy as np
import logging

# notebook only
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


filename="../data/data.csv"
df = pd.read_csv(filename)
df.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [2]:
missing = sum(df.isna().sum())
logging.info(f"Missing elements: {missing}")

INFO:root:Missing elements: 0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

In [4]:
cols = df.columns
object_cols = []
for i in cols:
    col = df[i]
    if col.dtype == "object":
        object_cols.append(i)

logging.info(f"The columns which contain objects are: {', '.join(object_cols)}")
logging.info("These columns will be encoded")

INFO:root:The columns which contain objects are: Gender, Item Purchased, Category, Location, Size, Color, Season, Subscription Status, Shipping Type, Discount Applied, Promo Code Used, Payment Method, Frequency of Purchases
INFO:root:These columns will be encoded


In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
logging.info("Encoding.......")
for col in object_cols:
    df[col] = le.fit_transform(df[col])
logging.info("Encoding complete")

INFO:root:Encoding.......
INFO:root:Encoding complete


In [6]:
from sklearn.model_selection import train_test_split
import os
logging.info("")
processed_data = pd.read_csv("../data/processed_data.csv")
target_column = 'Frequency of Purchases'
X = processed_data.loc[:, processed_data.columns != target_column]
y = processed_data.loc[:, processed_data.columns == target_column]
directory = '../data/splits'

if not os.path.exists(directory):
    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully!")
else:
    print(f"Directory '{directory}' already exists.")    

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,stratify = y, random_state=47)
    
np.save(f'../data/splits/X_train.npy', X_train)
np.save(f'../data/splits/X_test.npy', X_test)
np.save(f'../data/splits/y_train.npy', y_train)
np.save(f'../data/splits/y_test.npy', y_test)

INFO:root:


Directory '../data/splits' already exists.


In [7]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from aim import Run
import pickle
import optuna
import os

X_train = np.load('../data/splits/X_train.npy')
y_train = np.load('../data/splits/y_train.npy')

def objective(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'booster': 'gbtree',
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'eta': trial.suggest_float('eta', 1e-3, 0.5),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'random_state': 42,
        'n_jobs': -1
    }

    X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    dtrain = xgb.DMatrix(X_train_split, label=y_train_split)
    dval = xgb.DMatrix(X_val, label=y_val)

    bst = xgb.train(params, dtrain, num_boost_round=trial.suggest_int('n_estimators', 100, 1000, step=100), evals=[(dval, 'eval')], early_stopping_rounds=10, verbose_eval=False)

    y_pred = bst.predict(dval)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    run = Run()
    run["hparams"] = {k: v for k, v in params.items() if k != 'eval_metric'}
    run.track(rmse, name='RMSE', context={"subset": "validation"})

    return rmse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

best_params = study.best_params
print("Best hyperparameters:", best_params)

logging.info("Training model with best hyperparams")
dtrain = xgb.DMatrix(X_train, label=y_train)
logging.info("Training model with best hyperparams")
best_bst = xgb.train(best_params, dtrain, num_boost_round=best_params['n_estimators'])

logging.info("Saving model into the model folder")
model_directory = '../model'
if not os.path.exists(model_directory):
    os.makedirs(model_directory)

model_filename = os.path.join(model_directory, 'best_xgb_model.pkl')
with open(model_filename, 'wb') as file:
    pickle.dump(best_bst, file)

  from .autonotebook import tqdm as notebook_tqdm
[I 2023-11-30 12:28:04,730] A new study created in memory with name: no-name-49cca231-e8d1-48b8-840b-cf30193e73c9
DEBUG:filelock:Attempting to acquire lock 140619400903696 on /run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/locks/afee3d79904c4165a8222fce.softlock
DEBUG:filelock:Lock 140619400903696 acquired on /run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/locks/afee3d79904c4165a8222fce.softlock
DEBUG:aim.storage.rockscontainer:opening /run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/meta/chunks/afee3d79904c4165a8222fce as aimrocks db
DEBUG:aim.sdk.run:Opening Run afee3d79904c4165a8222fce in write mode
INFO:aim.sdk.reporter:creating RunStatusReporter for afee3d79904c4165a8222fce
DEBUG:aim.sdk.reporter:polling for check-ins in PosixPath('/run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/check_ins')
DEBUG:aim.sdk.reporter:no check-in found for afee3d79904c4165a8222fce; returning zero-check-in
D

Best hyperparameters: {'lambda': 0.89712856040191, 'alpha': 0.7860202407676028, 'max_depth': 3, 'eta': 0.08372601470293096, 'gamma': 0.20883618518514876, 'colsample_bytree': 0.446409147196833, 'min_child_weight': 8, 'subsample': 0.5237286102774871, 'n_estimators': 500}


In [None]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pickle
# from aim import Run

X_test = np.load('../data/splits/X_test.npy')
y_test = np.load('../data/splits/y_test.npy')

model_directory = '../model'
model_filename = os.path.join(model_directory, 'best_xgb_model.pkl')

with open(model_filename, 'rb') as file:
    best_bst = pickle.load(file)

dtest = xgb.DMatrix(X_test, label=y_test)
y_pred = best_bst.predict(dtest)

test_rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Test RMSE: {test_rmse}")

run = Run()
run.track(test_rmse, name='Test_RMSE', context={"subset": "test"})

DEBUG:filelock:Attempting to acquire lock 140619908512144 on /run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/locks/a5314fa415ce4f76a7fab990.softlock
DEBUG:filelock:Lock 140619908512144 acquired on /run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/locks/a5314fa415ce4f76a7fab990.softlock
DEBUG:aim.storage.rockscontainer:opening /run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/meta/chunks/a5314fa415ce4f76a7fab990 as aimrocks db


Test RMSE: 2.1221253491519043


DEBUG:aim.sdk.run:Opening Run a5314fa415ce4f76a7fab990 in write mode
INFO:aim.sdk.reporter:creating RunStatusReporter for a5314fa415ce4f76a7fab990
DEBUG:aim.sdk.reporter:polling for check-ins in PosixPath('/run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/check_ins')
DEBUG:aim.sdk.reporter:no check-in found for a5314fa415ce4f76a7fab990; returning zero-check-in
DEBUG:aim.sdk.reporter:no leftover check-in found. starting from zero
INFO:aim.sdk.reporter:starting from: {}
DEBUG:aim.sdk.reporter:incrementing starting idx -> 1
DEBUG:aim.sdk.reporter:scheduled TimedTask(when=0, flag_name='starting', overwritten=False) ASAP because no physical check-in was found
INFO:aim.sdk.reporter:starting writer thread for <aim.sdk.reporter.RunStatusReporter object at 0x7fe4783355d0>
DEBUG:aim.sdk.reporter:no interesting things to do, sleeping for 0
DEBUG:aim.sdk.reporter:notifying <aim.sdk.reporter.RunStatusReporter object at 0x7fe4783355d0>
DEBUG:aim.sdk.reporter:until woken up
DEBUG:aim.sdk.re

DEBUG:aim.sdk.reporter:incrementing progress idx -> 3
DEBUG:aim.sdk.reporter:scheduled TimedTask(when=0, flag_name='progress', overwritten=False) ASAP because no physical check-in was found
DEBUG:aim.sdk.reporter:time remaining: -4083.252217031
DEBUG:aim.sdk.reporter:only -4083.252217031 remaining... flushing one task
DEBUG:aim.sdk.reporter:calibrated check-in: CheckIn(idx=3, expect_next_in=30, flag_name='progress') -> CheckIn(idx=3, expect_next_in=30, flag_name='progress')
DEBUG:aim.sdk.reporter.file_manager:touching check-in: /run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/check_ins/a5314fa415ce4f76a7fab990-0000000000000003-progress-1701327562.13-00030
DEBUG:aim.sdk.reporter.file_manager:found 1 check-ins:
DEBUG:aim.sdk.reporter.file_manager:the acting one: /run/media/veer/613A1DEC519A152F/kf-pipeline/notebooks/.aim/check_ins/a5314fa415ce4f76a7fab990-0000000000000003-progress-1701327562.13-00030
DEBUG:aim.sdk.reporter.file_manager:check-in /run/media/veer/613A1DEC519A152F