In [1]:
from datetime import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import mlframework
from mlframework import MLFramework


class ShoppingPreprocessing:
    """Turn raw shopping-session logs into a session-level modelling table.

    Intended call order: ``load_raw`` -> ``encode_label`` -> ``gen_feat_time``
    and/or ``gen_feat_product`` -> ``gen_data``.

    Attributes:
        raw: DataFrame with columns sessionid, start, end, productid, gender
            (plus ``male`` once ``encode_label`` has run).
        label: Series of the encoded ``male`` label indexed by sessionid.
        feature: DataFrame of generated features indexed by sessionid.
    """

    # Column names given to the four levels of a productid.
    PRODUCT_LEVELS = ('level0', 'level1', 'level2', 'level3')
    # A productid seen in fewer than MIN_SESSIONS sessions is ignored; one seen
    # in more than COMMON_SESSIONS sessions is always kept (see keep_product).
    MIN_SESSIONS = 10
    COMMON_SESSIONS = 100

    def __init__(self):
        self.raw = None
        self.label = None
        self.feature = None

    def load_raw(self, feature_path: str, label_path: str):
        """Read the feature and label CSV files and store them in ``self.raw``.

        Column names are supplied here, so pandas treats both files as having
        no header row. Labels are attached to features by row position.

        Args:
            feature_path (str): CSV with sessionid, start, end, productid.
            label_path (str): CSV with a single gender column.
        """
        feature = pd.read_csv(feature_path, names=['sessionid', 'start', 'end', 'productid'])
        label = pd.read_csv(label_path, names=['gender'])
        self.raw = feature.assign(gender=label['gender'])

    def encode_label(self):
        """Add a numeric ``male`` column (male=1, female=0) and set ``self.label``.

        ``map`` is used instead of ``replace`` so the result is numeric without
        relying on pandas' deprecated silent downcasting in ``replace``. Any
        gender value other than 'male'/'female' becomes NaN.
        """
        gender_dict = {'male': 1, 'female': 0}
        self.raw = self.raw.assign(male=lambda df: df['gender'].map(gender_dict))
        self.label = self.raw.set_index('sessionid')['male']

    def _add_features(self, new_features: pd.DataFrame) -> None:
        """Attach ``new_features`` to ``self.feature`` column-wise.

        Columns that already exist are replaced rather than duplicated, so a
        feature-generating cell can be re-run safely.
        """
        if self.feature is None:
            self.feature = new_features
            return
        kept = self.feature.drop(columns=new_features.columns, errors='ignore')
        self.feature = pd.concat([kept, new_features], axis=1)

    def gen_feat_time(self):
        """Generate time-based features from the session start/end timestamps.

        Produces, per session: duration (whole seconds), dayname (category),
        and 0/1 flags weekend, worktime (weekday start between 09:00 and
        17:00), nighttime (start hour 22-04) and oclock (start minute within
        five minutes of the hour).
        """
        flag_cols = ['weekend', 'worktime', 'nighttime', 'oclock']
        feat_time = (
            self.raw
            .set_index('sessionid')
            .loc[:, ['start', 'end']]
            .assign(start=lambda df: pd.to_datetime(df['start']))
            .assign(end=lambda df: pd.to_datetime(df['end']))

            .assign(duration=lambda df: (df['end'] - df['start']).dt.total_seconds())
            .assign(dayname=lambda df: df['start'].dt.day_name())
            .assign(weekend=lambda df: df['dayname'].isin(['Saturday', 'Sunday']))
            .assign(worktime=lambda df: df['start'].dt.time.between(time(9, 0, 0), time(17, 0, 0)))
            # Office hours only count on weekdays.
            .assign(worktime=lambda df: (df['weekend'] == 0) & (df['worktime'] == 1))
            .assign(nighttime=lambda df: df['start'].dt.hour.isin([22, 23, 0, 1, 2, 3, 4]))
            .assign(oclock=lambda df: df['start'].dt.minute.isin([55, 56, 57, 58, 59, 0, 1, 2, 3, 4]))

            .drop(['start', 'end'], axis=1)
            .astype({'dayname': 'category', 'duration': int, **{col: int for col in flag_cols}})
        )
        self._add_features(feat_time)

    @staticmethod
    def transform_product(data: pd.DataFrame) -> pd.DataFrame:
        """Separate four levels of productid into different columns.

        ``productid`` holds ';'-separated products, each a '/'-separated path
        of levels. The result has one row per (session, viewed product).

        Args:
            data (pd.DataFrame): Data input with sessionid and productid.

        Returns:
            pd.DataFrame: Columns sessionid, level0, level1, level2, level3.
        """
        exploded = (
            data[['sessionid', 'productid']]
            .assign(productid=lambda df: df['productid'].str.split(';'))
            .explode('productid')
            .reset_index(drop=True)
        )
        levels = (
            exploded['productid']
            .str.split('/', expand=True)
            # Keep exactly the first four parts: this discards the empty part
            # produced by a trailing '/', and also works when there is none.
            .reindex(columns=range(len(ShoppingPreprocessing.PRODUCT_LEVELS)))
        )
        levels.columns = list(ShoppingPreprocessing.PRODUCT_LEVELS)
        return pd.concat([exploded[['sessionid']], levels], axis=1)

    @staticmethod
    def _select_products(product_df: pd.DataFrame, labels: pd.DataFrame,
                         level: str, threshold: float = 0.5) -> list:
        """Apply the keep_product rules to an already transformed product table."""
        session_count = product_df.groupby(level)['sessionid'].nunique()
        is_common = session_count > ShoppingPreprocessing.COMMON_SESSIONS
        is_candidate = (
            (session_count >= ShoppingPreprocessing.MIN_SESSIONS)
            & (session_count <= ShoppingPreprocessing.COMMON_SESSIONS)
        )
        product_keep = session_count[is_common].index.to_list()
        product_consider = session_count[is_candidate].index
        male_rate = (
            product_df
            .loc[lambda df: df[level].isin(product_consider)]
            .merge(labels, how='left', on='sessionid')
            # Count each session once per productid.
            .drop_duplicates(['sessionid', level])
            .groupby(level)['male']
            .mean()
        )
        product_keep.extend(male_rate[male_rate >= threshold].index.to_list())
        return product_keep

    @staticmethod
    def keep_product(data: pd.DataFrame, level: str, threshold: float = 0.5) -> list:
        """Generate the list of productid to be kept at one level.

        A productid is kept if it satisfies one of two conditions:
        (1) It appears in more than 100 sessions (common products).
        (2) It appears in between 10 and 100 sessions and the male proportion
            of those sessions is >= threshold.
        Products appearing in fewer than 10 sessions are always dropped. The
        original author's note gives the population male rate as 0.22, so the
        default threshold keeps mid-frequency products that skew clearly male.

        Args:
            data (pd.DataFrame): Data input with sessionid, productid, male.
            level (str): One of ['level0', 'level1', 'level2', 'level3'].
            threshold (float, optional): Minimum male proportion for a
                mid-frequency productid to be kept. Defaults to 0.5.

        Returns:
            list: productid to be kept.
        """
        product_df = ShoppingPreprocessing.transform_product(data)
        return ShoppingPreprocessing._select_products(
            product_df, data[['sessionid', 'male']], level, threshold)

    def gen_feat_product(self):
        """Generate per-session view counts for every kept productid.

        Requires ``encode_label`` to have run, because product selection uses
        the ``male`` column.
        """
        product_df = ShoppingPreprocessing.transform_product(self.raw)
        labels = self.raw[['sessionid', 'male']]
        # Decide what to keep for every level before masking anything, reusing
        # one transformed table instead of rebuilding it per level.
        keep_by_level = {
            level: ShoppingPreprocessing._select_products(product_df, labels, level)
            for level in self.PRODUCT_LEVELS
        }
        for level, product_keep in keep_by_level.items():
            # Single .loc assignment; chained indexing would write to a copy.
            product_df.loc[~product_df[level].isin(product_keep), level] = np.nan

        product_df = product_df.set_index('sessionid')
        feat_product = pd.get_dummies(product_df, prefix='', prefix_sep='')
        feat_product = feat_product.groupby('sessionid').sum()
        self._add_features(feat_product)

    def gen_data(self) -> pd.DataFrame:
        """Return the end date, the label and all features, indexed by sessionid."""
        end = (
            self.raw
            .set_index('sessionid')
            .assign(end=lambda df: pd.to_datetime(df['end']).dt.date)
            .end
        )
        data = pd.concat([end, self.label, self.feature], axis=1)
        return data


In [2]:
# Run the preprocessing pipeline end to end. The label must be encoded before
# the product features are built, because product selection reads `male`.
FEATURE_PATH = 'trainingData.csv'
LABEL_PATH = 'trainingLabels.csv'

shopping_preprocessing = ShoppingPreprocessing()
shopping_preprocessing.load_raw(label_path=LABEL_PATH, feature_path=FEATURE_PATH)
shopping_preprocessing.encode_label()
shopping_preprocessing.gen_feat_time()
shopping_preprocessing.gen_feat_product()

# Session-level table: end date, label and every generated feature.
data = shopping_preprocessing.gen_data()
data.head()


Unnamed: 0_level_0,end,male,duration,dayname,weekend,worktime,nighttime,oclock,A00001,A00002,...,D07875,D10307,D13982,D14855,D18347,D18353,D22126,D33879,D35676,D35703
sessionid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
u10001,2014-11-14,0,6,Friday,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
u10002,2014-12-12,0,1,Friday,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
u10003,2014-11-14,0,839,Friday,0,0,1,1,0,12,...,0,0,0,0,0,0,0,0,0,0
u10004,2014-11-14,0,2,Friday,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
u10005,2014-11-14,0,197,Friday,0,0,1,0,0,3,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Column roles: the target, the date column handed to the splitter, and every
# remaining column as a model feature.
label_col = 'male'
label_time_col = 'end'
feat_cols = [name for name in data.columns if name not in (label_col, label_time_col)]

shopping_ml = MLFramework()
shopping_ml.train_valid_test_split(
    data=data,
    label_col=label_col,
    feat_cols=feat_cols,
    label_time_col=label_time_col,
    # NOTE(review): an earlier note here said the validation set is only split
    # later, during cross validation -- confirm what valid_size=0.2 does.
    valid_size=0.2,
    # NOTE(review): an earlier note here said "last 7 days (~25% of the data)",
    # which does not match 0.2 -- confirm the intended hold-out size.
    test_size=0.2,
)
# The argument is presumably the number of Optuna trials -- TODO confirm.
shopping_ml.optuna_lgb(10)

[32m[I 2022-01-09 16:09:44,385][0m A new study created in memory with name: no-name-2ac4a022-24e6-47c1-a26f-b779d63037d4[0m


Data and its partition (train, valid, test) are stored in attributes data, train, valid and test, respectively.


[32m[I 2022-01-09 16:09:48,094][0m Trial 0 finished with value: 0.8427551007746559 and parameters: {'num_leaves': 358, 'max_depth': 6, 'min_data_in_leaf': 42, 'subsample': 0.37100350263545645, 'colsample_bytree': 0.528035307768199, 'reg_alpha': 0.005002757418697866, 'reg_lambda': 6.193339587133041e-07}. Best is trial 0 with value: 0.8427551007746559.[0m
[32m[I 2022-01-09 16:09:52,363][0m Trial 1 finished with value: 0.8421314191365127 and parameters: {'num_leaves': 101, 'max_depth': 12, 'min_data_in_leaf': 48, 'subsample': 0.7560137180200421, 'colsample_bytree': 0.44501541859006755, 'reg_alpha': 0.1276723085739637, 'reg_lambda': 0.0038864612846047147}. Best is trial 0 with value: 0.8427551007746559.[0m
[32m[I 2022-01-09 16:09:55,849][0m Trial 2 finished with value: 0.8425255442069508 and parameters: {'num_leaves': 57, 'max_depth': 5, 'min_data_in_leaf': 70, 'subsample': 0.35746350346765554, 'colsample_bytree': 0.5300120798357196, 'reg_alpha': 0.31665122196799234, 'reg_lambda': 

Best booster has been trained with num_boost_round=8.
New attributes assigned: study, best_params, cvbooster, booster.


In [25]:
# Score the held-out test partition with shopping_ml.predict, then turn each
# score into a hard 0/1 label at the 0.5 cut-off.
pred = shopping_ml.predict(shopping_ml.test.data)
pred = [int(score >= 0.5) for score in pred]

mlframework.accuracy_macro(pred, shopping_ml.test.label)

0.837098964296174

In [26]:
# Same evaluation using the cross-validation boosters: average their
# predictions along axis 0, then apply the 0.5 cut-off.
pred = shopping_ml.cvbooster['cvbooster'].predict(shopping_ml.test.data)
pred = np.asarray(pred).mean(axis=0)
pred = [int(score >= 0.5) for score in pred]

mlframework.accuracy_macro(pred, shopping_ml.test.label)

0.8253548458148009

In [13]:
import lightgbm as lgb

In [7]:
# Baseline cross-validation with fixed LightGBM parameters (no Optuna search).
# The commented-out entries below are the search-space definitions that need an
# Optuna `trial` object, which does not exist in this cell.
params = {
    'objective': 'binary',  # binary log loss classification
    # NOTE(review): 'custom' is presumably chosen so that only the `feval`
    # metric passed to lgb.cv drives early stopping -- confirm in LightGBM docs.
    'metric': 'custom',  # metric to be evaluated on the evaluation set for early stopping
    'learning_rate': 0.1,  # should not be tuned
    'boosting_type': 'gbdt',  # early stopping is not available in dart mode
    'verbose': -1,  # suppress warning
    'feature_pre_filter': False,  # needed to be able to tune min_child_samples

    # 'num_leaves': trial.suggest_int('num_leaves', 15, 1023),
    # 'max_depth': trial.suggest_int('max_depth', 3, 12),
    # 'min_child_samples': trial.suggest_int('min_data_in_leaf', 10, 100),
    # 'subsample': trial.suggest_float('subsample', 0.3, 1.0),
    # 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
    # 'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
    # 'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True)
}
# NOTE(review): despite the name, `gbm` presumably holds lgb.cv's evaluation
# history rather than a fitted model -- confirm before using it to predict.
# NOTE(review): `early_stopping_rounds` / `verbose_eval` keyword arguments are
# reportedly replaced by callbacks in newer LightGBM releases -- confirm
# against the installed version.
gbm = lgb.cv(
    params=params,
    train_set=shopping_ml.train,
    num_boost_round=500,  # upper bound on boosting rounds
    early_stopping_rounds=50,
    verbose_eval=False,
    feval=MLFramework.accuracy_macro_lgb  # custom evaluation function from the project framework
)

4.0

In [84]:
def accuracy(a, b, c, d):
    """Print the pooled ratio ``(a + c) / (b + d)``.

    In the demo that follows, ``a`` and ``c`` look like per-group hit counts
    and ``b`` and ``d`` the matching group sizes. The value is only printed;
    the function returns None.
    """
    print((a + c) / (b + d))


def accuracy_custom(a, b, c, d):
    """Print the unweighted mean of the two ratios ``a / b`` and ``c / d``.

    Unlike a pooled ratio, each group contributes equally regardless of its
    size. The value is only printed; the function returns None.
    """
    first_ratio = a / b
    second_ratio = c / d
    print((first_ratio + second_ratio) / 2)


# Two groups of size 20 and 80, each with 80% of its members counted as hits.
# With equal per-group rates the pooled and the per-group-averaged figures
# coincide.
b, d = 20, 80
a, c = 0.8*20, 0.8*80

accuracy(a, b, c, d)
accuracy_custom(a, b, c, d)


0.8
0.8
