### User: xeeman
### Multilabel classification using CatBoost

In [1]:
import ast
import gc
import sys

import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
gc.collect()

0

In [2]:
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from mlxtend.preprocessing import TransactionEncoder
from sklearn import metrics, preprocessing, model_selection

In [3]:
# iterstrat is imported from a local checkout of iterative-stratification,
# so that directory has to be on sys.path before the import below.
sys.path.append('../cust_lib/iterative-stratification-master')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [4]:
def list_comm_mem(a, b):
    """Return the elements that `a` and `b` have in common.

    Both arguments are converted to sets, so duplicates are ignored and the
    order of the returned list is not defined.

    Returns
    -------
    list or str
        A list of the shared elements, or the string "no common elements"
        when the two inputs share nothing.
    """
    shared = set(a).intersection(set(b))
    if not shared:
        return "no common elements"
    return list(shared)

def do_labelenc(train=None,test=None,cat_cols=None):
    """Label-encode categorical columns in place and return the fitted encoders.

    For every column in `cat_cols` the values are cast to strings, a
    LabelEncoder is fitted, and the integer codes are written to a new column
    named ``<col>_lab_enc``. When `test` is given the encoder is fitted on the
    train and test values together so both frames share one code mapping, and
    the new column is added to both frames.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to encode; modified in place.
    test : pandas.DataFrame, optional
        Second frame to encode with the same mapping; modified in place.
    cat_cols : list of str, optional
        Columns to encode. ``None`` is treated as "no columns".

    Returns
    -------
    dict
        Maps each column name to its fitted LabelEncoder.
    """
    enc_dict = {}
    if cat_cols is None:
        cat_cols = []

    for col in cat_cols:
        print(col)
        train_vals = list(train[col].values.astype('str'))
        test_vals = [] if test is None else list(test[col].values.astype('str'))

        lbl = preprocessing.LabelEncoder()
        lbl.fit(train_vals + test_vals)

        # Same "_lab_enc" suffix whether or not a test frame is supplied
        # (the train-only path used to write "<col>lab_enc").
        train[col+'_lab_enc'] = lbl.transform(train_vals)
        if test is not None:
            test[col+'_lab_enc'] = lbl.transform(test_vals)
        enc_dict[col] = lbl

    # Hand the encoders back so callers can reuse or invert the mapping;
    # previously the dict was built and then discarded.
    return enc_dict
            
def conv_pred_to_prods(row, top_n=3):
    """Return the index labels of the `top_n` largest values in `row`, best first.

    Meant to be applied row-wise to a frame of per-product scores:
    ``prod_test = pred_test_Y.apply(conv_pred_to_prods, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        Scores indexed by product label.
    top_n : int, optional
        Number of labels to keep. Defaults to 3, matching the default ``k``
        of `mapk`.

    Returns
    -------
    list
        Up to `top_n` labels ordered by descending score.
    """
    return list(row.sort_values(ascending=False).index)[:top_n]

def create_B1_features(df):
    """Return `df` with one 0/1 indicator column per product in ``Product_Holding_B1``.

    A TransactionEncoder is fitted on the product lists of the frame that is
    passed in, so the set of indicator columns depends on which products occur
    in that frame. Each new column is named ``<product>_x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Product_Holding_B1`` column whose cells are lists of
        product labels.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the indicator columns.
    """
    transactions = list(df["Product_Holding_B1"])
    encoder = TransactionEncoder().fit(transactions)
    # Multiplying by 1 converts the encoder's output to 0/1 integers.
    indicators = encoder.transform(transactions) * 1
    onehot = pd.DataFrame(
        indicators,
        columns=[f"{item}_x" for item in encoder.columns_],
        # Reuse df's index. concat(axis=1) aligns on index labels, so leaving
        # the default RangeIndex here would misalign rows and introduce NaN
        # rows whenever df has been filtered, shuffled or re-indexed.
        index=df.index,
    )
    return pd.concat([df, onehot], axis=1)

def create_ohe_targets(df):
    """Build the 0/1 target matrix from the ``Product_Holding_B2`` column.

    A TransactionEncoder is fitted on the product lists in `df`; the result
    has one row per row of `df` (with a fresh default index) and one column
    per distinct product, named by the product label.
    """
    baskets = list(df["Product_Holding_B2"])
    fitted = TransactionEncoder().fit(baskets)
    # Multiplying by 1 converts the encoder's output to 0/1 integers.
    indicator_matrix = fitted.transform(baskets) * 1
    return pd.DataFrame(indicator_matrix, columns=fitted.columns_)

def apk(actual, predicted, k=10):
    """
    Average precision at k for a single pair of lists.

    Parameters
    ----------
    actual : list
             The relevant elements (order is irrelevant)
    predicted : list
                The predicted elements, best guess first (order matters)
    k : int, optional
        Only the first k predictions are taken into account
    Returns
    -------
    score : double
            The average precision at k; 0.0 when `actual` is empty
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    # Nothing to hit: the score is defined as zero.
    if not actual:
        return 0.0

    hits = 0.0
    precision_sum = 0.0
    for rank, item in enumerate(top_k, start=1):
        # A repeated prediction only counts the first time it appears.
        if item in actual and item not in top_k[:rank - 1]:
            hits += 1.0
            precision_sum += hits / rank

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=3):
    """
    Mean average precision at k over paired lists of lists.

    Parameters
    ----------
    actual : list
             One list of relevant elements per sample
             (order inside each list is irrelevant)
    predicted : list
                One list of predicted elements per sample
                (order inside each list matters)
    k : int, optional
        Only the first k predictions of each sample are taken into account
    Returns
    -------
    score : double
            The mean of the per-sample average precision at k
    """
    per_sample = []
    for truth, guess in zip(actual, predicted):
        per_sample.append(apk(truth, guess, k))
    return np.mean(per_sample)
# Source of the apk and mapk functions: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py

def calc_mapk(pred_prod,df):
    """Score predicted product lists against the true ``Product_Holding_B2`` of `df`.

    `pred_prod` is an iterable with one predicted product list per row of
    `df`, in the same order. The score is `mapk` with its default k.
    """
    predicted_lists = [*pred_prod]
    actual_lists = [*df["Product_Holding_B2"]]
    return mapk(actual_lists, predicted_lists)

In [5]:
# Seed shared by the fold splitter and the CatBoost models further down.
seed = 2021
# Directory, relative to the notebook, that holds the input CSV files.
data_path = r'../data/'

In [6]:
%%time
# Load the training set, the test set and the sample submission file.
train = pd.read_csv(data_path+'train_go05W65.csv')
test = pd.read_csv(data_path+'test_VkM91FT.csv')
sample_submission = pd.read_csv(data_path+'sample_submission_kF044ur.csv')

Wall time: 86.2 ms


In [7]:
# Quick look at the training data: dimensions, then the first rows.
print(train.shape)
train.head()

(37748, 9)


Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2
0,CC264719,Male,41,14,0,C1,S3,['P16'],['P8']
1,CC209679,Female,47,14,1,C1,S2,"['P13', 'P20']",['P3']
2,CC319633,Female,59,14,0,C2,S2,['P11'],['P00']
3,CC231413,Female,32,16,0,C1,S2,"['P8', 'P13']",['P6']
4,CC259633,Male,30,15,0,C2,S3,"['P16', 'P17', 'P21']","['P8', 'P12']"


In [8]:
# Same check for the test data, which has no Product_Holding_B2 column.
print(test.shape)
test.head()

(20327, 8)


Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1
0,CC372708,Female,31,31,0,C2,S3,"['P12', 'P13']"
1,CC216072,Male,28,37,1,C1,S2,"['P12', 'P13']"
2,CC387629,Male,31,12,0,C2,S3,['P20']
3,CC389228,Female,55,11,0,C2,S2,"['P13', 'P21']"
4,CC394445,Male,51,49,1,C2,S1,['P13']


In [9]:
# The sample submission shows the expected output layout: Customer_ID plus a product list.
print(sample_submission.shape)
sample_submission.head()

(20327, 2)


Unnamed: 0,Customer_ID,Product_Holding_B2
0,CC372708,['P00']
1,CC216072,['P00']
2,CC387629,['P00']
3,CC389228,['P00']
4,CC394445,['P00']


### Preprocess data

In [10]:
def _parse_product_list(value):
    """Convert the CSV text form of a product list (e.g. "['P13', 'P20']") to a list.

    ast.literal_eval only accepts Python literals, so text read from the CSV
    cannot execute arbitrary code the way eval() would. Values that are not
    strings are returned unchanged, which keeps this cell safe to re-run after
    the columns have already been converted.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


train["Product_Holding_B1"] = train["Product_Holding_B1"].apply(_parse_product_list)
train["Product_Holding_B2"] = train["Product_Holding_B2"].apply(_parse_product_list)
test["Product_Holding_B1"] = test["Product_Holding_B1"].apply(_parse_product_list)

In [11]:
# Check that the product-holding columns now hold lists rather than strings.
train.head()

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2
0,CC264719,Male,41,14,0,C1,S3,[P16],[P8]
1,CC209679,Female,47,14,1,C1,S2,"[P13, P20]",[P3]
2,CC319633,Female,59,14,0,C2,S2,[P11],[P00]
3,CC231413,Female,32,16,0,C1,S2,"[P8, P13]",[P6]
4,CC259633,Male,30,15,0,C2,S3,"[P16, P17, P21]","[P8, P12]"


#### Preprocess B1

In [12]:
# Add one 0/1 "<product>_x" column for each product found in train's Product_Holding_B1.
train = create_B1_features(train)

In [13]:
# Inspect the newly added "_x" indicator columns.
train.head(3)

Unnamed: 0,Customer_ID,Gender,Age,Vintage,Is_Active,City_Category,Customer_Category,Product_Holding_B1,Product_Holding_B2,P00_x,P1_x,P10_x,P11_x,P12_x,P13_x,P14_x,P15_x,P16_x,P17_x,P18_x,P19_x,P2_x,P20_x,P21_x,P3_x,P4_x,P5_x,P6_x,P7_x,P8_x,P9_x
0,CC264719,Male,41,14,0,C1,S3,[P16],[P8],0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,CC209679,Female,47,14,1,C1,S2,"[P13, P20]",[P3],0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,CC319633,Female,59,14,0,C2,S2,[P11],[P00],0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Preprocess B2 (the target)

In [14]:
# Build the multi-label target matrix: one 0/1 column per product in Product_Holding_B2.
train_Y = create_ohe_targets(train)

In [15]:
# Peek at the one-hot target matrix.
train_Y.head(3)

Unnamed: 0,P00,P1,P10,P11,P12,P13,P14,P15,P16,P17,P18,P2,P20,P3,P4,P5,P6,P7,P8,P9
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# List the columns now on train: the original fields plus the "_x" indicators.
train.columns

Index(['Customer_ID', 'Gender', 'Age', 'Vintage', 'Is_Active', 'City_Category',
       'Customer_Category', 'Product_Holding_B1', 'Product_Holding_B2',
       'P00_x', 'P1_x', 'P10_x', 'P11_x', 'P12_x', 'P13_x', 'P14_x', 'P15_x',
       'P16_x', 'P17_x', 'P18_x', 'P19_x', 'P2_x', 'P20_x', 'P21_x', 'P3_x',
       'P4_x', 'P5_x', 'P6_x', 'P7_x', 'P8_x', 'P9_x'],
      dtype='object')

In [17]:
# Label-encode the categorical columns. Because test is passed as well, each
# encoder is fitted on train and test values together, and a "<col>_lab_enc"
# column is added in place to both frames.
cat_cols = ['Gender', 'City_Category','Customer_Category']
lab_enc_objs = do_labelenc(train=train,test=test,cat_cols=cat_cols)

Gender
City_Category
Customer_Category


In [18]:
# Same B1 one-hot expansion for test. The encoder is refitted on test alone, so
# the resulting "_x" columns match train's only if test contains the same set
# of products.
test = create_B1_features(test)

In [19]:
# Model inputs: Age/Vintage/Is_Active, the B1 one-hot indicators and the
# label-encoded categorical columns.
x_col = ['Age', 'Vintage', 'Is_Active',
       'P00_x', 'P1_x', 'P10_x', 'P11_x', 'P12_x', 'P13_x', 'P14_x', 'P15_x',
       'P16_x', 'P17_x', 'P18_x', 'P19_x', 'P2_x', 'P20_x', 'P21_x', 'P3_x',
       'P4_x', 'P5_x', 'P6_x', 'P7_x', 'P8_x', 'P9_x', 'Gender_lab_enc',
       'City_Category_lab_enc', 'Customer_Category_lab_enc']

In [20]:
# Test feature matrix, with columns selected in x_col order (raises KeyError if any is missing).
test_X = test[x_col]

In [21]:
n_folds = 5
mskf = MultilabelStratifiedKFold(n_splits=n_folds,shuffle=True,random_state=seed)
# Running sum of test-set probabilities across folds; averaged after the loop.
pred_test_final = np.zeros((test.shape[0],len(train_Y.columns)))
score_dev_total = 0.0  # never updated: no dev-fold score is computed in this cell
score_val_total = 0.0

for fold, (dev_idx, val_idx) in enumerate(mskf.split(train[x_col],train_Y)):
    # Positional split of the rows into a fitting part (dev) and a held-out part (val).
    dev_df = train.iloc[dev_idx]
    val_df= train.iloc[val_idx]
    dev_X = dev_df[x_col]
    val_X = val_df[x_col]
    dev_Y = train_Y.iloc[dev_idx]
    val_Y = train_Y.iloc[val_idx]

    print(f"Fold={fold}")
    print(f"dev shape: {dev_X.shape,dev_Y.shape}")
    print(f"val shape: {val_X.shape,val_Y.shape}")

    print("Create data ...")

    dev_pool = Pool(dev_X, dev_Y.values)
    val_pool = Pool(val_X, val_Y.values)

    # One model per fold, trained with the MultiLogloss objective on all
    # target columns at once; the held-out fold is passed as the eval set.
    clf = CatBoostClassifier(
        loss_function='MultiLogloss',
        iterations=500,
        learning_rate = 0.1,
        random_seed = seed,
        class_names=list(dev_Y.columns)
    )

    clf.fit(dev_pool, eval_set=val_pool, metric_period=10, plot=False, verbose=50)

    # Score the held-out fold: per-product probabilities -> top-3 product
    # lists -> MAP@3 against the true Product_Holding_B2 lists.
    pred_val = clf.predict_proba(val_X)
    pred_val_Y = pd.DataFrame(pred_val, columns = val_Y.columns)
    pred_val_prod = pred_val_Y.apply(conv_pred_to_prods,axis=1)
    fold_score = calc_mapk(pred_val_prod, val_df)
    print(f"Mapk score on fold {fold}: {fold_score}")

    score_val_total = score_val_total + fold_score

    #predict on test
    pred_test_final += clf.predict_proba(test_X)
    print(f"End fold{fold}")
    print("==================")

# Average over n_folds rather than a hard-coded 5.0, so changing n_folds above
# cannot silently skew the mean score or the blended test probabilities.
score_val_net = score_val_total / n_folds
pred_test = pred_test_final / n_folds
print(f"Mean Mapk score over {n_folds} folds: {score_val_net}")


Fold=0
dev shape: ((30190, 28), (30190, 20))
val shape: ((7558, 28), (7558, 20))
Create data ...
0:	learn: 0.5192241	test: 0.5188854	best: 0.5188854 (0)	total: 319ms	remaining: 2m 38s
50:	learn: 0.1242982	test: 0.1268839	best: 0.1268839 (50)	total: 8.07s	remaining: 1m 11s
100:	learn: 0.1193450	test: 0.1236627	best: 0.1236627 (100)	total: 15.8s	remaining: 1m 2s
150:	learn: 0.1162546	test: 0.1226560	best: 0.1226560 (150)	total: 23.5s	remaining: 54.4s
200:	learn: 0.1139791	test: 0.1225341	best: 0.1224971 (180)	total: 31.2s	remaining: 46.4s
250:	learn: 0.1121590	test: 0.1225958	best: 0.1224971 (180)	total: 38.8s	remaining: 38.5s
300:	learn: 0.1106618	test: 0.1227673	best: 0.1224971 (180)	total: 46.4s	remaining: 30.7s
350:	learn: 0.1092708	test: 0.1230031	best: 0.1224971 (180)	total: 54.1s	remaining: 23s
400:	learn: 0.1078423	test: 0.1233609	best: 0.1224971 (180)	total: 1m 1s	remaining: 15.2s
450:	learn: 0.1065935	test: 0.1235756	best: 0.1224971 (180)	total: 1m 9s	remaining: 7.53s
499:	lear

In [22]:
# Label the averaged test probabilities with the target names. train_Y.columns
# is used instead of val_Y.columns so this cell does not rely on a variable
# left over from the last CV fold; val_Y is a row slice of train_Y, so the
# column labels are the same.
pred_test_Y = pd.DataFrame(pred_test, columns = train_Y.columns)

In [24]:
# For each test customer keep the three highest-scoring products, best first.
prod_test = pred_test_Y.apply(conv_pred_to_prods,axis=1)
prod_test

0          [P8, P10, P4]
1          [P8, P1, P10]
2        [P16, P13, P00]
3          [P8, P10, P9]
4          [P00, P8, P1]
              ...       
20322      [P8, P00, P6]
20323      [P8, P12, P6]
20324      [P8, P00, P9]
20325       [P1, P6, P7]
20326       [P8, P3, P6]
Length: 20327, dtype: object

In [25]:
# Submission frame: the customer id plus its predicted product list, with
# prod_test aligned to test's index.
sub = test[["Customer_ID"]].assign(Product_Holding_B2=prod_test)

In [26]:
# Write the submission file without the DataFrame index column.
sub.to_csv("sub005.csv",index=False)

In [27]:
# Sanity check: the row count should match the test set, with two columns.
print(sub.shape)
sub.head()

(20327, 2)


Unnamed: 0,Customer_ID,Product_Holding_B2
0,CC372708,"[P8, P10, P4]"
1,CC216072,"[P8, P1, P10]"
2,CC387629,"[P16, P13, P00]"
3,CC389228,"[P8, P10, P9]"
4,CC394445,"[P00, P8, P1]"


In [28]:
# Show the sample submission again to compare its layout with the frame above.
sample_submission.head()

Unnamed: 0,Customer_ID,Product_Holding_B2
0,CC372708,['P00']
1,CC216072,['P00']
2,CC387629,['P00']
3,CC389228,['P00']
4,CC394445,['P00']
