# Modeling

testing mode vs. validation mode : sliding window的取樣窗口略有不同，另外validation mode有offline validation，而testing mode則會存出預測結果。

## 1. Import packages

In [None]:
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor, MultiOutputClassifier
import numpy as np
import pandas as pd
from scipy.stats import mode

from tqdm.notebook import tqdm, trange

pred_class = ['2', '6', '10', '12', '13', '15', '18', '19', '21', '22', '25', '26', '36', '37', '39', '48']
idx_to_class = {i:class_ for i, class_ in enumerate(pred_class)}

## 2. Extra features from raw_data
從原始資料中萃取額外特徵並後續放入模型，包含50萬名用戶的個人特徵(取最後一筆有效資料)以及前19個月最常使用的卡號特徵。

In [None]:
cu_dat = pd.read_csv('tbrain_cc_training_48tags_hash_final.csv', usecols = ['chid', 'dt',
                                                                             'masts', 'educd', 'trdtp', 'naty', 'poscd', 'cuorg', 'slam',
                                                                             'gender_code', 'age', 'primary_card'])

customer_dat = cu_dat.groupby('chid').tail(1).drop(columns = 'dt').sort_values('chid')
del cu_dat

In [None]:

# mode card -v2
use_cols = ['dt', 'chid', 'shop_tag',
            'card_1_txn_cnt', 'card_2_txn_cnt', 'card_3_txn_cnt', 'card_4_txn_cnt', 'card_5_txn_cnt',
            'card_6_txn_cnt', 'card_7_txn_cnt', 'card_8_txn_cnt', 'card_9_txn_cnt', 'card_10_txn_cnt',
            'card_11_txn_cnt', 'card_12_txn_cnt', 'card_13_txn_cnt', 'card_14_txn_cnt', 'card_other_txn_cnt',]

card_df = pd.read_csv('tbrain_cc_training_48tags_hash_final.csv', usecols=use_cols)

# take only month before 18 # test->19
card_df = card_df[card_df.dt<19]

# 不精準的卡片使用率計算方式，單純加總各個shop_tag下的消費次數之後取最大值
pct_sum = card_df.groupby(['chid', 'shop_tag']).sum()
pct_sum['max_card_idx'] = np.argmax(pct_sum.values[:,1:], axis = 1)+1
pct_sum = pct_sum[['max_card_idx']].reset_index()

# long to wide
chid_card_choose = pct_sum.pivot(index='chid', columns = 'shop_tag', values = 'max_card_idx').fillna(0)
chid_card_choose = chid_card_choose[pred_class]

del card_df

In [None]:
# 將每個用戶常使用的卡號放進 customer_dat中   (未做sliding window處理)
customer_dat = customer_dat.merge(chid_card_choose, left_on='chid', right_index=True, how = 'left').fillna(0.0)

## 3. Load processed data
載入前處理後的資料並且以sliding window分割資料。由於前處理的資料有三份，在此將會分別讀取三份資料並執行相同的步驟，最後再使用三份資料產生的模型預測結果做averaging。

In [None]:
data = pd.read_csv('cnt_amt_card_onoff_doov_data.csv')

# data = pd.read_csv('data/cnt_amt_card_cnt_onoff_doov_cnt_data.csv')
# data = pd.read_csv('data/cnt_amt_card_onoff_doov_cnt_data.csv')

In [None]:
data_chid = data['chid']
data = data.drop(columns = ['chid'])

n_features = 7
n_class = len(pred_class)

In [None]:
# validation mode

X_train = np.vstack([
    data.iloc[:,n_class*n_features*0:-n_class*n_features*7],
    data.iloc[:,n_class*n_features*1:-n_class*n_features*6],
    data.iloc[:,n_class*n_features*2:-n_class*n_features*5],
    data.iloc[:,n_class*n_features*3:-n_class*n_features*4],
    data.iloc[:,n_class*n_features*4:-n_class*n_features*3],
    data.iloc[:,n_class*n_features*5:-n_class*n_features*2]
])
y_train=  np.vstack([
    data[[f"txn_amt_dt18_shoptag_{i}" for i in pred_class]],
    data[[f"txn_amt_dt19_shoptag_{i}" for i in pred_class]],
    data[[f"txn_amt_dt20_shoptag_{i}" for i in pred_class]],
    data[[f"txn_amt_dt21_shoptag_{i}" for i in pred_class]],
    data[[f"txn_amt_dt22_shoptag_{i}" for i in pred_class]],
    data[[f"txn_amt_dt23_shoptag_{i}" for i in pred_class]]
])

y_train_class =  np.where(y_train>0, 1, 0)
X_test = data.iloc[:, n_class*n_features*6:-n_class*n_features*1]

## 4. Feature engineering on sliding window
針對sliding window製作額外特徵，並且合併第二步的額外特徵

In [None]:
not_zeros = X_train.mean(axis = 1)>0
y_train = y_train[not_zeros]
y_train_class = y_train_class[not_zeros]

In [None]:
def feature_engineer(X):
    '''
    txn_amt_dt{m}_shoptag_{n}, txn_cnt_dt{m}_shoptag_{n}, ...
    '''
    
    # total amt & cnt
    avg_dt_amt = np.mean(X[:,::n_features], axis = 1).reshape(-1, 1)
    recent_avg_dt_amt = np.mean(X[:,-n_class*n_features*3::n_features], axis = 1).reshape(-1, 1)
    avg_dt_cnt = np.mean(X[:,1::n_features], axis = 1).reshape(-1, 1)
    recent_avg_dt_cnt = np.mean(X[:,-n_class*n_features*3+1::n_features], axis = 1).reshape(-1, 1)
    
    std_dt_amt = np.std(X[:,::n_features], axis = 1).reshape(-1, 1)
    recent_std_dt_amt = np.std(X[:,-n_class*n_features*3::n_features], axis = 1).reshape(-1, 1)
    std_dt_cnt = np.std(X[:,1::n_features], axis = 1).reshape(-1, 1)
    recent_std_dt_cnt = np.std(X[:,-n_class*n_features*3+1::n_features], axis = 1).reshape(-1, 1)
    

    # amt & cnt by shoptag
    shoptag_avg_amt = np.array([np.mean(X[:,shoptag*n_features::n_class], axis = 1) for shoptag in range(n_class)]).transpose()
    shoptag_avg_cnt = np.array([np.mean(X[:,shoptag*n_features+1::n_class], axis = 1) for shoptag in range(n_class)]).transpose()
    
    shoptag_std_amt = np.array([np.std(X[:,shoptag*n_features::n_class], axis = 1) for shoptag in range(n_class)]).transpose()
    shoptag_std_cnt = np.array([np.std(X[:,shoptag*n_features+1::n_class], axis = 1) for shoptag in range(n_class)]).transpose()

    amt_dt_pro = recent_avg_dt_amt / (avg_dt_amt +1)
    cnt_dt_pro = recent_avg_dt_cnt / (avg_dt_cnt +1)    
    
    # offline / domestic
    avg_dt_off_do_amt = np.mean(X[:,3::n_features], axis = 1).reshape(-1, 1)
    avg_dt_on_do_amt = np.mean(X[:,4::n_features], axis = 1).reshape(-1, 1)
    avg_dt_off_ov_amt = np.mean(X[:,5::n_features], axis = 1).reshape(-1, 1)
    avg_dt_on_ov_amt = np.mean(X[:,6::n_features], axis = 1).reshape(-1, 1)
    
    shoptag_off_do_avg_amt = np.array([np.mean(X[:,shoptag*n_features+3::n_class], axis = 1) for shoptag in range(n_class)]).transpose()
    shoptag_on_do_avg_amt = np.array([np.mean(X[:,shoptag*n_features+4::n_class], axis = 1) for shoptag in range(n_class)]).transpose()
    shoptag_off_ov_avg_amt = np.array([np.mean(X[:,shoptag*n_features+5::n_class], axis = 1) for shoptag in range(n_class)]).transpose()
    shoptag_on_ov_avg_amt = np.array([np.mean(X[:,shoptag*n_features+6::n_class], axis = 1) for shoptag in range(n_class)]).transpose()

    
    # customer_data
    customer_ = customer_dat.drop(columns = 'chid').values
    n_customer = customer_.shape[0]
    rep = int(X.shape[0]/n_customer)
    customer_ = np.tile(customer_, reps=(rep, 1))
    
    X_fe = np.concatenate([X, avg_dt_amt, recent_avg_dt_amt, avg_dt_cnt, recent_avg_dt_cnt, shoptag_avg_amt, shoptag_avg_cnt,
                           std_dt_amt, recent_std_dt_amt, std_dt_cnt, recent_std_dt_cnt, shoptag_std_amt, shoptag_std_cnt,
                           avg_dt_off_do_amt, avg_dt_on_do_amt, avg_dt_off_ov_amt, avg_dt_on_ov_amt, shoptag_off_do_avg_amt, 
                           shoptag_on_do_avg_amt, shoptag_off_ov_avg_amt, shoptag_on_ov_avg_amt,
                           amt_dt_pro, cnt_dt_pro,
                           customer_], axis = 1)
    return X_fe

In [None]:
%%time

X_train = feature_engineer(X_train)
X_train = X_train[not_zeros]

X_test = feature_engineer(X_test.values)

## 5. Modeling
分別建立二元分類與迴歸模型預測交易機率與交易價格

In [None]:
%%time

params = {
    'subsample':0.8,
    'colsample_bytree':0.8,
    'n_estimators': 1000,
}

reg_model = MultiOutputRegressor(LGBMRegressor(**params)).fit(X_train, y_train)

In [None]:
%%time

params = {
    'subsample':0.8,
    'colsample_bytree':0.8,
    'n_estimators': 1000,
}

class_model = MultiOutputClassifier(LGBMClassifier(**params)).fit(X_train, y_train_class)

In [None]:
reg_prediction = reg_model.predict(X_test)
class_prediction = np.array([c[:,1] for c in class_model.predict_proba(X_test)]).transpose()

## 6. Offline evaluation
以dt24的交易資料作為validation的ground truth

In [None]:
#prediction has shape 500000x16
submit = []
prediction = reg_prediction*(class_prediction**1.5)
for row in prediction:
    submit.append(row.argsort()[-3:][::-1][np.newaxis,:])
submit = np.vstack(submit)

In [None]:
shape = submit.shape
submit = submit.flatten()
submit = np.array([idx_to_class[idx] for idx in submit])
submit = submit.reshape(shape)

In [None]:
submit_csv = pd.read_csv("sample_submission.csv")
submit_csv = submit_csv.sort_values("chid")

In [None]:
submit_csv.iloc[:,1:] = submit

In [19]:
from utils import DCG
DCG(submit_csv)

Calculate DCG score with 398988 users


0.7175257432488035