In [1]:
import numpy as np
import pandas as pd
import gc
from datetime import date, datetime
from random import normalvariate
pd.set_option('display.max_columns', 100)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from catboost import CatBoostClassifier
import lightgbm as lgbm

In [2]:
df_products = pd.read_csv('data/products.csv', index_col='product_id')
df_transactions = pd.read_csv('data/purchases.csv').join(df_products['is_own_trademark'].astype(np.int8), on='product_id')

In [3]:
for col in df_transactions.columns:
    if df_transactions[col].dtype == 'float64':
        df_transactions[col] = df_transactions[col].astype(np.float32)

In [4]:
own_trademark_sum = df_transactions[df_transactions['is_own_trademark'] == 1].groupby('client_id')['trn_sum_from_iss'].sum().rename('own_trademark_sum')
df_transactions.drop_duplicates(inplace=True)
features =  pd.concat([df_transactions.groupby('client_id')['purchase_sum'].count(),
                       df_transactions.groupby('client_id')['purchase_sum'].mean(),
                       df_transactions.groupby('client_id')['purchase_sum'].std(),
                       df_transactions.groupby('client_id')['express_points_spent'].mean(),
                       df_transactions.groupby('client_id')['express_points_spent'].std(),
                       df_transactions.groupby('client_id')['express_points_received'].mean(),
                       df_transactions.groupby('client_id')['express_points_received'].std(),
                       df_transactions.groupby('client_id')['regular_points_spent'].mean(),
                       df_transactions.groupby('client_id')['regular_points_spent'].std(),
                       df_transactions.groupby('client_id')['regular_points_received'].mean(),
                       df_transactions.groupby('client_id')['regular_points_received'].std(),
                       df_transactions.groupby('client_id')[['store_id']].nunique(),
                       df_transactions.groupby('client_id')[['regular_points_received', 'express_points_received', 
                                                             'regular_points_spent', 'express_points_spent', 
                                                             'purchase_sum']].sum()
                      ],axis = 1)

features.columns = ['total_trans_count', 'mean_purchase', 'std_purchase', 'mean_epoints_spent', 'std_epoints_spent',
                    'mean_epoints_recd', 'std_epoints_recd', 'mean_rpoints_spent', 'std_rpoints_spent',
                    'mean_rpoints_recd', 'std_rpoints_recd', 'nunique_stores'] + \
                    list(c+"_sum" for c in ['regular_points_received', 'express_points_received',
                                            'regular_points_spent', 'express_points_spent', 'purchase'])

In [5]:
last_month_transactions = df_transactions[df_transactions['transaction_datetime'] > '2019-02-18']
last_month_features =  pd.concat([last_month_transactions.groupby('client_id')['purchase_sum'].count(),
                                  last_month_transactions.groupby('client_id')[['store_id']].nunique(),
                                  last_month_transactions.groupby('client_id')[['regular_points_received', 'express_points_received', 
                                                                               'regular_points_spent', 'express_points_spent', 
                                                                               'purchase_sum']].sum()
                                 ],axis = 1)

last_month_features.columns = list(['total_trans_count', 'nunique_stores'] + \
                                list(c+"_sum" for c in ['regular_points_received', 'express_points_received','regular_points_spent', 
                                                        'express_points_spent', 'purchase']))
last_month_features.columns = list(last_month_features.columns + '_last_month')

In [6]:
features['regular_points_received_ratio'] = features['regular_points_received_sum'] / features['purchase_sum']
features['express_points_received_ratio'] = features['express_points_received_sum'] / features['purchase_sum']
features['regular_points_spent_ratio'] = features['regular_points_spent_sum'] / features['purchase_sum']
features['express_points_spent_ratio'] = features['express_points_spent_sum'] / features['purchase_sum']

In [7]:
features = pd.merge(features, own_trademark_sum, how='left', on='client_id')
features['own_trademark_ratio'] = features['own_trademark_sum'] / features['purchase_sum']
features.drop(columns='own_trademark_sum', inplace=True)
df_transactions_features = pd.merge(features, last_month_features, how='inner', on='client_id')

In [8]:
del [df_products, df_transactions, own_trademark_sum, features, last_month_transactions, last_month_features]
gc.collect()

13

In [9]:
df_clients = pd.read_csv('data/clients.csv', index_col='client_id',
                         parse_dates=['first_issue_date','first_redeem_date'])
df_train = pd.read_csv('data/uplift_train.csv', index_col='client_id')
df_test = pd.read_csv('data/uplift_test.csv', index_col='client_id')

In [10]:
# Try to fix age variable in the data. Also, set age_antinorm variable depending on 
# type of error in the age data.
# Set real_fix flag to False to calculate age_antinorm, but not fix the age
#
# Use the following heuristics:
# 19XX means year of birth -> easy to convert to age
# 18XX - the same, but it should be 9 instead of 8
#  9XX - the same, first '1' is missed
# -9XX - the same as 19XX, '1' was OCRed as '-'
# etc
#
def fix_age(df_clients, real_fix=True):

    # create a copy of age column. Modify the copy for now
    df_clients['age2'] = df_clients['age']
    
    age_index = (df_clients['age'] < -900) & (df_clients['age'] > -1000)
    df_clients.loc[age_index, 'age2'] = -1 * df_clients.loc[age_index, 'age'] + 1019
    df_clients.loc[age_index, 'age_antinorm'] = 1

    age_index = (df_clients['age'] > 900) & (df_clients['age'] < 1000)
    df_clients.loc[age_index, 'age2'] = 1019 - df_clients.loc[age_index, 'age']
    df_clients.loc[age_index, 'age_antinorm'] = 2

    age_index = (df_clients['age'] > 1900) & (df_clients['age'] < 2000)
    df_clients.loc[age_index, 'age2'] = 2019 - df_clients.loc[age_index, 'age']
    df_clients.loc[age_index, 'age_antinorm'] = 3
    
    age_index = (df_clients['age'] > 120) & (df_clients['age'] < 200)
    df_clients.loc[age_index, 'age2'] = df_clients.loc[age_index, 'age'] - 100
    df_clients.loc[age_index, 'age_antinorm'] = 4

    age_index = (df_clients['age'] > 1800) & (df_clients['age'] < 1900)
    df_clients.loc[age_index, 'age2'] = df_clients.loc[age_index, 'age'] - 1800
    df_clients.loc[age_index, 'age_antinorm'] = 5

    # the following types of errors are impossible to recover
    # so we set the age to mean of all clients (46), slightly randomizing it (std=16)
    age_index = (df_clients['age'] > 120)
    df_clients.loc[age_index, 'age2'] = normalvariate(46, 16)
    df_clients.loc[age_index, 'age_antinorm'] = 6
    
    age_index = (df_clients['age'] > 0) & (df_clients['age'] < 12)
    df_clients.loc[age_index, 'age2'] = normalvariate(46, 16)
    df_clients.loc[age_index, 'age_antinorm'] = 7
    
    age_index = (df_clients['age'] == 0)
    df_clients.loc[age_index, 'age2'] = normalvariate(46, 16)
    df_clients.loc[age_index, 'age_antinorm'] = 8
    
    age_index = (df_clients['age'] < 0)
    df_clients.loc[age_index, 'age2'] = normalvariate(46, 16)
    df_clients.loc[age_index, 'age_antinorm'] = 9
    
    # use the modified copy 
    if (real_fix):
        df_clients['age'] = df_clients['age2']
    
    df_clients.drop('age2', axis=1, inplace=True)
    
    return df_clients

In [11]:
df_clients['age_antinorm'] = 0
df_clients = fix_age(df_clients)

df_clients = pd.get_dummies(df_clients, prefix='gender', columns=['gender'])
df_clients['first_issue_unixtime'] = (df_clients['first_issue_date']).astype(int)/10**9
df_clients['first_redeem_unixtime'] = (df_clients['first_redeem_date']).astype(int)/10**9

df_clients['issue_redeem_delay'] = df_clients['first_redeem_unixtime'] - df_clients['first_issue_unixtime']
df_clients.drop(columns=['first_issue_date', 'first_redeem_date'], inplace=True)

In [12]:
def merge_dataset(X, to_merge):
    
    result = X
    
    for table in to_merge:
        
        result = pd.merge(result, table, how='inner', on='client_id')
    
    return result

In [13]:
train_X = merge_dataset(df_train, [df_clients, df_transactions_features])
train_X['new_target'] = (train_X['target'] + train_X['treatment_flg'] + 1) % 2
test_X = merge_dataset(df_test, [df_clients, df_transactions_features])
treatment = train_X[train_X['treatment_flg'] == 1].drop('treatment_flg', axis=1)
treatment_X = treatment.drop(['target', 'new_target'], axis=1, errors='ignore')
treatment_y = treatment['target']
control = train_X[train_X['treatment_flg'] == 0].drop('treatment_flg', axis=1)
control_X = control.drop(['target', 'new_target'], axis=1, errors='ignore')
control_y = control['target']

In [14]:
def trans_train_model(model, df_X, df_X_test, num_folds=5, random_state=0, verbose=2):

    cat_params['random_state'] = random_state
    lgbm_params['random_state'] = random_state

    df_X['new_target'] = (df_X['target'] + df_X['treatment_flg'] + 1) % 2
    df_y = df_X[['new_target']]
    treatment = df_X['treatment_flg'].to_numpy()
    old_target = df_X['target'].to_numpy()
    df_X = df_X.drop(['target', 'new_target', 'treatment_flg'], axis=1, errors='ignore')    
    
    X = df_X.to_numpy()
    y = df_y.to_numpy()
    X_test = df_X_test.to_numpy()
    
    folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=random_state)
    scores = []
    uplift_scores = []    
    prediction = np.zeros(len(X_test))
    feature_importances = []

    for i, (train_index, valid_index) in enumerate(folds.split(X, y)):    
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]        
        treat_valid = treatment[valid_index]
        old_target_vaild = old_target[valid_index]
        
        if (model == 'catboost'):
            f = CatBoostClassifier(**cat_params)
            f.fit(X_train, y_train, eval_set=(X_valid, y_valid), use_best_model=True, verbose=False)
        elif (model == 'lgbm'):
            f = lgbm.LGBMClassifier(**lgbm_params)        
            f.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=False)
        else:
            return None
        
        feature_importances.append(f.feature_importances_)
        y_pred_valid = f.predict_proba(X_valid)[:, 1]
        score = log_loss(y_valid, y_pred_valid)
        uplift_score = uplift_at_k(old_target_vaild, y_pred_valid, treat_valid)
        uplift_scores.append(uplift_score)
        if (verbose > 1):
            print('Uplift score: {0:.5f}'.format(uplift_score))
        
        scores.append(score)
        # predict on test and accumulate the result
        y_pred = f.predict_proba(X_test)[:, 1]
        prediction += y_pred

    # get average prediction from all models
    prediction /= num_folds
    importances_mean = np.mean(feature_importances, axis=0)
    df_features = pd.DataFrame(zip(importances_mean, df_X.columns), columns=['Value','Feature']).sort_values(by='Value', ascending=False)

    if (verbose > 0):
        print('CV mean score: {0:.5f}, std: {1:.5f}'.format(np.mean(scores), np.std(scores)))    
        print('Uplift score @30%: {0:.5f}, std: {1:.5f}'.format(np.mean(uplift_scores), np.std(uplift_scores)))
        print(df_features)
        
    return prediction

In [15]:
def uplift_at_k(y_true, uplift, treatment, k=0.3):
    """Compute uplift at first k percentage of the total sample.

    Args:
        y_true (1d array-like): Ground truth (correct) labels.
        uplift (1d array-like): Predicted uplift, as returned by a model.
        treatment (1d array-like): Treatment labels.
        k (float > 0 and <= 1): Percentage of the total sample to compute uplift.

    Returns:
        float: Uplift at first k percentage of the total sample.

    Reference:
        Baseline from `RetailHero competition`_.

    .. _RetailHero competition:
        https://retailhero.ai/c/uplift_modeling/overview
    """
    order = np.argsort(-uplift)
    treatment_n = int((treatment == 1).sum() * k)
    treatment_p = y_true[order][treatment[order] == 1][:treatment_n].mean()
    control_n = int((treatment == 0).sum() * k)
    control_p = y_true[order][treatment[order] == 0][:control_n].mean()
    score_at_k = treatment_p - control_p
    
    return score_at_k

In [16]:
cat_params = {'learning_rate':0.01, 'max_depth':3,
              'loss_function':'Logloss', 'eval_metric':'Logloss',
               'iterations':20000, 'od_type': "Iter", 'od_wait':200
}

lgbm_params = {'learning_rate':0.01,'max_depth':6,'num_leaves':20, 'min_data_in_leaf':3, 
               'subsample':0.8, 'colsample_bytree': 0.8, 'reg_alpha':0.01,'max_bin':416,
               'bagging_freq':3,'reg_lambda':0.01,'num_leaves':20, 'n_estimators':600, 
               'eval_metric':'Logloss', 'application':'binary', 
               'iterations':20000, 'od_type': 'Iter', 'od_wait':200
}

In [17]:
pred = trans_train_model('catboost', train_X, test_X, verbose=2)

Uplift score: 0.07111
Uplift score: 0.08943
Uplift score: 0.08871
Uplift score: 0.07330
Uplift score: 0.07869
CV mean score: 0.68845, std: 0.00029
Uplift score @30%: 0.08025, std: 0.00762
        Value                                 Feature
6   70.513373                   first_redeem_unixtime
0    2.583397                                     age
28   1.990594              express_points_spent_ratio
7    1.460207                      issue_redeem_delay
9    1.266638                           mean_purchase
5    1.257648                    first_issue_unixtime
24   1.230376                            purchase_sum
36   1.152020                 purchase_sum_last_month
8    1.131740                       total_trans_count
23   1.096668                express_points_spent_sum
30   1.086737            total_trans_count_last_month
32   1.069755  regular_points_received_sum_last_month
12   1.005317                       std_epoints_spent
25   0.981327           regular_points_received_ratio
29

In [18]:
df_submission = pd.DataFrame({'client_id':test_X.index.values,'uplift': pred})
df_submission.to_csv('submission.csv', index=False)