In [72]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklift.metrics import uplift_at_k
from sklift.viz import plot_uplift_preds
from sklift.models import SoloModel

from catboost import CatBoostClassifier

from sklift.models import ClassTransformation

from sklift.models import TwoModels

In [2]:
df = pd.read_csv('data.csv', ',')
df.head(5)

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,10,142.44,1,0,Surburban,0,Phone,Buy One Get One,0
1,6,329.08,1,1,Rural,1,Web,No Offer,0
2,7,180.65,0,1,Surburban,1,Web,Buy One Get One,0
3,9,675.83,1,0,Rural,1,Web,Discount,0
4,2,45.34,1,0,Urban,0,Web,Buy One Get One,0


In [3]:
df = df.rename(columns = {'conversion' : 'target', 'offer' : 'treatment'})

In [4]:
df['treatment'].value_counts()

Buy One Get One    21387
Discount           21307
No Offer           21306
Name: treatment, dtype: int64

In [5]:
new_val = {'Buy One Get One' : 1, 'No Offer' : 0, 'Discount' : 1}
df['treatment'] = df['treatment'].map(new_val)

In [6]:
df['treatment'].value_counts()

1    42694
0    21306
Name: treatment, dtype: int64

In [7]:
len(df)

64000

In [8]:
num = pd.Series(range(0, len(df)))
num = pd.DataFrame(num)
df['id'] = num
df_features = df.drop(['treatment', 'target'], axis=1)

In [9]:
df_test = df['id']
df_test.head()

0    0
1    1
2    2
3    3
4    4
Name: id, dtype: int64

In [10]:
df_train = df[['id', 'treatment', 'target']]
df_train.head()

Unnamed: 0,id,treatment,target
0,0,1,0
1,1,0,0
2,2,1,0
3,3,1,0
4,4,1,0


In [22]:
indices_train = df_train.index
indices_test = df_test.index
indices_learn, indices_valid = train_test_split(indices_train, test_size=0.3, random_state=123)

In [23]:
df_features.head()

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,id
0,10,142.44,1,0,Surburban,0,Phone,0
1,6,329.08,1,1,Rural,1,Web,1
2,7,180.65,0,1,Surburban,1,Web,2
3,9,675.83,1,0,Rural,1,Web,3
4,2,45.34,1,0,Urban,0,Web,4


In [24]:
df_train.head()

Unnamed: 0,id,treatment,target
0,0,1,0
1,1,0,0
2,2,1,0
3,3,1,0
4,4,1,0


In [25]:
df_test.head()

0    0
1    1
2    2
3    3
4    4
Name: id, dtype: int64

In [30]:
X_train = df_features.loc[indices_learn]
X_train

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,id
53181,8,121.56,0,1,Surburban,0,Web,53181
42635,9,617.62,0,1,Urban,1,Phone,42635
6296,5,185.62,1,0,Rural,1,Web,6296
41722,1,359.03,0,1,Rural,0,Web,41722
32660,10,139.68,1,0,Urban,0,Web,32660
...,...,...,...,...,...,...,...,...
61404,1,172.98,1,0,Surburban,0,Web,61404
17730,9,95.41,0,1,Surburban,0,Phone,17730
28030,1,547.69,1,1,Rural,1,Multichannel,28030
15725,5,341.39,0,1,Surburban,0,Phone,15725


In [33]:
y_train = df_train.loc[indices_learn, 'target']
y_train

53181    0
42635    0
6296     1
41722    0
32660    0
        ..
61404    0
17730    0
28030    1
15725    1
52734    0
Name: target, Length: 44800, dtype: int64

In [34]:
treat_train = df_train.loc[indices_learn, 'treatment']
treat_train

53181    0
42635    1
6296     0
41722    1
32660    1
        ..
61404    1
17730    1
28030    0
15725    1
52734    1
Name: treatment, Length: 44800, dtype: int64

In [36]:
X_val = df_features.loc[indices_valid]
y_val = df_train.loc[indices_valid, 'target']
treat_val =  df_train.loc[indices_valid, 'treatment']

In [39]:
X_test = df_features.loc[indices_test]
X_test

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,id
0,10,142.44,1,0,Surburban,0,Phone,0
1,6,329.08,1,1,Rural,1,Web,1
2,7,180.65,0,1,Surburban,1,Web,2
3,9,675.83,1,0,Rural,1,Web,3
4,2,45.34,1,0,Urban,0,Web,4
...,...,...,...,...,...,...,...,...
63995,10,105.54,1,0,Urban,0,Web,63995
63996,5,38.91,0,1,Urban,1,Phone,63996
63997,6,29.99,1,0,Urban,1,Phone,63997
63998,1,552.94,1,0,Surburban,1,Multichannel,63998


In [63]:
cat_features = ['zip_code', 'channel', 'used_discount', 'used_bogo', 'is_referral']

result = pd.DataFrame({
    'approach': [],
    'uplift@30%': [],
    'uplift@20%': [],
    'uplift@10%': []
})

In [44]:
treat_train

53181    0
42635    1
6296     0
41722    1
32660    1
        ..
61404    1
17730    1
28030    0
15725    1
52734    1
Name: treatment, Length: 44800, dtype: int64

In [64]:
sm = SoloModel(
    CatBoostClassifier(iterations=20, random_state=42, silent=True, cat_features=cat_features)
)

sm = sm.fit(X_train, y_train, treat_train)

uplift_sm = sm.predict(X_val)

sm_score30 = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.3)
sm_score20 = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.2)
sm_score10 = uplift_at_k(y_true=y_val, uplift=uplift_sm, treatment=treat_val, strategy='by_group', k=0.1)

result = result.append({
    'approach' : 'SoloModel',
    'uplift@30%' : sm_score30,
    'uplift@20%' : sm_score20,
    'uplift@10%' : sm_score10,
}, ignore_index=True)
result

Unnamed: 0,approach,uplift@30%,uplift@20%,uplift@10%
0,SoloModel,0.084812,0.078603,0.071278


In [70]:
ct = ClassTransformation(
    CatBoostClassifier(iterations=20, random_state=42, silent=True, cat_features=cat_features)
)
ct = ct.fit(X_train, y_train, treat_train)

uplift_ct = ct.predict(X_val)

ct_score30 = uplift_at_k(y_true=y_val, uplift=uplift_ct, treatment=treat_val, strategy='by_group', k=0.3)
ct_score20 = uplift_at_k(y_true=y_val, uplift=uplift_ct, treatment=treat_val, strategy='by_group', k=0.2)
ct_score10 = uplift_at_k(y_true=y_val, uplift=uplift_ct, treatment=treat_val, strategy='by_group', k=0.1)

result = result.append({
    'approach' : 'ClassTransformation',
    'uplift@30%' : ct_score30,
    'uplift@20%' : ct_score20,
    'uplift@10%' : ct_score10,
}, ignore_index=True)
result

Unnamed: 0,approach,uplift@30%,uplift@20%,uplift@10%
0,SoloModel,0.084812,0.078603,0.071278
1,ClassTransformation,0.085085,0.090717,0.102554


In [75]:
tm = TwoModels(
    CatBoostClassifier(iterations=20, random_state=42, silent=True, cat_features=cat_features),
    CatBoostClassifier(iterations=20, random_state=42, silent=True, cat_features=cat_features),
    method='vanilla'  # независимые модели
)
tm = tm.fit(
    X_train, y_train, treat_train
)

uplift_tm = tm.predict(X_val)

tm_score30 = uplift_at_k(y_true=y_val, uplift=uplift_tm, treatment=treat_val, strategy='by_group', k=0.3)
tm_score20 = uplift_at_k(y_true=y_val, uplift=uplift_tm, treatment=treat_val, strategy='by_group', k=0.2)
tm_score10 = uplift_at_k(y_true=y_val, uplift=uplift_tm, treatment=treat_val, strategy='by_group', k=0.1)

result = result.append({
    'approach' : 'TwoModels',
    'uplift@30%' : tm_score30,
    'uplift@20%' : tm_score20,
    'uplift@10%' : tm_score10,
}, ignore_index=True)
result

Unnamed: 0,approach,uplift@30%,uplift@20%,uplift@10%
0,SoloModel,0.084812,0.078603,0.071278
1,ClassTransformation,0.085085,0.090717,0.102554
2,TwoModels,0.085052,0.080939,0.082988


Судя по табличке лучше справилась модель с трансформацией класса.