In [48]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

In [11]:
# Load the MineThatData e-mail challenge CSV (path is relative to the
# notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer="Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv",
)
# Show (rows, columns) as a quick sanity check on the load.
df.shape

(64000, 12)

In [13]:
# Preview the first three rows to see the column layout.
df.head(n=3)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0.0
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0


In [8]:
# Number of rows in each campaign segment.
df["segment"].value_counts()

Womens E-Mail    21387
Mens E-Mail      21307
No E-Mail        21306
Name: segment, dtype: int64

In [14]:
# Drop every row whose segment is "No E-Mail", then renumber the index so it
# is contiguous again.
df = df.loc[df["segment"].ne("No E-Mail")].reset_index(drop=True)
# Confirm how many rows remain.
df.shape

(42694, 12)

In [16]:
# Preview the first three rows of the current frame.
df.head(n=3)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0.0
1,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0.0
2,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0.0


In [17]:
# Class balance of the conversion flag.
df["conversion"].value_counts()

0    42238
1      456
Name: conversion, dtype: int64

In [18]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column so the indicator columns are not redundant.
columns = ["zip_code", "channel"]
dummies = pd.get_dummies(df[columns], drop_first=True)
# Swap the original categorical columns for their indicator columns.
# NOTE(review): this cell overwrites `df`, so running it a second time raises
# a KeyError because the `columns` entries no longer exist in the frame.
df = pd.concat([df.drop(columns=columns), dummies], axis="columns")

In [20]:
# Columns excluded from the model inputs: the campaign assignment, the outcome
# columns, and history_segment (a text bucket; presumably redundant with
# `history` -- TODO confirm).
unused_columns = ["segment", "visit", "conversion", "spend", "history_segment"]
# Remaining column labels, in frame order, and the matching feature matrix.
feature_columns = df.columns.drop(unused_columns)
X = df[feature_columns].values

In [26]:
# Treatment indicator: 1 for rows in the "Mens E-Mail" segment, 0 for every
# other row (only "Womens E-Mail" rows once "No E-Mail" has been filtered out).
w = df["segment"].eq("Mens E-Mail").astype(int).to_numpy()
# Outcome to model: the `visit` column (1 = visit, 0 = no visit).
y = df["visit"].to_numpy()

In [28]:
# Split features, treatment flags and outcomes in a single call so the three
# arrays stay row-aligned; half of the rows are held out, with a fixed seed
# for reproducibility.
X_train, X_test, w_train, w_test, y_train, y_test = train_test_split(
    X, w, y, test_size=0.5, random_state=0
)

In [32]:
# Keyword arguments shared by the LogisticRegression models in this notebook:
# C is the inverse regularisation strength, max_iter caps solver iterations.
params = dict(C=0.1, max_iter=1000)

In [33]:
# Outcome model fitted only on the training rows whose treatment flag is 1.
treated_rows = w_train == 1
lr_treatment = LogisticRegression(**params)
lr_treatment.fit(X_train[treated_rows], y_train[treated_rows])

In [34]:
# Outcome model fitted only on the training rows whose treatment flag is 0.
control_rows = w_train == 0
lr_control = LogisticRegression(**params)
lr_control.fit(X_train[control_rows], y_train[control_rows])

In [35]:
# Score every test row with both models. Column 1 of predict_proba is the
# probability of the second entry in classes_ (class 1 for 0/1 labels).
p_treatment, p_control = (
    model.predict_proba(X_test)[:, 1] for model in (lr_treatment, lr_control)
)
# Uplift expressed as a ratio: values above 1 mean the treatment model
# predicts a higher probability than the control model for that row.
uplift_score = p_treatment / p_control

In [50]:
# Display the ratio-based uplift scores (NumPy abbreviates long arrays).
uplift_score

array([1.36123866, 1.29924926, 0.92020088, ..., 0.86611432, 1.42526805,
       1.06175802])

In [41]:
# Difference form of the uplift: treatment-model probability minus
# control-model probability for each test row.
p_treatment - p_control

array([ 0.08348381,  0.04842094, -0.0157428 , ..., -0.02542393,
        0.05239613,  0.00981946])

In [46]:
# Print each coefficient of the treatment model next to its feature name.
# feature_columns holds the labels captured when X was built, so it matches
# the columns the model was fitted on without recomputing df.drop(...).
print("treatment")
for column, coef in zip(feature_columns, lr_treatment.coef_[0]):
    print(f"{column}: {coef}")

treatment
recency: -0.05378370700038538
history: 0.0004518607397362312
mens: 0.7195213316866386
womens: 0.6529544580200272
newbie: -0.49502311228994145
zip_code_Surburban: -0.19532770325575569
zip_code_Urban: -0.15101807212566762
channel_Phone: -0.06848546166008543
channel_Web: 0.13039725991981047


In [47]:
# Print each coefficient of the control model next to its feature name.
# feature_columns holds the labels captured when X was built, so it matches
# the columns the model was fitted on without recomputing df.drop(...).
print("control")
for column, coef in zip(feature_columns, lr_control.coef_[0]):
    print(f"{column}: {coef}")

control
recency: -0.032170105692805935
history: 0.0004304978147927434
mens: 0.33424660873923057
womens: 0.9142842626385103
newbie: -0.5008049131267299
zip_code_Surburban: -0.27313291669465706
zip_code_Urban: -0.2568290301165986
channel_Phone: -0.1848953464959288
channel_Web: 0.05782231249667818


In [49]:
# ROC AUC of each model's predicted probabilities against the test outcomes.
# NOTE(review): both models are scored on the full test set, i.e. on rows from
# both groups, although each was fitted on a single group. Consider scoring
# with y_test[w_test == 1] / y_test[w_test == 0] instead -- confirm intent.
auc_treatment, auc_control = (
    roc_auc_score(y_test, scores) for scores in (p_treatment, p_control)
)
print(f"auc_treatment: {auc_treatment}", f"auc_control: {auc_control}", sep="\n")

auc_treatment: 0.6150841442969484
auc_control: 0.6118946263789143
