In [23]:
from sklift.datasets import fetch_hillstrom
from sklearn.datasets import fetch_openml  # or from dirty_cat.datasets if using dirty_cat
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import pandas as pd

In [None]:
# Preview the feature matrix (our X) that ships with the Hillstrom dataset.
pd.DataFrame(data=fetch_hillstrom(target_col='visit').data)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web
...,...,...,...,...,...,...,...,...
63995,10,2) $100 - $200,105.54,1,0,Urban,0,Web
63996,5,1) $0 - $100,38.91,0,1,Urban,1,Phone
63997,6,1) $0 - $100,29.99,1,0,Urban,1,Phone
63998,1,5) $500 - $750,552.94,1,0,Surburban,1,Multichannel


In [15]:
# Load the Hillstrom bunch (target: 'visit') and build the working frame:
# the feature columns plus the treatment assignment of each customer.
# The target itself (df_bunch.target) is not attached to the frame here.
df_bunch = fetch_hillstrom(target_col='visit')
df = pd.DataFrame(
    df_bunch.data,
    columns=df_bunch.feature_names,
).assign(treatment=df_bunch.treatment)
df.head()


Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,treatment
0,10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail
1,6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail
2,7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail
3,9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail
4,2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail


### The Metadata


In [None]:
"""
Major columns:

visit (binary): target. 1/0 indicator, 1 = Customer visited website in the following two weeks.

conversion (binary): target. 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks.

spend (float): target. Actual dollars spent in the following two weeks.

segment (str): treatment. The e-mail campaign the customer received (Mens E-Mail, Womens E-Mail, or No E-Mail).

"""

In [None]:
"""
Historical customer attributes at your disposal include:

Recency: Months since last purchase.

History_Segment: Categorization of dollars spent in the past year.

History: Actual dollar value spent in the past year.

Mens: 1/0 indicator, 1 = customer purchased Mens merchandise in the past year.

Womens: 1/0 indicator, 1 = customer purchased Womens merchandise in the past year.

Zip_Code: Classifies zip code as Urban, Suburban, or Rural (note: the data spells the value 'Surburban').

Newbie: 1/0 indicator, 1 = New customer in the past twelve months.

Channel: Describes the channels the customer purchased from in the past year.

"""

## Preprocessing

In [24]:
# Count the missing values in every column.
df.isna().sum()

recency      0
history      0
mens         0
womens       0
zip_code     0
newbie       0
channel      0
treatment    0
dtype: int64

In [20]:
# `history_segment` is a categorization of the same past-year spend that
# `history` holds as an actual dollar value, so only `history` is kept.
#
# Reassign instead of dropping in place, and tolerate an already-missing column,
# so this cell can be re-run (or run out of order) without raising KeyError.
df = df.drop(columns=['history_segment'], errors='ignore')

In [22]:
# Show the first four rows of the working frame.
df.head(n=4)

Unnamed: 0,recency,history,mens,womens,zip_code,newbie,channel,treatment
0,10,142.44,1,0,Surburban,0,Phone,Womens E-Mail
1,6,329.08,1,1,Rural,1,Web,No E-Mail
2,7,180.65,0,1,Surburban,1,Web,Womens E-Mail
3,9,675.83,1,0,Rural,1,Web,Mens E-Mail


In [None]:
# Preprocessing step: a ColumnTransformer that encodes the data, defined here
# so that a model can be trained on its output later.

# Column groups, split by how each group is handled below.
categorical_features = ['zip_code', 'channel', 'treatment']
binary_features = ['mens', 'womens', 'newbie']
numeric_features = ['recency', 'history']

preprocessor = ColumnTransformer(
    remainder='drop',  # any column not listed in a group above is discarded
    transformers=[
        # Numeric columns are forwarded unchanged.
        ('num', 'passthrough', numeric_features),
        # Binary columns are 1/0 indicators already, so they are forwarded
        # unchanged as well (they are NOT one-hot encoded).
        ('bin', 'passthrough', binary_features),
        # String-valued columns are one-hot encoded; handle_unknown='ignore'
        # is set so categories not seen during fit do not raise at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)


