<img src="../img/hu-logo.png" align="right" width="120">

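# By-feature AUC as feature evaluation

Monitor the value of individual features (and check for leakage) by computing a by-feature AUC: each raw feature is used on its own as the prediction score and evaluated against the target. This notebook uses the area under the precision-recall curve (PR-AUC).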

A value of 1, or an unusually large value compared with the other features, can indicate target leakage.

In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics

In [2]:
def pr_auc(y, p):
    """Return the area under the precision-recall curve.

    Parameters
    ----------
    y : array-like
        True binary labels.
    p : array-like
        Scores for the positive class (here: a raw feature column).

    Returns
    -------
    float
        Area under the curve with recall on the x-axis and precision on the
        y-axis, as computed by ``sklearn.metrics.auc`` (trapezoidal rule).
    """
    curve = sklearn.metrics.precision_recall_curve(y, p)
    # The third element (the thresholds) is not needed for the area.
    precision_values, recall_values = curve[0], curve[1]
    return sklearn.metrics.auc(recall_values, precision_values)

In [3]:
def purchase_frequency(by, norm, week, baskets, index, variable="phat", fillna=False):
    """Compute historical purchase frequencies and attach them to `index`.

    Only basket rows from weeks strictly before `week` are used. The number of
    such rows (overall, or per group defined by `by`) is divided by
    ``week * norm`` and left-joined onto `index`.

    Parameters
    ----------
    by : list of str, str, or None
        Column(s) to group by. ``None`` (or an empty list) yields one overall
        frequency that is joined on ``"week"`` only.
    norm : number
        Extra normalisation factor; the denominator is ``week * norm``.
    week : int
        Target week. Baskets with ``baskets["week"] >= week`` are ignored and
        the computed frequencies are joined to rows of `index` with this week.
    baskets : pandas.DataFrame
        Purchase records with a ``"week"`` column and the `by` columns.
    index : pandas.DataFrame
        Rows to score; must contain ``"week"`` and the `by` columns. It is
        not modified.
    variable : str, default "phat"
        Name of the resulting frequency column.
    fillna : bool, default False
        If True, rows of `index` without a matching frequency get 0.
        If False, every row must have a match.

    Returns
    -------
    pandas.DataFrame
        `index` (same row order) with the extra column `variable`.

    Raises
    ------
    AssertionError
        If `fillna` is False and some row of `index` has no frequency.
    """
    # Normalise `by` so that a single column name or a tuple also works.
    if by is None:
        group_cols = []
    elif isinstance(by, str):
        group_cols = [by]
    else:
        group_cols = list(by)

    history = baskets[baskets["week"] < week]
    denominator = week * norm

    if group_cols:
        pf_t = (
            (history.groupby(group_cols)[["week"]].count() / denominator)
            .rename(columns={"week": variable})
            .reset_index()
        )
    else:
        pf_t = pd.DataFrame({variable: [history.shape[0] / denominator]})
    pf_t["week"] = week

    # merge() already returns a new frame, so `index` is left untouched.
    out = index.merge(pf_t, on=["week"] + group_cols, how="left")

    if fillna:
        # Assign the result back instead of an in-place fill on a column
        # selection, which is unreliable under pandas Copy-on-Write.
        out[variable] = out[variable].fillna(0)
    elif out[variable].isnull().any():
        raise AssertionError(
            f"'{variable}' is missing for some index rows; pass fillna=True to use 0"
        )
    return out

## Input

In [4]:
# Number of shoppers to analyse: baskets are restricted to shopper ids below I
# and the index grid is built from np.arange(I).
I = 1_000

## Data

In [5]:
# Load the raw baskets, keep the first I shoppers and only weeks after 70.
baskets = pd.read_parquet("/private/data/teaching/dr-s-2/baskets.parquet")
baskets = (
    # make features a little worse by removing large fraction of weeks
    baskets[(baskets["shopper"] < I) & (baskets["week"] > 70)]
    # assign() returns a new frame, so the label column is not written into a
    # filtered slice of the raw data (avoids SettingWithCopyWarning).
    .assign(y=1)
)

### Index DataFrame

In [6]:
# Full grid of shopper (0..I-1) x product (0..249) combinations for week 89.
index_numpy = np.array(np.meshgrid(np.arange(I), np.arange(250), [89]))
# Transpose and flatten so that each row is one (shopper, product, week) triple.
index = pd.DataFrame(
    data=np.transpose(index_numpy).reshape(-1, 3),
    columns=["shopper", "product", "week"],
)

### Target variable

In [7]:
# Left-join the purchase indicator onto the index grid; combinations without a
# matching basket row were not purchased and get y = 0.
target = index.merge(
    baskets[["shopper", "product", "week", "y"]],
    how="left",
    on=["shopper", "product", "week"],
).assign(y=lambda frame: frame["y"].fillna(0).astype(int))

### Data without target week, no leakage possible

In [8]:
# Keep only baskets from weeks strictly before the target week 89.
baskets_train = baskets.loc[baskets["week"] < 89]

### Two meaningful features

In [9]:
# Purchase count per (shopper, product) before week 89, divided by the week
# number; combinations never purchased get 0.
shopper_product_frequency = purchase_frequency(
    by=["shopper", "product"],
    norm=1,
    week=89,
    baskets=baskets_train,
    index=index,
    variable="shopper_product_frequency",
    fillna=True,
)

In [10]:
# Purchase count per product before week 89, divided by the week number and
# by I; products never purchased get 0.
product_frequency = purchase_frequency(
    by=["product"],
    norm=I,
    week=89,
    baskets=baskets_train,
    index=index,
    variable="product_frequency",
    fillna=True,
)

### One useless feature

In [11]:
# Seeded generator so that the useless baseline feature (and its AUC below)
# is identical on every Restart & Run All.
rng = np.random.default_rng(42)
random_feature = index.copy()
random_feature["random"] = rng.uniform(0, 1, random_feature.shape[0])
random_feature

Unnamed: 0,shopper,product,week,random
0,0,0,89,0.071560
1,0,1,89,0.081979
2,0,2,89,0.032394
3,0,3,89,0.418805
4,0,4,89,0.841159
...,...,...,...,...
249995,999,245,89,0.038118
249996,999,246,89,0.870998
249997,999,247,89,0.841299
249998,999,248,89,0.806869


### Two features with target variable leakage

In [12]:
# Flag every (shopper, product) pair purchased in any week after 86. This
# window includes the target week 89 from the index, hence the leakage.
tmp = (
    baskets.loc[baskets["week"] > 86]
    .groupby(["shopper", "product"], as_index=False)[["y"]]
    .sum()
    .assign(y_int=1)
)
# Pairs without a purchase in that window get y_int = 0.
last_three_weeks = index.merge(
    tmp[["shopper", "product", "y_int"]],
    how="left",
    on=["shopper", "product"],
).assign(y_int=lambda frame: frame["y_int"].fillna(0).astype(int))

In [13]:
# Flag every (shopper, product) pair purchased in any week after 88. This
# covers the target week 89 itself, so the feature leaks the target.
tmp = (
    baskets.loc[baskets["week"] > 88]
    .groupby(["shopper", "product"], as_index=False)[["y"]]
    .sum()
    .assign(y_int=1)
)
# Pairs without a purchase in that window get y_int = 0.
last_week = index.merge(
    tmp[["shopper", "product", "y_int"]],
    how="left",
    on=["shopper", "product"],
).assign(y_int=lambda frame: frame["y_int"].fillna(0).astype(int))

## Evaluation

### Check that indices are identical ...

... so we don't need to merge.

In [14]:
idx = ["shopper", "product", "week"]
# Every feature frame must list the same (shopper, product, week) rows in the
# same order as the target, otherwise positional comparison would be wrong.
for feature_frame in (
    shopper_product_frequency,
    product_frequency,
    random_feature,
    last_three_weeks,
    last_week,
):
    assert np.all(target[idx].values == feature_frame[idx].values)

### Clean features

In [15]:
# Shopper-product history: expected to be the most informative clean feature.
pr_auc(y=target["y"], p=shopper_product_frequency["shopper_product_frequency"])

0.40822672984832264

In [16]:
# Product-level popularity only; no shopper-specific information.
pr_auc(y=target["y"], p=product_frequency["product_frequency"])

0.06303721659220562

In [17]:
# Random scores serve as the no-information baseline.
pr_auc(y=target["y"], p=random_feature["random"])

0.02998192635303184

### Features with leakage

In [18]:
# The window overlaps the target week, so the score is inflated by leakage.
pr_auc(y=target["y"], p=last_three_weeks["y_int"])

0.7174136748227762

In [19]:
# Feature built from the target week itself: a score of 1 signals full leakage.
pr_auc(y=target["y"], p=last_week["y_int"])

1.0

&mdash; <br>
Dr. Sebastian Gabel <br>
Machine Learning in Marketing &ndash; Exercise 7 <br>
2020 <br>