In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load every training shard first and concatenate once at the end.
# Calling pd.concat inside the loop re-copies all previously loaded rows on
# each iteration and requires seeding the result with an empty placeholder frame.
# NOTE(review): read_pickle can execute arbitrary code on load - only use it
# on files from a trusted source.
train_parts = []
for file_ in ["train1","train2","train3"]:
    df_ = pd.read_pickle("dataset/" + str(file_) + ".pkl")
    train_parts.append(df_)
df_train = pd.concat(train_parts, axis=0)

print (df_train.shape)

(46639952, 7)


In [3]:
# Load every test/validation shard first and concatenate once at the end.
# Calling pd.concat inside the loop re-copies all previously loaded rows on
# each iteration and requires seeding the result with an empty placeholder frame.
# NOTE(review): read_pickle can execute arbitrary code on load - only use it
# on files from a trusted source.
test_parts = []
for file_ in ["test_1","test_2","test_3","valid"]:
    df_ = pd.read_pickle("dataset/" + str(file_) + ".pkl")
    test_parts.append(df_)
df_test = pd.concat(test_parts, axis=0)

print (df_test.shape)

(36633158, 7)


In [6]:
# Read the two file_id-keyed CSV tables (training and testing sets).
df_train_set = pd.read_csv("dataset/training-set.csv")
df_test_set = pd.read_csv("dataset/testing-set.csv")

# Stack both tables row-wise and report the combined size.
set_tables = [df_train_set, df_test_set]
df_set = pd.concat(set_tables, axis=0)
print(df_set.shape)

(81977, 2)


In [8]:
# Left-join the training-set table onto the query rows by file_id, keeping every row of df_train.
df_train = df_train.merge(df_train_set, on="file_id", how="left")

In [10]:
# Count missing values per column after the merge.
df_train.isna().sum()

file_id        0
customer_id    0
queryts        0
product_id     0
month          0
date           0
date_new       0
label          0
dtype: int64

### 商品行為

In [11]:
def add_noise(series, noise_level):
    """
    Apply multiplicative Gaussian noise to a series.

    Every value is multiplied by (1 + noise_level * z), where z is an
    independent standard-normal draw from NumPy's global random generator.

    series : values to perturb (anything with a length that supports `*`)
    noise_level : scale of the relative perturbation; 0 leaves values unchanged
    """
    jitter = np.random.randn(len(series))
    scale = 1 + noise_level * jitter
    return series * scale

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothed target (mean) encoding of a categorical feature.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    Each category seen in training is encoded as a blend of its own target
    mean and the global target mean (the prior):
        weight   = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))
        encoding = prior * (1 - weight) + category_mean * weight
    Categories without an encoding (unseen in training, or missing) get the prior.

    trn_series : training categorical feature as a pd.Series (must have a string name)
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series (same length as trn_series)
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level (float) : relative noise scale forwarded to add_noise for both outputs

    Returns a tuple (encoded_train, encoded_test) of float Series named
    "<feature name>_mean", each carrying the index of its input series.

    Raises AssertionError if trn_series and target differ in length or if
    trn_series and tst_series have different names.
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Use fixed internal labels so the computation cannot collide with the
    # caller's names (e.g. a target called "mean"/"count", an unnamed target,
    # or a target sharing the feature's name)
    temp.columns = ["_category", "_target"]
    # Compute target mean and sample count per category
    averages = temp.groupby(by="_category")["_target"].agg(["mean", "count"])
    # Compute smoothing weight (kept separate from the `smoothing` parameter)
    weight = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Global target mean used as the prior
    prior = target.mean()
    # The bigger the count the less the prior is taken into account
    encoding = prior * (1 - weight) + averages["mean"] * weight
    encoded_name = trn_series.name + '_mean'

    def _encode(series):
        # Look the categories up through their plain values so categorical or
        # extension dtypes still yield an ordinary float Series; Series.map
        # keeps the original index, so no merge / index restoration is needed
        keys = pd.Series(np.asarray(series), index=series.index)
        return keys.map(encoding).fillna(prior).rename(encoded_name)

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)

    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [12]:
# Seed NumPy's global generator before encoding: target_encode perturbs its
# output with random noise (noise_level > 0), so without a fixed seed prod_enc
# and everything derived from it change on every run. The seed value itself is
# arbitrary.
np.random.seed(42)

# Target-encode product_id against the training label; the test rows are
# encoded with the statistics learned from the training rows.
trn, sub = target_encode(df_train["product_id"], 
                         df_test["product_id"], 
                         target=df_train.label, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)
df_train["prod_enc"] = trn
df_test["prod_enc"] = sub

In [13]:
# Flat column labels shared by every per-file summary table in this cell.
stat_columns = ["file_id","prod_enc_mean","prod_enc_std","prod_enc_max","prod_enc_min"]

# Summarise prod_enc per file_id separately for the train and test frames.
f = {'prod_enc': ['mean', 'std', 'max', 'min']}
per_split = []
for frame in (df_train, df_test):
    summary = frame.groupby(["file_id"]).agg(f).reset_index()
    summary.columns = stat_columns
    per_split.append(summary)
g_train, g_test = per_split

# Stack both summaries and collapse them to a single row per file_id:
# means and stds are averaged, max/min take the extreme across rows.
f = {'prod_enc_mean': 'mean','prod_enc_std': 'mean',
     'prod_enc_max': 'max','prod_enc_min': 'min'}
stacked = pd.concat([g_train, g_test], axis=0)
g = stacked.groupby(["file_id"]).agg(f).reset_index()
g.columns = stat_columns

print(g.shape)

(81894, 5)


In [14]:
# Show the first row of the aggregated feature table as a sanity check.
g.iloc[0]

file_id          00008c73ee43c15b16c26b26398c1577
prod_enc_mean                            0.125596
prod_enc_std                            0.0496438
prod_enc_max                             0.240221
prod_enc_min                            0.0316121
Name: 0, dtype: object

In [15]:
# Write the per-file features to disk without the positional index column.
g.to_csv(path_or_buf="dataset/prod_enc.csv", index=False)