In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train_set = pd.read_csv("dataset/training-set.csv")
df_test_set = pd.read_csv("dataset/testing-set.csv")

df_train_set["type"] = 'train'
df_test_set["type"] = 'test'

df_set = pd.concat([df_train_set, df_test_set], axis = 0)
print (df_set.shape)

(81977, 3)


In [3]:
df_ = pd.read_csv("dataset/file_cust_prod_ct_uct.csv")
df_ = pd.merge(df_, df_set, how='left', on='file_id')

In [4]:
df_train_set = df_[(df_.type == 'train')&(df_.date_min <= 51)]
df_val_set = df_[(df_.type == 'train')&(df_.date_min > 51)]
df_test_set = df_[(df_.type == 'test')]

print (df_train_set.shape, df_val_set.shape, df_test_set.shape)

(45694, 41) (6824, 41) (29376, 41)


In [5]:
df = pd.DataFrame()
for month_ in ["train1","train2","train3","valid"]:
    df_ = pd.read_pickle("dataset/" + str(month_) + ".pkl")
    df = pd.concat([df, df_], axis=0)

df = df.reset_index(drop=True)

In [6]:
df_train = df[df.file_id.isin(df_train_set.file_id)]
df_val = df[df.file_id.isin(df_val_set.file_id)]

print (df.shape, df_train.shape, df_val.shape)

(54250245, 7) (48899216, 7) (5351029, 7)


In [7]:
df = pd.DataFrame()
for month_ in ["test_1","test_2","test_3"]:
    df_ = pd.read_pickle("dataset/" + str(month_) + ".pkl")
    df = pd.concat([df, df_], axis=0)

df_test = pd.concat([df, df_val], axis=0)
df_test = df_test.reset_index(drop=True)

In [8]:
df_train['custXprod_id'] = df_train.customer_id + df_train.product_id
df_test['custXprod_id'] = df_test.customer_id + df_test.product_id

### 篩選不重要顧客

In [9]:
g_ = pd.read_csv("dataset/customer_id.csv")

In [10]:
g_ap = g_[(g_.cust_count > 1) & (g_.cust_date_min < 61) & (g_.cust_date_max > 60)]

In [11]:
df_train = df_train[df_train['customer_id'].isin(g_ap.customer_id)].reset_index(drop=True)
df_test = df_test[df_test['customer_id'].isin(g_ap.customer_id)].reset_index(drop=True)

print (df_train.shape, df_test.shape)

(33706609, 8) (32989698, 8)


In [12]:
df_train = pd.merge(df_train, df_set, how="left", on="file_id")

### Training Points

In [None]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by = trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 

    return add_noise(ft_trn_series, noise_level)

In [None]:
# 第三天顧客的分數 = 第一天 + 第二天的 target encoding

old_len = 0
trn = df_train[(df_train.date_new == 1)]
trn["cust_enc"] = None
for day_ in range(2, np.max(df_train.date_new)+1):
    df_before = df_train[(df_train.date_new < day_)] # 這天以前
    trn_ = target_encode(
        trn_series = df_before["custXprod_id"], target = df_before.label, 
        min_samples_leaf=100, smoothing=10, noise_level=0.01)
    df_before["cust_enc"] = trn_

    f = {'cust_enc': ['mean']} # 算出這天之前的分數
    g_cust = df_before.groupby(["custXprod_id"]).aggregate(f).reset_index()
    g_cust.columns = ["custXprod_id", "cust_enc"]

    df_day = df_train[(df_train.date_new == day_)] # 連接當天資料
    df_day = pd.merge(df_day, g_cust, how="left", on="custXprod_id")
    trn = pd.concat([trn, df_day], axis=0)

trn.cust_enc = pd.to_numeric(trn.cust_enc, errors = 'coerce')
print (trn.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
f = {'cust_enc': ['mean', 'std', 'max', 'min']}
g_train = trn.groupby(["file_id"]).aggregate(f).reset_index()
g_train.columns = ["file_id", "custXprod_enc_mean", "custXprod_enc_std",
                   "custXprod_enc_max", "custXprod_enc_min"]

print (g_train.shape)

### Test Points

In [None]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by = trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index

    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [None]:
trn, sub = target_encode(df_train["custXprod_id"], 
                         df_test["custXprod_id"], 
                         target=df_train.label, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)
df_test["cust_enc"] = sub

In [None]:
f = {'cust_enc': ['mean', 'std', 'max', 'min']}

g_test = df_test.groupby(["file_id"]).aggregate(f).reset_index()
g_test.columns = ["file_id", "custXprod_enc_mean", "custXprod_enc_std",
                  "custXprod_enc_max", "custXprod_enc_min"]
g = pd.concat([g_train, g_test], axis=0)

f = {'custXprod_enc_mean': 'mean','custXprod_enc_std': 'mean',
     'custXprod_enc_max': 'max','custXprod_enc_min': 'min'}
g = g.groupby(["file_id"]).aggregate(f).reset_index()
g.columns = ["file_id", "custXprod_enc_mean", "custXprod_enc_std",
             "custXprod_enc_max", "custXprod_enc_min"]
print (g_train.shape, g_test.shape)

In [None]:
g['custXprod_enc_mean'] = g['custXprod_enc_mean'].fillna(g['custXprod_enc_mean'].mean())
g['custXprod_enc_min'] = g['custXprod_enc_min'].fillna(g['custXprod_enc_min'].min())
g['custXprod_enc_max'] = g['custXprod_enc_max'].fillna(g['custXprod_enc_max'].max())
g['custXprod_enc_std']=g['custXprod_enc_std'].fillna(-999.0)

In [None]:
g.isnull().sum()

In [None]:
g.to_csv("dataset/custXprod_enc_all_perday.csv", index=False)