In [1]:
import glob
import numpy as np
import pandas as pd

In [2]:
# Read the training and testing file lists, tagging every row with the split
# it came from so the two remain distinguishable once they are stacked.
df_train_set = pd.read_csv("dataset/training-set.csv").assign(type='train')
df_test_set = pd.read_csv("dataset/testing-set.csv").assign(type='test')

# Stack both lists row-wise; each frame keeps its own row index.
df_set = pd.concat([df_train_set, df_test_set])
print(df_set.shape)

(81977, 3)


In [3]:
# Per-file feature table, left-joined to the split list on file_id so every
# feature row is kept and gains the split list's columns (including "type").
df_ = pd.read_csv("dataset/file_cust_prod_ct_uct.csv").merge(
    df_set, how='left', on='file_id'
)

In [4]:
# Split the joined table three ways:
#   train -> type == 'train' and date_min <= 51
#   val   -> type == 'train' and date_min  > 51
#   test  -> type == 'test'
# Both date comparisons are spelled out (rather than negating one) so rows
# with a missing date_min fall into neither train nor val.
is_train_row = df_["type"] == 'train'
is_test_row = df_["type"] == 'test'

df_train = df_[is_train_row & (df_["date_min"] <= 51)]
df_val = df_[is_train_row & (df_["date_min"] > 51)]
df_test = df_[is_test_row]

print(df_train.shape, df_val.shape, df_test.shape)

(45694, 41) (6824, 41) (29376, 41)


In [5]:
# Load the three training-period pickles and stack them with ONE concat.
# Growing a frame inside the loop re-copies every earlier row on each pass,
# and seeding it with an empty DataFrame relies on concat ignoring empty
# inputs when it decides the result dtypes. Building a list first avoids both,
# and also leaves the per-file table `df_` from the earlier cell untouched.
# NOTE: read_pickle can execute code embedded in the file; only load pickles
# that were produced locally by a trusted pipeline.
train_parts = [
    pd.read_pickle("dataset/" + str(month_) + ".pkl")
    for month_ in ["train1","train2","train3"]
]
df = pd.concat(train_parts, axis=0, ignore_index=True)
del train_parts  # the individual parts are not needed once stacked

# Earliest date_new for every (file_id, customer_id) pair.
f = {'date_new': ["min"]}
g = df.groupby(["file_id","customer_id"]).aggregate(f).reset_index()
g.columns = ["file_id","customer_id","first_date"]

# Attach the split list's columns to every pair, then order the rows by
# first_date (ties broken by file_id). The running per-customer statistics
# computed later depend on this row order.
g = pd.merge(g, df_set, how='left', on='file_id')
g = g.sort_values(by=["first_date", "file_id"]).reset_index(drop=True)
print (g.shape)

(34956818, 5)


In [6]:
# Keep the (file, customer) rows whose file_id belongs to each split.
# g_train is copied explicitly because new columns are written onto it in a
# later cell; assigning into an un-copied selection of `g` is the pattern that
# triggers pandas' SettingWithCopyWarning. g_val is only read afterwards, so
# it does not need its own copy.
g_train = g[g.file_id.isin(df_train.file_id)].copy()
g_val = g[g.file_id.isin(df_val.file_id)]

print (g_train.shape, g_val.shape)

(34956818, 5) (0, 5)


In [7]:
# Running per-customer label statistics, taken in g_train's current row order.
labels_by_customer = g_train.groupby(['customer_id'])['label']
running_label_sum = labels_by_customer.cumsum()
running_row_count = labels_by_customer.cumcount()

g_train['label_cumsum'] = running_label_sum
g_train['label_cumcount'] = running_row_count

# Remove the row's own label so only the customer's earlier rows contribute.
g_train["label_cs"] = g_train['label_cumsum'] - g_train['label']

# Mean of the earlier labels. The tiny epsilon makes a customer's first row
# (count 0) evaluate 0 / 1e-10 instead of 0 / 0.
g_train["cust_me"] = g_train['label_cs'] / (g_train['label_cumcount'] + 1e-10)

In [8]:
# Collapse the per-(file, customer) encodings into four statistics per file.
f = {'cust_me': ["mean","std","max","min"]}
per_file_stats = g_train.groupby(["file_id"]).agg(f)

# Flatten the two-level column header produced by the dict-of-lists spec.
g_train_me = per_file_stats.reset_index()
g_train_me.columns = ["file_id", "cust_me_mean", "cust_me_std", "cust_me_max", "cust_me_min"]
print(g_train_me.shape)

(44830, 5)


### Testing Set Mean Encoding

In [9]:
# For every customer present in g_train keep:
#   last_date -> the largest first_date among their rows
#   cust_me   -> the cust_me value on their final row (in g_train's row order)
# .iloc[-1] reads the last element directly; the previous list(x)[-1] copied
# each customer's whole group into a Python list just to read one value.
# NOTE(review): cust_me on a row excludes that row's own label, so the value
# kept here leaves out the customer's last training label - confirm intended.
f = {'first_date': ["max"], 'cust_me': lambda x: x.iloc[-1]}
g_cust_last = g_train.groupby(["customer_id"]).aggregate(f).reset_index()
g_cust_last.columns = ["customer_id", "last_date", "cust_me"]

In [10]:
# Load the testing-period and validation pickles and stack them with ONE
# concat. Growing a frame inside the loop re-copies every earlier row on each
# pass, and seeding it with an empty DataFrame relies on concat ignoring empty
# inputs when it decides the result dtypes. Building a list first avoids both,
# and also leaves the per-file table `df_` from the earlier cell untouched.
# NOTE: read_pickle can execute code embedded in the file; only load pickles
# that were produced locally by a trusted pipeline.
test_parts = [
    pd.read_pickle("dataset/" + str(month_) + ".pkl")
    for month_ in ["test_1","test_2","test_3","valid"]
]
df = pd.concat(test_parts, axis=0, ignore_index=True)
del test_parts  # the individual parts are not needed once stacked

# Earliest date_new for every (file_id, customer_id) pair.
f = {'date_new': ["min"]}
g = df.groupby(["file_id","customer_id"]).aggregate(f).reset_index()
g.columns = ["file_id","customer_id","first_date"]

# Append any validation rows that were found in the training pickles; their
# extra columns are simply missing (NaN) for the rows loaded above.
g = pd.concat([g, g_val], axis=0)

# Look up each customer's stored encoding from the training side. Customers
# without a match keep NaN in last_date / cust_me because the join is a left join.
g = pd.merge(g, g_cust_last, how='left', on='customer_id')
g = g.sort_values(by=["first_date", "file_id"]).reset_index(drop=True)
print (g.shape)

(27040934, 7)


In [11]:
# Collapse the looked-up customer encodings into four statistics per file.
f = {'cust_me': ["mean","std","max","min"]}
per_file_stats = g.groupby(["file_id"]).agg(f)

# Flatten the two-level column header produced by the dict-of-lists spec.
g_test_me = per_file_stats.reset_index()
g_test_me.columns = ["file_id", "cust_me_mean", "cust_me_std", "cust_me_max", "cust_me_min"]
print(g_test_me.shape)

(40117, 5)


In [12]:
# Stack the training-side and testing-side per-file statistics row-wise.
g = pd.concat([g_train_me, g_test_me])

In [13]:
# Count of missing values in each column of the combined table.
g.isna().sum()

file_id            0
cust_me_mean     878
cust_me_std     4581
cust_me_max      878
cust_me_min      878
dtype: int64

In [14]:
# Mean / min / max: replace missing values with that column's own mean,
# computed over the whole combined table.
for stat_col in ['cust_me_mean', 'cust_me_min', 'cust_me_max']:
    g[stat_col] = g[stat_col].fillna(g[stat_col].mean())

# Std: missing values get the sentinel -999.0 instead of a mean.
g['cust_me_std'] = g['cust_me_std'].fillna(-999.0)

In [15]:
# Write the combined per-file encoding table to disk without the row index.
output_path = "dataset/cust_ec_new.csv"
g.to_csv(output_path, index=False)