In [1]:
import operator
import pandas as pd
import numpy as np

1. 每個檔案(每日檔案數量, 每日顧客唯一數量, 每日商品唯一數量, 每日顧客X商品唯一數量)
2. Per-file summaries of the daily file, cust, prod and custXprod counts, such as the range (max - min).
3. List, for each file and each day: (每日檔案數量, 每日顧客唯一數量, 每日商品唯一數量, 每日顧客X商品唯一數量)

In [2]:
def groupby_ct_uct(df_val):
    """Summarise every ``file_id`` with count / distinct-count features.

    Parameters
    ----------
    df_val : pandas.DataFrame
        Needs the columns ``file_id``, ``customer_id``, ``product_id``,
        ``custXprod_id`` and ``date_new``.

    Returns
    -------
    pandas.DataFrame
        One row per ``file_id`` with the columns ``file_ct`` (non-null
        ``customer_id`` count), ``cust_uct``, ``prod_uct``, ``custXprod_uct``
        (distinct counts), ``date_max``, ``date_min``, ``date_count``
        (distinct ``date_new`` values), ``date_during``
        (``date_max - date_min + 1``), ``file_freq``
        (``file_ct / date_count``) and ``custXprod_freq``
        (``custXprod_uct / date_count``).

    The shape of the result is printed as a side effect.
    """
    def distinct_count(values):
        # Size of the set of values seen inside one group.
        return len(set(values))

    agg_spec = {
        'customer_id': ['count', distinct_count],
        'product_id': distinct_count,
        'custXprod_id': distinct_count,
        'date_new': ["max", "min", distinct_count],
    }
    per_file = df_val.groupby(["file_id"]).aggregate(agg_spec).reset_index()

    # The aggregation yields two-level column labels; replace them with flat
    # names, listed in the same order as the entries of ``agg_spec``.
    per_file.columns = [
        "file_id",
        "file_ct", "cust_uct",
        "prod_uct",
        "custXprod_uct",
        "date_max", "date_min", "date_count",
    ]

    n_dates = per_file["date_count"]
    per_file["date_during"] = per_file["date_max"] - per_file["date_min"] + 1
    per_file["file_freq"] = per_file["file_ct"] / n_dates
    per_file["custXprod_freq"] = per_file["custXprod_uct"] / n_dates

    print ("feature of cust, prod and date :", per_file.shape)
    return per_file

In [3]:
def two_minus_one(x):
    """Return the second element of ``x`` minus its first element.

    ``x`` may be any iterable; it is materialised into a list first. When it
    holds a single element the result is that element minus itself (a zero of
    the element's type). An empty iterable raises ``IndexError``.
    """
    values = list(x)
    first = values[0]
    # With only one element there is no "second" value: subtract the first
    # element from itself instead.
    second = values[1] if len(values) > 1 else first
    return second - first


def groupby_ct_uct_ed(df_val, n_days=6):
    """Build per-day ("ed" = each day) count features for every ``file_id``.

    Parameters
    ----------
    df_val : pandas.DataFrame
        Needs the columns ``file_id``, ``date_new``, ``customer_id``,
        ``product_id`` and ``custXprod_id``.
        NOTE(review): ``date_new`` is assumed to be an integer day number, so
        that ``date_new - min(date_new)`` can be compared with 0, 1, 2, ...
        -- confirm against the loader.
    n_days : int, optional
        Number of day offsets (0 .. ``n_days - 1``, counted from each file's
        first date) that get their own columns. Defaults to 6, the value that
        used to be hard-coded. Rows with a larger offset are not included in
        the wide table.

    Returns
    -------
    (g2, g4) : tuple of pandas.DataFrame
        ``g2`` has one row per ``file_id`` and, for every day offset ``k``,
        the columns ``file_ct_ed_dk``, ``cust_uct_ed_dk``, ``prod_uct_ed_dk``
        and ``custXprod_uct_ed_dk``; offsets without data are filled with 0.
        ``g4`` has one row per ``file_id`` with ``file_ct_ed_mmm`` /
        ``custXprod_uct_ed_mmm`` (max minus min of the daily values) and
        ``file_ct_ed`` / ``custXprod_uct_ed`` (second observed day minus
        first observed day, computed by ``two_minus_one``).

    Raises
    ------
    ValueError
        If ``n_days`` is smaller than 1.

    The shapes of ``g4`` and ``g2`` are printed as a side effect.
    """
    if n_days < 1:
        raise ValueError("n_days must be at least 1")

    feature_cols = ["file_ct_ed", "cust_uct_ed", "prod_uct_ed", "custXprod_uct_ed"]

    def n_unique(values):
        # Number of distinct values inside one group.
        return len(set(values))

    # step 1: compute the per-day information
    f1 = {'customer_id': ['count', n_unique],
          'product_id': n_unique,
          'custXprod_id': n_unique
    }
    g1 = df_val.groupby(["file_id", "date_new"]).aggregate(f1).reset_index()
    g1.columns = ["file_id", "date_new"] + feature_cols

    # step 2: max_minus_min, two_minus_one
    daily = g1.groupby("file_id")[["file_ct_ed", "custXprod_uct_ed"]]
    g4_1 = (daily.max() - daily.min()).reset_index()
    g4_1.columns = ["file_id", "file_ct_ed_mmm", "custXprod_uct_ed_mmm"]

    # groupby sorts by (file_id, date_new), so within a file the first two
    # rows handed to two_minus_one are its first two observed days.
    f4 = {"file_ct_ed": two_minus_one,
          "custXprod_uct_ed": two_minus_one}
    g4_2 = g1.groupby("file_id").agg(f4).reset_index()
    g4_2.columns = ['file_id', 'file_ct_ed', 'custXprod_uct_ed']

    g4 = pd.merge(g4_1, g4_2, how="left", on="file_id")
    print ("feature about each day :", g4.shape)

    # step 3: list number each day
    # Day offset relative to the file's first date. ``transform`` keeps the
    # result aligned on g1's index, so no positional re-indexing is needed.
    first_date = g1.groupby("file_id")["date_new"].transform("min")
    g1["date_perfile"] = g1["date_new"] - first_date

    g2 = None
    for day in range(n_days):
        day_rows = g1.loc[g1["date_perfile"] == day, ["file_id"] + feature_cols]
        # Give the columns their final, day-specific names *before* merging.
        # Merging frames with identical column names makes pandas append
        # "_x"/"_y" suffixes, which collide from the third merge on (newer
        # pandas releases raise MergeError for the resulting duplicates).
        day_rows = day_rows.rename(
            columns={col: col + "_d" + str(day) for col in feature_cols})
        if g2 is None:
            # Every file has an offset-0 row, so day 0 defines the row set.
            g2 = day_rows.reset_index(drop=True)
        else:
            g2 = pd.merge(g2, day_rows, how="left", on="file_id")

    # Files without data at some offset get 0 for that day's features.
    g2 = g2.fillna(0)
    print ("feature about each day :", g2.shape)

    return g2, g4

### training set

In [4]:
# Load every training/validation split and stack them into one frame.
# All parts are read first and concatenated once, instead of growing a frame
# with repeated pd.concat calls that start from an empty DataFrame.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.concat(
    [pd.read_pickle("dataset/" + file_ + ".pkl")
     for file_ in ["train1", "train2", "train3", "valid"]],
    axis=0,
)

# Composite customer x product key.
# NOTE(review): assumes both id columns hold strings, so `+` concatenates
# them -- confirm against the pickled data.
df['custXprod_id'] = df.customer_id + df.product_id

# Per-file totals, then the per-day tables (wide day-by-day counts and the
# per-file summaries of the daily counts).
g_train = groupby_ct_uct(df)
g2, g4 = groupby_ct_uct_ed(df)

# Attach both per-day tables to the per-file totals.
g_train = pd.merge(g_train, g2, how = "left", on = "file_id")
g_train = pd.merge(g_train, g4, how = "left", on = "file_id")
print (g_train.shape)

feature of cust, prod and date : (52518, 11)
feature about each day : (52518, 5)
feature about each day : (52518, 25)
(52518, 39)


### testing set

In [5]:
# Load every testing split and stack them into one frame.
# All parts are read first and concatenated once, instead of growing a frame
# with repeated pd.concat calls that start from an empty DataFrame.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
df = pd.concat(
    [pd.read_pickle("dataset/" + file_ + ".pkl")
     for file_ in ["test_1", "test_2", "test_3"]],
    axis=0,
)

# Composite customer x product key.
# NOTE(review): assumes both id columns hold strings, so `+` concatenates
# them -- confirm against the pickled data.
df['custXprod_id'] = df.customer_id + df.product_id

# Per-file totals, then the per-day tables (wide day-by-day counts and the
# per-file summaries of the daily counts).
g_test = groupby_ct_uct(df)
g2, g4 = groupby_ct_uct_ed(df)

# Attach both per-day tables to the per-file totals.
g_test = pd.merge(g_test, g2, how = "left", on = "file_id")
g_test = pd.merge(g_test, g4, how = "left", on = "file_id")
print (g_test.shape)

feature of cust, prod and date : (29376, 11)
feature about each day : (29376, 5)
feature about each day : (29376, 25)
(29376, 39)


In [6]:
# Stack the training and testing feature tables row-wise (concat's default
# axis) and write the combined table to CSV without the index column.
g = pd.concat([g_train, g_test])
g.to_csv(path_or_buf="dataset/file_cust_prod_ct_uct.csv", index=False)

In [7]:
# Report the size of the combined feature table, then display the row at
# position 1000 as a spot check of the generated columns.
print(g.shape)
g.iloc[1000]

(81894, 39)


file_id                 04f6e4a3745d8016a3e61e097b8b573f
file_ct                                             4679
cust_uct                                            3796
prod_uct                                               7
custXprod_uct                                       3798
date_max                                               8
date_min                                               3
date_count                                             6
date_during                                            6
file_freq                                        779.833
custXprod_freq                                       633
file_ct_ed_d0                                        232
cust_uct_ed_d0                                       162
prod_uct_ed_d0                                         3
custXprod_uct_ed_d0                                  162
file_ct_ed_d1                                        481
cust_uct_ed_d1                                       380
prod_uct_ed_d1                 