In [1]:
import operator
import numpy as np
import pandas as pd

In [2]:
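# Compute the time intervals between consecutive queries of each file, plus the share of intervals that are 0, below 10, below 30, and so on.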

def interval(x):
    """Summarise the gaps between consecutive values of ``x`` after sorting.

    Parameters
    ----------
    x : iterable
        Values that can be sorted and subtracted from one another
        (the callers in this notebook pass the ``queryts`` column of a group).

    Returns
    -------
    tuple
        ``(mean, std, sic1, sic2, sic3, sic4, sic5, lic)`` where ``mean`` and
        ``std`` describe the gaps, ``sic1``..``sic5`` are the numbers of gaps
        strictly below 1, 10, 30, 60 and 300 respectively (cumulative counts),
        and ``lic`` is the number of gaps greater than or equal to 300.
    """
    ordered = list(x)
    ordered.sort()

    if len(ordered) <= 1:
        # With fewer than two values there is no real gap; a single
        # placeholder gap of 0 is used so the statistics stay defined.
        gaps = [0]
    else:
        gaps = [later - earlier for earlier, later in zip(ordered, ordered[1:])]

    gap_series = pd.Series(gaps)
    # Cumulative counts: every threshold includes the gaps counted by the smaller ones.
    below = tuple(
        int((gap_series < bound).sum()) for bound in (1, 10, 30, 60, 300)
    )
    at_least_300 = int((gap_series >= 300).sum())

    return (np.mean(gaps), np.std(gaps)) + below + (at_least_300,)

def groupby_queryts(df_val):
    """Build one row of ``queryts`` interval features per ``file_id``.

    Parameters
    ----------
    df_val : pandas.DataFrame
        Must contain the columns ``file_id`` and ``queryts``.

    Returns
    -------
    pandas.DataFrame
        Columns ``file_id``, ``file_id_inteval_mean``, ``file_id_inteval_std``,
        ``file_id_sir1`` .. ``file_id_sir5`` and ``file_id_lir``.  The ratio
        columns are interval counts divided by the number of ``queryts``
        entries of the file.  The shape of the result is printed as a side
        effect.
    """
    agg_spec = {'queryts': ["count", lambda x: interval(x)]}
    per_file = df_val.groupby(["file_id"]).aggregate(agg_spec).reset_index()
    per_file.columns = ["file_id", "_count", "queryts_list"]

    # Expand the tuple returned by interval() into one column per statistic.
    stats = per_file["queryts_list"].apply(pd.Series)
    stats.columns = [
        "file_id_inteval_mean", "file_id_inteval_std",
        "sic1", "sic2", "sic3", "sic4", "sic5", "lic"
    ]
    table = pd.concat([per_file[["file_id", "_count"]], stats], axis=1)

    n_rows = table["_count"]
    cumulative = ["sic1", "sic2", "sic3", "sic4", "sic5"]

    # The sic* counts are cumulative, so each bucket is the difference between
    # neighbouring thresholds; the first bucket has nothing to subtract.
    table["file_id_sir1"] = table["sic1"] / n_rows
    for pos in range(1, len(cumulative)):
        bucket = table[cumulative[pos]] - table[cumulative[pos - 1]]
        table["file_id_sir" + str(pos + 1)] = bucket / n_rows
    table["file_id_lir"] = table["lic"] / n_rows

    # Only the ratios (plus mean/std) are kept; raw counts are intermediates.
    table = table.drop(["_count"] + cumulative + ["lic"], axis=1)

    print (table.shape)
    return table

In [3]:
def groupby_queryts_ed(df_, n_days=3):
    """Build per-file ``queryts`` interval features for each file's first days.

    Step 1 computes the same statistics as ``groupby_queryts`` but per
    ``(file_id, date_new)`` pair.  Step 2 pivots the rows belonging to the
    first ``n_days`` distinct offsets from each file's earliest ``date_new``
    into columns suffixed ``_d1``, ``_d2``, ...

    Parameters
    ----------
    df_ : pandas.DataFrame
        Must contain the columns ``file_id``, ``date_new`` and ``queryts``.
        NOTE(review): ``date_new`` is assumed to be a numeric day index so
        that ``date_new - min(date_new)`` compares equal to 0, 1, 2, ... --
        confirm against the dataset.
    n_days : int, optional
        Number of day offsets (0 .. n_days-1) to turn into columns.  The
        default of 3 reproduces the original fixed behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per ``file_id`` with ``1 + 8 * n_days`` columns; days on which
        a file has no rows are filled with 0.  The shape of the result is
        printed as a side effect.

    Raises
    ------
    ValueError
        If ``n_days`` is smaller than 1.
    """
    if n_days < 1:
        raise ValueError("n_days must be at least 1")

    # step 1: interval statistics per file and day
    f = {'queryts': ["count", lambda x: interval(x)]}

    g = df_.groupby(["file_id", "date_new"]).aggregate(f).reset_index()
    g.columns = ["file_id", "date_new", "_count", "queryts_list"]

    # Expand the tuple returned by interval() into one column per statistic.
    g_ = g['queryts_list'].apply(pd.Series)
    g_.columns = [
        "file_id_inteval_mean", "file_id_inteval_std",
        "sic1", "sic2", "sic3", "sic4", "sic5", "lic"
    ]
    g = pd.concat([g[["file_id", "date_new", "_count"]], g_], axis=1)
    # sic* are cumulative counts, so neighbouring differences give the buckets.
    g["file_id_sir1"] = g["sic1"] / g["_count"]
    g["file_id_sir2"] = (g["sic2"] - g["sic1"]) / g["_count"]
    g["file_id_sir3"] = (g["sic3"] - g["sic2"]) / g["_count"]
    g["file_id_sir4"] = (g["sic4"] - g["sic3"]) / g["_count"]
    g["file_id_sir5"] = (g["sic5"] - g["sic4"]) / g["_count"]
    g["file_id_lir"] = g["lic"] / g["_count"]
    g = g.drop(["_count", "sic1", "sic2", "sic3", "sic4", "sic5", "lic"], axis=1)

    # step 2: offset of each row's date from the first date seen for its file.
    # transform() returns a result aligned on g's own index, so the assignment
    # does not depend on the row order of g.
    feature_cols = [c for c in g.columns if c not in ("file_id", "date_new")]
    first_date = g.groupby("file_id")["date_new"].transform("min")
    g["date_perfile"] = g["date_new"] - first_date

    def _day_slice(day):
        # Rows of one day offset, with feature columns renamed to "<name>_d<day+1>".
        rows = g.loc[g["date_perfile"] == day, ["file_id"] + feature_cols]
        renamed = dict((c, c + "_d" + str(day + 1)) for c in feature_cols)
        return rows.rename(columns=renamed)

    # Every file has exactly one row at offset 0, so that slice is the base
    # table; later days are attached with left joins and may be missing.
    g2 = _day_slice(0).reset_index(drop=True)
    for day in range(1, n_days):
        g2 = pd.merge(g2, _day_slice(day), how="left", on="file_id")

    g2 = g2.fillna(0)

    print (g2.shape)
    return g2

In [4]:
# Load the training and validation splits and stack them into one frame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
# NOTE(review): read_pickle executes arbitrary code from the file; only use
# it on pickles produced by a trusted step of this project.
df = pd.concat(
    [
        pd.read_pickle("dataset/" + str(file_) + ".pkl")
        for file_ in ["train1", "train2", "train3", "valid"]
    ],
    axis=0,
)

# Whole-history features and first-days features, one row per file_id each.
g_train = groupby_queryts(df)
g2 = groupby_queryts_ed(df)

g_train = pd.merge(g_train, g2, how = "left", on = "file_id")
print (g_train.shape)

(52518, 9)
(52518, 25)
(52518, 33)


In [5]:
# Load the test splits and stack them into one frame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
# NOTE(review): read_pickle executes arbitrary code from the file; only use
# it on pickles produced by a trusted step of this project.
df = pd.concat(
    [
        pd.read_pickle("dataset/" + str(file_) + ".pkl")
        for file_ in ["test_1", "test_2", "test_3"]
    ],
    axis=0,
)

# Whole-history features and first-days features, one row per file_id each.
g_test = groupby_queryts(df)
g2 = groupby_queryts_ed(df)

g_test = pd.merge(g_test, g2, how = "left", on = "file_id")
print (g_test.shape)

(29376, 9)
(29376, 25)
(29376, 33)


In [6]:
# Stack the train and test feature tables (train rows first) and persist
# them as a single CSV without the row index.
feature_tables = [g_train, g_test]
g = pd.concat(feature_tables, axis=0)

g.to_csv("dataset/queryts.csv", index=False)

In [7]:
# Sanity check: overall shape, then the first row as a column -> value listing.
print(g.shape)
g.iloc[0]

(81894, 33)


file_id                    0000e2398b12121a85166fed5fe2a3da
file_id_inteval_mean                                   8116
file_id_inteval_std                                 42742.1
file_id_sir1                                      0.0638298
file_id_sir2                                       0.765957
file_id_sir3                                      0.0425532
file_id_sir4                                              0
file_id_sir5                                              0
file_id_lir                                        0.106383
file_id_inteval_mean_d1                             526.738
file_id_inteval_std_d1                              1954.01
file_id_sir1_d1                                   0.0697674
file_id_sir2_d1                                    0.790698
file_id_sir3_d1                                   0.0465116
file_id_sir4_d1                                           0
file_id_sir5_d1                                           0
file_id_lir_d1                          