## 4-3　過去データの結合

In [1]:
import gc
import operator
import numpy as np
import pandas as pd
import pandas.tseries.offsets as offsets

In [2]:
# Load the reservation records; the later cells in this section all derive
# their tables from this frame.
reserve_tb = pd.read_csv(filepath_or_buffer='reserve.csv')
print(reserve_tb.shape)
reserve_tb.head(n=5)

(4030, 9)


Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


In [3]:
# Number of distinct (non-missing) values in each column.
reserve_tb.nunique(axis=0, dropna=True)

reserve_id          4030
hotel_id             300
customer_id          888
reserve_datetime    4030
checkin_date         924
checkin_time           8
checkout_date        917
people_num             4
total_price          933
dtype: int64

### n 件前のデータ取得

`customer_id` ごとに `reserve_datetime` で並び替える。

In [4]:
# Order each customer's reservations by booking time with one vectorized sort
# instead of groupby().apply(sort_values): apply runs a Python callback per
# customer, and its handling of the grouping column is deprecated in recent
# pandas releases.
# The (customer_id, original row label) MultiIndex that apply() produced is
# rebuilt explicitly, and customer_id is also kept as a regular column, so the
# frame has the same layout as before.
# NOTE: rows with a missing customer_id would be kept (sorted last) here,
# whereas groupby() drops them -- assumes the column has no missing values.
result = (
    reserve_tb
    .sort_values(by=['customer_id', 'reserve_datetime'])
    .set_index('customer_id', append=True, drop=False)
    .swaplevel(0, 1)
)
result.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
c_1,0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
c_1,1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
c_1,2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
c_1,3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
c_1,4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


`shift` 関数で、行を下に 2 行ずらす。

In [5]:
# Within each customer (the 'customer_id' index level), move total_price down
# two rows so every row carries the price of the reservation two bookings
# earlier. The first two rows of each customer have no such value -> NaN.
result['before_price'] = (
    result['total_price']
    .groupby(level='customer_id')
    .shift(2)
)
result.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,before_price
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
c_1,0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,
c_1,1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,
c_1,2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,97200.0
c_1,3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,20600.0
c_1,4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,33600.0


### 過去 n 件の合計値

In [6]:
# Sort by customer, then by booking time, and renumber the rows 0..n-1.
# A two-key sort_values gives the same ordering as sorting inside
# groupby().apply() but without a Python callback per customer and without
# relying on apply's deprecated handling of the grouping column.
# NOTE: rows with a missing customer_id would be kept (sorted last) here,
# whereas groupby() drops them -- assumes the column has no missing values.
result = (
    reserve_tb
    .sort_values(by=['customer_id', 'reserve_datetime'], ascending=True)
    .reset_index(drop=True)
)
result.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


`center` を `False` にすると、自身のデータ行を含めて設定した件数(`window=3`)になるよう「上」のデータ行を加える。`min_periods=3` を指定しているため、3 件そろわない行の合計値は `NaN` になる。

In [7]:
# Per-customer trailing sum of total_price over a window of 3 rows that ends
# at the current row (center=False). min_periods=3 leaves NaN until a customer
# has three reservations.
# total_price is selected *before* rolling: summing the whole frame would also
# try to aggregate the string columns, which recent pandas versions reject
# instead of silently dropping them.
# The grouped rolling result is indexed by (customer_id, row label); dropping
# only the customer_id level keeps the row labels, so the assignment aligns by
# label rather than by row position.
result['price_sum'] = (
    result
    .groupby('customer_id')['total_price']
    .rolling(window=3, min_periods=3, center=False)
    .sum()
    .reset_index(level=0, drop=True)
)
result.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,price_sum
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,151400.0
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,248600.0
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,296100.0


### 過去 n 件の平均値

In [8]:
# Rebuild the working frame: rows ordered by customer and then by booking
# time, with a fresh 0..n-1 index.
# One multi-key sort replaces groupby().apply(sort_values), which invokes a
# Python function per customer and depends on apply's deprecated treatment of
# the grouping column.
# NOTE: rows with a missing customer_id would be kept (sorted last) here,
# whereas groupby() drops them -- assumes the column has no missing values.
result = (
    reserve_tb
    .sort_values(by=['customer_id', 'reserve_datetime'], ascending=True)
    .reset_index(drop=True)
)
result.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100


`center` を `False` にすると、自身のデータ行を含めて設定した件数(`window=3`)になるよう「上」のデータ行を加える。`min_periods=1` を指定しているため、3 件に満たない場合でも存在するデータ行だけで平均値を計算する。

In [9]:
# Per-customer trailing mean of total_price: a window of 3 rows ending at the
# current row (center=False). min_periods=1 lets a customer's first rows
# average over however many rows exist so far.
# reset_index(drop=True) discards the index produced by the grouped rolling
# and renumbers the values 0..n-1 before they are stored in the new column.
result['price_avg'] = (
    result
    .groupby('customer_id')['total_price']
    .rolling(window=3, min_periods=1, center=False)
    .mean()
    .reset_index(drop=True)
)
result.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,price_avg
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,97200.0
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,58900.0
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,50466.666667
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,82866.666667
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,98700.0


In [10]:
# Mean of the (up to) three reservations *before* each row, per customer:
# trailing 3-row mean of total_price, moved down one row so a reservation's
# own price is excluded. A customer's first reservation has no history -> NaN.
# The value is derived from total_price rather than by shifting the existing
# price_avg column onto itself: the self-overwriting form shifts one more row
# every time the cell is re-run, while this form always yields the same
# result.
result['price_avg'] = (
    result
    .groupby('customer_id')['total_price']
    .transform(
        lambda prices: prices.rolling(window=3, min_periods=1).mean().shift(periods=1)
    )
)
result.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,price_avg
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,97200.0
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,58900.0
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,50466.666667
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,82866.666667


### 過去 n 日間の合計値

In [11]:
# Parse the booking timestamp column into datetimes so the cells below can
# compare timestamps and add day offsets to them.
reserve_tb['reserve_datetime'] = pd.to_datetime(
    arg=reserve_tb['reserve_datetime'],
    format='%Y-%m-%d %H:%M:%S',
)

In [12]:
# Self-join on customer_id: pair every reservation with every reservation of
# the same customer (itself included). The right-hand timestamp is renamed to
# reserve_datetime_before so both timestamps can sit side by side.
sum_table = reserve_tb[['reserve_id', 'customer_id', 'reserve_datetime']].merge(
    reserve_tb[['customer_id', 'reserve_datetime', 'total_price']]
    .rename(columns={'reserve_datetime': 'reserve_datetime_before'}),
    how='inner',
    on='customer_id',
)

print(sum_table.shape)
sum_table.head()

(22994, 5)


Unnamed: 0,reserve_id,customer_id,reserve_datetime,reserve_datetime_before,total_price
0,r1,c_1,2016-03-06 13:09:42,2016-03-06 13:09:42,97200
1,r1,c_1,2016-03-06 13:09:42,2016-07-16 23:39:55,20600
2,r1,c_1,2016-03-06 13:09:42,2016-09-24 10:03:17,33600
3,r1,c_1,2016-03-06 13:09:42,2017-03-08 03:20:10,194400
4,r1,c_1,2016-03-06 13:09:42,2017-09-05 19:50:37,68100


`reserve_datetime` から遡って 90 日以内、かつ `reserve_datetime` よりも日時が前である(自身の予約は含めない)という、2 つの条件で絞っている。

In [13]:
# Keep only pairs whose paired reservation is strictly earlier than the
# current one and no more than 90 days before it, then renumber the rows.
sum_table = sum_table.loc[
    (sum_table['reserve_datetime_before'] < sum_table['reserve_datetime'])
    & (sum_table['reserve_datetime_before'] >= sum_table['reserve_datetime'] + offsets.Day(-90))
].reset_index(drop=True)

print(sum_table.shape)
sum_table.head()

(1337, 5)


Unnamed: 0,reserve_id,customer_id,reserve_datetime,reserve_datetime_before,total_price
0,r3,c_1,2016-09-24 10:03:17,2016-07-16 23:39:55,20600
1,r6,c_1,2017-11-27 18:47:05,2017-09-05 19:50:37,68100
2,r7,c_1,2017-12-29 10:38:36,2017-11-27 18:47:05,36000
3,r15,c_2,2018-04-19 11:25:00,2018-02-18 05:12:58,75600
4,r16,c_2,2018-07-06 04:18:28,2018-04-19 11:25:00,68800


In [14]:
# Collapse the remaining pairs to one row per reservation: the summed
# total_price of the earlier reservations that passed the filter, stored
# under the name total_price_sum.
sum_table = (
    sum_table
    .groupby('reserve_id', as_index=False)['total_price']
    .sum()
    .rename(columns={'total_price': 'total_price_sum'})
)

print(sum_table.shape)
sum_table.head()

(1220, 2)


Unnamed: 0,reserve_id,total_price_sum
0,r1000,44700
1,r1004,333600
2,r1007,128000
3,r1008,190000
4,r1009,133800


In [15]:
# Attach the per-reservation sums to the full reservation table. The left join
# keeps reservations that have no row in sum_table; their total_price_sum
# comes back missing and is set to 0.
# Only the joined column is zero-filled: a blanket fillna(0) would also
# overwrite genuine missing values in any other column with 0.
(
    pd.merge(reserve_tb, sum_table, on='reserve_id', how='left')
    .fillna({'total_price_sum': 0})
    .head()
)

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,total_price_sum
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4,97200,0.0
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2,20600,0.0
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2,33600,20600.0
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4,194400,0.0
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3,68100,0.0
