In [1]:
# 引用需要的函式
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import datetime as dt
import pandas as pd
import numpy as np
import keras
import time


Using TensorFlow backend.


In [2]:
# Attach a dense item index to every interaction row
def preprocess_df(df, vocab_df=None):
    """Add an ``ItemIdx`` column mapping each ``ItemId`` to ``0..n_items-1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with at least the columns ``SessionId``, ``ItemId``,
        ``Time`` and ``Rating``.
    vocab_df : pandas.DataFrame, optional
        Frame whose ``ItemId`` column defines the item vocabulary; indices are
        assigned in order of first appearance.  When omitted, the notebook-level
        global ``train_data`` is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        ``df`` inner-joined with the item index (rows whose ``ItemId`` is not in
        the vocabulary are dropped), sorted by ``SessionId``, ``Time``,
        ``Rating``.  ``df`` itself is not modified.
    """
    if vocab_df is None:
        # Legacy behaviour: fall back to the global defined by a later cell.
        vocab_df = train_data

    item_key = 'ItemId'
    session_key = 'SessionId'
    time_key = 'Time'
    rating_key = 'Rating'

    item_ids = vocab_df[item_key].unique()
    # (item id => position in 0..n_items-1)
    item_index = pd.DataFrame({item_key: item_ids,
                               'ItemIdx': np.arange(len(item_ids))})

    data = pd.merge(df, item_index, on=item_key, how='inner')
    return data.sort_values([session_key, time_key, rating_key])

# Repeat each interaction row according to its DwellReps count
def augment(df):
    """Build the augmented interaction table by repeating rows.

    Each row of ``df`` is repeated ``df['DwellReps']`` times, and only the
    first three columns of ``df`` are kept.  The result's ``SessionId``,
    ``ItemId`` and ``Rating`` columns are cast to ``int64``, so those three
    names must be among the first three columns of ``df``; fractional values
    (e.g. a rating of 3.5) are truncated by the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``DwellReps`` column of non-negative integers.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh ``0..n-1`` index.  An empty input yields an
        empty frame with the same three columns.
    """
    col_names = list(df.columns.values)[:3]
    print(col_names)

    augmented = np.repeat(df.values, df['DwellReps'].values, axis=0)
    # Preview the first repeated row; skipped when there are no rows, where
    # indexing row 0 would raise IndexError.
    if len(augmented):
        print(augmented[0][:3])

    augmented = pd.DataFrame(data=augmented[:, :3], columns=col_names)

    dtype = {'SessionId': np.int64,
             'ItemId': np.int64,
             'Rating': np.int64}
    for column, column_type in dtype.items():
        augmented[column] = augmented[column].astype(column_type)

    return augmented

# Compute the dwell time of every interaction
def compute_dwell_time(df):
    """Return the time elapsed between each row and the next row of its session.

    ``df`` is expected to be ordered so that the rows of a session are
    contiguous and sorted by ``Time`` (as produced by ``sort_values`` on
    ``SessionId`` then ``Time``).  The last row of every session has no
    successor and gets a dwell time of zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SessionId`` and ``Time``.

    Returns
    -------
    pandas.Series
        Dwell times aligned on ``df.index``, with the same dtype as the
        difference of two ``Time`` values.  ``df`` is not modified.
    """
    times = df['Time'].values
    sessions = df['SessionId'].values

    # Time of the following row minus time of the current row (new array, so
    # the frame's own data is never written to).
    diffs = np.roll(times, -1) - times

    if len(diffs):
        # A row closes its session when the next row belongs to another
        # session; the very last row always closes one.  Session transitions
        # imply zero dwell time.
        closes_session = np.roll(sessions, -1) != sessions
        closes_session[-1] = True
        diffs[closes_session] = 0

    # Work is done on plain ndarrays and wrapped at the end: np.put on a
    # Series depends on Series.put, which recent pandas no longer provides.
    return pd.Series(diffs, index=df.index, name='Time')

# Add the DwellReps column
def join_dwell_reps(df, dt, threshold=2000):
    """Store ``dt // threshold + 1`` in ``df['DwellReps']`` as ``int64``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new ``DwellReps`` column (modified in place).
    dt : pandas.Series
        Dwell times; values are aligned to ``df`` through the Series index.
        The Series is left untouched, so the same dwell times can be reused
        with a different ``threshold``.
    threshold : int, optional
        Divisor applied to the dwell times before adding one.

    Returns
    -------
    None
    """
    # Build a new Series instead of using //= and += on the argument, which
    # would overwrite the caller's dwell times.
    reps = dt // threshold + 1
    df['DwellReps'] = reps.astype(np.int64)

In [3]:
# Split the ratings by timestamp into training, development and test sets.
# Within each split only users with more than 5 and fewer than 101 ratings
# are kept.
# SessionId => user id
# ItemId    => movie id
def _date_to_timestamp(date_str):
    """Convert a ``dd/mm/YYYY`` string to a timestamp via ``time.mktime``."""
    return time.mktime(dt.datetime.strptime(date_str, "%d/%m/%Y").timetuple())


def _select_split(ratings_df, start, end, min_ratings=6, max_ratings=100):
    """Return the ratings inside ``[start, end]`` for users with an allowed count."""
    start_ts = _date_to_timestamp(start)
    end_ts = _date_to_timestamp(end)

    # Both bounds are inclusive.
    in_range = ratings_df.loc[(ratings_df['timestamp'] >= start_ts)
                              & (ratings_df['timestamp'] <= end_ts)]

    # Number of in-range ratings of each row's user, broadcast back to rows.
    ratings_per_user = in_range.groupby('userId')['userId'].transform('size')
    keep = (ratings_per_user >= min_ratings) & (ratings_per_user <= max_ratings)

    split = in_range.loc[keep].copy()
    # Positional rename: the input must have exactly four columns in the order
    # user id, movie id, rating, timestamp.
    split.columns = ['SessionId', 'ItemId', 'Rating', 'Time']
    return split


def preprocess(ratings_path='ratings.csv', out_dir='data'):
    """Read the ratings CSV and write the four time-based splits as TSV files.

    Splits (``dd/mm/YYYY``, both ends inclusive):

    * ``all_train.csv`` -- 09/01/1995 to 01/03/2013
    * ``train.csv``     -- 01/01/2008 to 01/03/2013
    * ``dev.csv``       -- 01/04/2013 to 01/04/2014
    * ``test.csv``      -- 02/04/2014 to 01/04/2015

    In every split only users having between 6 and 100 ratings inside that
    split's date range are kept, and the columns are renamed to
    ``SessionId``, ``ItemId``, ``Rating``, ``Time``.

    NOTE: the boundaries are converted with ``time.mktime``, which interprets
    the dates in the machine's local time zone.

    Parameters
    ----------
    ratings_path : str, optional
        CSV file with the columns ``userId``, movie id, rating and
        ``timestamp``, in that order.
    out_dir : str, optional
        Directory receiving the output files; created if it does not exist.

    Returns
    -------
    None
    """
    import os  # local import: the notebook's import cell does not provide it

    ratings_df = pd.read_csv(ratings_path)

    split_ranges = [
        ('all_train.csv', "09/01/1995", "01/03/2013"),
        ('train.csv', "01/01/2008", "01/03/2013"),
        ('dev.csv', "01/04/2013", "01/04/2014"),
        ('test.csv', "02/04/2014", "01/04/2015"),
    ]

    # Compute every split before writing anything, so a failure while
    # filtering does not leave a partial set of files behind.
    splits = [(file_name, _select_split(ratings_df, start, end))
              for file_name, start, end in split_ranges]

    os.makedirs(out_dir, exist_ok=True)
    for file_name, split in splits:
        split.to_csv(os.path.join(out_dir, file_name), sep='\t', index=False)

In [4]:
# Split the ratings file into the train/dev/test files written by preprocess()
preprocess()

In [5]:
# Load the training split (tab-separated file written by preprocess())
PATH_TO_TRAIN = 'data/train.csv'
train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId':np.int64})
print("\n訓練資料\n",train_data)

# Add the item index column; preprocess_df reads the global train_data
# defined just above, so it must be called after this point
new_df = preprocess_df(train_data)
print("\n影片索引加入後\n",new_df)

# Compute the dwell time of every row
dts = compute_dwell_time(new_df)

# Adds the 'DwellReps' column to new_df in place
join_dwell_reps(new_df, dts, threshold=200000)

# Repeat rows according to DwellReps so dwell time is reflected in the data
df_aug = augment(new_df)
print("\n把時間與評價因素考量進去\n",df_aug)


# Save the augmented data
# NOTE(review): augment() keeps the first three columns of new_df, which for
# the file written by preprocess() are SessionId, ItemId, Rating -- the third
# column is relabelled 'Time' here. Confirm this is intended.
df_aug.columns = ['SessionId', 'ItemId','Time']
df_aug.to_csv("data/augmented_train_dwelltime.csv", index=False, sep='\t')


訓練資料
         SessionId  ItemId  Rating        Time
0              18     186     4.0  1267347706
1              18     858     4.0  1236356241
2              18     912     5.0  1283426281
3              18    1221     3.5  1236356224
4              18    1230     4.5  1236293194
5              18    1270     2.0  1236356476
6              18    1371     2.0  1236293443
7              18    1417     3.0  1264288804
8              18    1580     2.5  1236356556
9              18    1641     4.0  1236292683
10             18    1784     4.0  1262461363
11             18    2023     2.5  1236356245
12             18    2289     4.0  1234304051
13             18    2375     2.0  1263940759
14             18    2402     1.0  1236292957
15             18    2403     1.0  1236292945
16             18    2404     1.0  1236292961
17             18    2571     3.0  1236356282
18             18    2762     3.5  1236356454
19             18    2997     4.0  1236293533
20             18    3426  