In [1]:
import datetime as dt
import pandas as pd
import numpy as np
import keras
import time

Using TensorFlow backend.


In [2]:
    # Load the raw ratings table. The code below relies on the columns
    # 'userId', 'movieId', 'rating' and 'timestamp'.
    ratings_df = pd.read_csv('ratings.csv')

    def _to_timestamp(date_str):
        """Convert a "dd/mm/YYYY" string to a Unix timestamp at midnight.

        NOTE(review): time.mktime interprets the date in the machine's local
        time zone, so the split boundaries shift with the host's time zone.
        """
        return time.mktime(dt.datetime.strptime(date_str, "%d/%m/%Y").timetuple())

    def _select_split(ratings, start_ts, end_ts):
        """Return the rating-less rows with start_ts <= timestamp <= end_ts.

        Only users that have between 6 and 100 ratings (inclusive) inside the
        window are kept.
        """
        events = ratings.drop(['rating'], axis=1)
        in_window = (events['timestamp'] >= start_ts) & (events['timestamp'] <= end_ts)
        return events.loc[in_window].groupby("userId").filter(lambda x: len(x) > 5 and len(x) < 101)

    # ALL TRAIN SET
    # Ratings between 9 January 1995 and 1 March 2013
    all_train_start = "09/01/1995"
    all_train_end = "01/03/2013"
    all_train_start_ts = _to_timestamp(all_train_start)
    all_train_end_ts = _to_timestamp(all_train_end)
    all_train_data = _select_split(ratings_df, all_train_start_ts, all_train_end_ts)
    # Printed before the columns are renamed, so the original headers are shown.
    print("all_train_data \n", all_train_data)

    # RECENT TRAIN SET
    # Ratings between 1 January 2008 and 1 March 2013
    train_start = "01/01/2008"
    train_end = "01/03/2013"
    train_start_ts = _to_timestamp(train_start)
    train_end_ts = _to_timestamp(train_end)
    train_data = _select_split(ratings_df, train_start_ts, train_end_ts)

    # DEV SET
    # Ratings between 1 April 2013 and 1 April 2014
    dev_start = "01/04/2013"
    dev_end = "01/04/2014"
    dev_start_ts = _to_timestamp(dev_start)
    dev_end_ts = _to_timestamp(dev_end)
    dev_data = _select_split(ratings_df, dev_start_ts, dev_end_ts)

    # TEST SET
    # Ratings between 2 April 2014 and 1 April 2015
    # NOTE(review): the dev window ends at 00:00 on 01/04/2014 and the test
    # window starts at 00:00 on 02/04/2014, so ratings made during 1 April 2014
    # land in neither split -- confirm this gap is intended.
    test_start = "02/04/2014"
    test_end = "01/04/2015"
    test_start_ts = _to_timestamp(test_start)
    test_end_ts = _to_timestamp(test_end)
    test_data = _select_split(ratings_df, test_start_ts, test_end_ts)

    # Rename the three remaining columns to the names used by the later cells.
    for split in (all_train_data, train_data, dev_data, test_data):
        split.columns = ['UserId', 'MovieId', 'Time']

    # Persist every split as a tab-separated file without the row index.
    all_train_data.to_csv('processed_all_train.csv', sep='\t', index=False)
    train_data.to_csv('processed_train.csv', sep='\t', index=False)
    dev_data.to_csv('processed_dev.csv', sep='\t', index=False)
    test_data.to_csv('processed_test.csv', sep='\t', index=False)

all_train_data 
           userId  movieId   timestamp
175            2        3   974820889
176            2       62   974820598
177            2       70   974820691
178            2      110   974820658
179            2      242   974820776
180            2      260   974821014
181            2      266   974820748
182            2      469   974820598
183            2      480   974820720
184            2      541   974821014
185            2      589   974820658
186            2      891   974820969
187            2      908   974820691
188            2      924   974821014
189            2     1121   974820598
190            2     1196   974821014
191            2     1210   974820598
192            2     1214   974821014
193            2     1249   974820691
194            2     1259   974820659
195            2     1270   974821014
196            2     1327   974820846
197            2     1356   974820598
198            2     1544   974820943
199            2     1580   97482

In [3]:
from matplotlib import pyplot as plt

In [4]:
def preprocess_df(df, reference_df=None):
    """Attach a dense zero-based item index to ``df`` and sort it by user and time.

    The item vocabulary is taken from ``reference_df['MovieId']`` in order of
    first appearance; each distinct movie id is mapped to ``0 .. n_items - 1``.
    ``df`` is inner-joined against that mapping, so rows whose ``MovieId`` does
    not occur in ``reference_df`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with at least the columns 'UserId', 'MovieId' and 'Time'.
    reference_df : pandas.DataFrame, optional
        Frame whose 'MovieId' column defines the item vocabulary. When omitted,
        the notebook-level ``train_data`` frame is used, which is what the
        function always did before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The joined rows with an extra 'ItemIdx' column, sorted by
        ('UserId', 'Time'). The index is not reset.
    """
    if reference_df is None:
        # Backward-compatible default: rely on the global defined by the notebook.
        reference_df = train_data

    # Number of distinct movies in the reference frame.
    n_items = len(reference_df['MovieId'].unique())
    print("計算不重複MovieId數量\n",n_items)

    # Distinct movie ids, in order of first appearance.
    aux = list(reference_df['MovieId'].unique())
    print("列出不重複MovieId\n",aux)

    itemids = np.array(aux)

    # movie id -> position in [0, n_items)
    itemidmap = pd.Series(data=np.arange(n_items), index=itemids)
    print("MovieId建立索引\n",itemidmap)

    item_index = pd.DataFrame({'MovieId':itemids, 'ItemIdx':itemidmap[itemids].values})
    print("MovieId, ItemIdx DataFrame \n",item_index)

    # The inner join drops interactions with movies outside the vocabulary.
    data = pd.merge(df, item_index, on='MovieId', how='inner')
    print("data\n",data)

    data = data.sort_values(['UserId', 'Time'])
    print("data sort_values['UserId', 'Time']\n",data)

    return data

In [7]:
def compute_dwell_time(df):
    """Compute, for every row, the time until the same user's next interaction.

    ``df`` must already be ordered so that each user's rows are contiguous,
    users appear in ascending 'UserId' order, and rows are ordered by 'Time'
    within a user (the user boundaries are derived from sorted group sizes).
    The last row of every user gets a dwell time of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'UserId', 'MovieId' and 'Time'.

    Returns
    -------
    pandas.Series
        Dwell times, positionally matching the rows of ``df`` and carrying
        the same index as ``df``.
    """
    # Time of the following row (the last row wraps around to the first one).
    times_t = np.roll(df['Time'], -1)
    print("times_t shift時間\n",times_t)

    # Time of the current row.
    times_dt  = df['Time']
    print("times_dt 原本t時間\n",times_dt)

    # Positional difference: next row's time minus this row's time.
    diffs = np.subtract(times_t, times_dt)
    print("diffs 時間差\n",diffs)

    length = len(df['MovieId'])
    print("length MovieId數量\n",length)

    # Cumulative start offset of every user's run of rows.
    offset_sessions = np.zeros(df['UserId'].nunique()+1, dtype=np.int32)
    print("offset_sessions np.zeros\n",offset_sessions)

    offset_sessions[1:] = df.groupby('UserId').size().cumsum()
    print("計算出哪個index換userid\n",offset_sessions)

    # Turn "start of next user" into "last row of this user".
    offset_sessions = offset_sessions - 1
    print("變成index array 需要-1 \n", offset_sessions)

    # Move the leading -1 to the end; as an index it addresses the final row.
    offset_sessions = np.roll(offset_sessions, -1)
    print("np.roll(offset_sessions, -1)  \n",offset_sessions)

    print("np.zeros\n",np.zeros((offset_sessions.shape)))

    # A user transition implies zero dwell time.
    # note: paper statistics do not consider null entries,
    # though they are still checked when augmenting.
    # np.put needs a real ndarray (pandas >= 1.0 no longer provides Series.put),
    # so zero the positions on a copy and wrap the result back into a Series.
    dwell = np.array(diffs)
    if dwell.size:
        np.put(dwell, offset_sessions, np.zeros((offset_sessions.shape)), mode='raise')
    diffs = pd.Series(dwell, index=times_dt.index, name=getattr(diffs, 'name', None))
    print("diffs  換userid就補上0\n",diffs)

    return diffs

In [8]:
# get paper statistics
def get_statistics(dts):
    """Plot and summarise the non-zero dwell times.

    Values whose integer part is 0 are left out. A horizontal box plot without
    outliers is shown, and the descriptive statistics of the remaining values
    are returned (previously they were computed and then discarded).

    Parameters
    ----------
    dts : iterable of numbers
        Dwell times.

    Returns
    -------
    pandas.DataFrame
        Result of ``DataFrame.describe()`` on the filtered values.
    """
    filtered = np.array([x for x in dts if int(x) != 0])
    print("過濾時間相減不為0的數列 \n", filtered)
    pd_dts = pd.DataFrame(filtered)
    pd_dts.boxplot(vert=False, showfliers=False) # no outliers in boxplot
    plt.show()
    return pd_dts.describe()

In [9]:
def join_dwell_reps(df, dt, threshold=2000):
    """Add a 'DwellReps' column to ``df`` holding ``dt // threshold + 1``.

    ``df`` is modified in place; nothing is returned. ``dt`` is left untouched
    (it used to be overwritten with the repetition counts, which corrupted the
    caller's dwell times for any later use).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new column; values are aligned on the index.
    dt : pandas.Series
        Dwell time per row, indexed like ``df``.
    threshold : number, optional
        Dwell-time span that is worth one additional repetition.
    """
    # Build a new Series instead of using the in-place //= and += operators.
    reps = dt // threshold + 1
    df['DwellReps'] = pd.Series(reps.astype(np.int64), index=dt.index)


In [10]:
def augment(df):
    """Repeat every row of ``df`` as many times as its 'DwellReps' value.

    Only the first three columns of ``df`` are kept in the result; they are
    cast as 'UserId' -> int64, 'MovieId' -> int64 and 'Time' -> float32, so the
    frame must contain columns with those names among its first three.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'DwellReps' column of non-negative integer repeat counts.

    Returns
    -------
    pandas.DataFrame
        The repeated rows with a fresh 0..n-1 index.
    """
    kept_columns = list(df.columns.values)[:3]
    print(kept_columns)

    # Row i of the input appears DwellReps[i] times in a row in the output.
    repeated_rows = np.repeat(df.values, df['DwellReps'], axis=0)
    print(repeated_rows[0][:3])

    # NOTE(review): float32 has a 24-bit significand, so second-resolution
    # epoch timestamps around 1e9 get rounded -- confirm this is intended.
    target_types = {'UserId': np.int64,
                    'MovieId': np.int64,
                    'Time': np.float32}

    result = pd.DataFrame(data=repeated_rows[:, :3], columns=kept_columns)
    return result.astype(target_types)

In [12]:
###
# Naming used by the helper functions:
#   session => user
#   item    => movie
###

# Load the tab-separated training split from processed_train.csv.
PATH_TO_TRAIN = 'processed_train.csv'
train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'MovieId':np.int64})
print("train_data \n", train_data)

# Add the item index and sort by user and time.
new_df = preprocess_df(train_data)
print("new_df \n", new_df)

# Time between each interaction and the same user's next one.
dts = compute_dwell_time(new_df)
print("dts \n", dts)

#get_statistics(dts)

# Adds the 'DwellReps' column (dwell time // threshold + 1) to new_df.
join_dwell_reps(new_df, dts, threshold=200000)

print("join_dwell_reps new_df \n", new_df)

# Now, we augment the sessions copying each entry an additional (dwellReps[i]-1) times
df_aug = augment(new_df)
print("df_augf \n", df_aug)

df_aug.to_csv("processed_augmented_train.csv", index=False, sep='\t')

train_data 
         UserId  MovieId        Time
0           18      186  1267347706
1           18      858  1236356241
2           18      912  1283426281
3           18     1221  1236356224
4           18     1230  1236293194
5           18     1270  1236356476
6           18     1371  1236293443
7           18     1417  1264288804
8           18     1580  1236356556
9           18     1641  1236292683
10          18     1784  1262461363
11          18     2023  1236356245
12          18     2289  1234304051
13          18     2375  1263940759
14          18     2402  1236292957
15          18     2403  1236292945
16          18     2404  1236292961
17          18     2571  1236356282
18          18     2762  1236356454
19          18     2997  1236293533
20          18     3426  1234208115
21          18     3812  1236293215
22          18     3967  1236293267
23          18     4306  1283426195
24          18     4313  1214135891
25          18     4428  1262461295
26          18 

data sort_values['UserId', 'Time']
         UserId  MovieId        Time  ItemIdx
36508       18    56788  1204200841       42
34322       18    55820  1206550220       41
37193       18    59382  1210866305       47
30389       18     6197  1211218352       29
25735       18     4313  1214135891       24
37210       18    60950  1225143599       49
22978       18     3426  1234208115       20
37513       18    61075  1234303868       50
13198       18     2289  1234304051       12
37124       18    59118  1234304461       46
32036       18     7456  1236292440       32
34308       18    55100  1236292610       40
12077       18     1641  1236292683        9
13670       18     2403  1236292945       15
13372       18     2402  1236292957       14
13760       18     2404  1236292961       16
25744       18     4447  1236292988       26
31393       18     6942  1236293050       31
36930       18    58315  1236293070       44
36915       18    57792  1236293178       43
7106        18     

df_augf 
          UserId  MovieId          Time
0            18    56788  1.204201e+09
1            18    56788  1.204201e+09
2            18    56788  1.204201e+09
3            18    56788  1.204201e+09
4            18    56788  1.204201e+09
5            18    56788  1.204201e+09
6            18    56788  1.204201e+09
7            18    56788  1.204201e+09
8            18    56788  1.204201e+09
9            18    56788  1.204201e+09
10           18    56788  1.204201e+09
11           18    56788  1.204201e+09
12           18    55820  1.206550e+09
13           18    55820  1.206550e+09
14           18    55820  1.206550e+09
15           18    55820  1.206550e+09
16           18    55820  1.206550e+09
17           18    55820  1.206550e+09
18           18    55820  1.206550e+09
19           18    55820  1.206550e+09
20           18    55820  1.206550e+09
21           18    55820  1.206550e+09
22           18    55820  1.206550e+09
23           18    55820  1.206550e+09
24           18