In [1]:
import os
import re
import zipfile
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
DATASET = 'ml-1m'  
RAW_PATH = os.path.join('./', DATASET)

RANDOM_SEED = 0
NEG_ITEMS = 99
COMPLEMENT_AMOUNT = 151
SUBSTITUTE_AMOUNT = 57

# Load Data

1. Load interaction data and item metadata
2. Filter out items with less than 5 interactions
3. Calculate basic statistics

In [3]:
# Download data if not exists

if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)
if not os.path.exists(os.path.join(RAW_PATH, DATASET + '.zip')):
    print('Downloading data into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://files.grouplens.org/datasets/movielens/{}.zip'
        .format(RAW_PATH, DATASET), shell=True)

Downloading data into ./ml-1m


In [4]:
!unzip './ml-1m/ml-1m.zip'
os.rename('./ml-1m', './ml-1m_Chorus')

Archive:  ./ml-1m/ml-1m.zip
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [5]:
meta_df = pd.read_table('./ml-1m_Chorus/movies.dat', sep='::', header=None, encoding='ISO-8859-1')
data_df = pd.read_table('./ml-1m_Chorus/ratings.dat', sep='::', header=None, encoding='ISO-8859-1')

  return func(*args, **kwargs)


In [6]:
data_df.columns = ['user_id', 'item_id', 'label', 'time']
data_df.head()

Unnamed: 0,user_id,item_id,label,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
# Only retain users and items with at least 5 associated interactions

print('Filter before:', len(data_df))
filter_before = -1
while filter_before != len(data_df):
    filter_before = len(data_df)
    for stage in ['user_id', 'item_id']:
        val_cnt = data_df[stage].value_counts()
        cnt_df = pd.DataFrame({stage: val_cnt.index, 'cnt': val_cnt.values})
        data_df = pd.merge(data_df, cnt_df, on=stage, how='left')
        data_df = data_df[data_df['cnt'] >= 5].drop(columns=['cnt'])
print('Filter after:', len(data_df))

useful_meta_df = meta_df[meta_df[0].isin(data_df['item_id'])]  # remove unuseful metadata

Filter before: 1000209
Filter after: 999611


### Statistics

In [8]:
# Gather statistics

n_users = data_df['user_id'].value_counts().size
n_items = data_df['item_id'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['time'].min()
max_time = data_df['time'].max()

In [9]:
# Print statistics

time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 6040
# Items: 3416
# Interactions: 999611
Time Span: 2000-04-25/2003-02-28


# Build Dataset

### Interaction data

In [10]:
np.random.seed(RANDOM_SEED)

In [11]:
# Leave only user_id, item_id and time columns

out_df = data_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df.sort_values(by=['time', 'user_id'], kind='mergesort', inplace=True)
out_df = out_df.reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,858,956703932
1,6040,593,956703954
2,6040,2384,956703954
3,6040,1961,956703977
4,6040,2019,956703977


In [12]:
# Reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,721,956703932
1,6040,550,956703954
2,6040,2021,956703954
3,6040,1631,956703977
4,6040,1689,956703977


In [13]:
# Leave one out spliting

clicked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    clicked_item_set[user_id] = set(seq_df['item_id'].values.tolist())
    
def generate_dev_test(data_df):
    result_dfs = []
    n_items = data_df['item_id'].value_counts().size
    for idx in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            for j in range(len(neg_items[i])):
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [14]:
# Train, validation, test split

leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(leave_df.index)

[test_df, dev_df], data_df = generate_dev_test(data_df)
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(987531, 6040, 6040)

In [15]:
train_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,721,956703932
1,6040,550,956703954
2,6040,2021,956703954
3,6040,1631,956703977
4,6040,1689,956703977


In [16]:
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
756,6034,2577,956712388,"[2733, 2608, 1654, 3265, 836, 764, 1732, 1034,..."
967,6035,2722,956713640,"[1168, 1685, 3300, 3003, 2840, 2768, 2958, 374..."
1027,6033,1412,956714112,"[2938, 1646, 844, 2745, 1553, 3225, 926, 2198,..."
1278,6038,985,956717204,"[2085, 1793, 1484, 2339, 198, 895, 1974, 1279,..."
1519,6031,341,956718589,"[3349, 3397, 1559, 1252, 2939, 596, 648, 574, ..."


In [22]:
# Save results

train_df.to_csv(os.path.join(RAW_PATH + '_Chorus', 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH + '_Chorus', 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH + '_Chorus', 'test.csv'), sep='\t', index=False)

### Item Metadata

In [23]:
# Find most popular items

from collections import Counter
cnt = Counter(out_df.item_id)
most_popular_tuples = cnt.most_common(len(cnt))
most_popular_items = [i[0] for i in most_popular_tuples]
most_popular_items_dict = dict(zip(most_popular_items, range(1, len(most_popular_items) + 1)))

In [24]:
# Level-2 category

l2_genre_lst = list()
for genre in useful_meta_df[2]:
    l2_genre_lst.append(genre if len(genre) > 2 else np.nan)
useful_meta_df['l2_category'] = l2_genre_lst  
l2_genres = sorted(useful_meta_df['l2_category'].dropna().unique())
l2_dict = dict(zip(l2_genres, range(1, len(l2_genres) + 1)))
useful_meta_df['l2_category'] = useful_meta_df['l2_category'].apply(lambda x: l2_dict[x] if x == x else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [25]:
# Find substitus as the most popular movies of the same set of genres

item_df = useful_meta_df.copy().reset_index(drop = True)
f = np.vectorize(lambda x: item2id[x])
item_df[0] = f(item_df[0])
item_df['r_substitute'] = [[]]*item_df.shape[0]
f_popular_items = np.vectorize(lambda x: most_popular_items_dict[x])
for i in range(len(item_df)):
  sub = [*item_df[item_df['l2_category'] == item_df['l2_category'][i]][0]]
  sub.remove(item_df[0][i])
  if len(sub) > SUBSTITUTE_AMOUNT:
    sub_dict = dict(zip(sub, f_popular_items(sub)))
    top_k_list = sorted(sub_dict.items(), key=lambda item: item[1])[:SUBSTITUTE_AMOUNT]
    sub = [x[0] for x in top_k_list]
  item_df['r_substitute'][i] = sub

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [26]:
# Find complements as the most popular movies in the set of movies, 
# that users have watched right after the ground-truth item

user_items = out_df.groupby(['user_id']).item_id.agg(list).reset_index()
item_df['r_complement'] = [[]]*item_df.shape[0]
for item in item_df[0]:
  comp = []
  for i in range(len(user_items)):
    if item in user_items.item_id[i]:
      next_item_ind = user_items.item_id[i].index(item) + 1
      if next_item_ind < len(user_items.item_id[i]):
        comp.append(user_items.item_id[i][next_item_ind])
  comp = set(comp)
  if item in comp:
    comp.difference({item})
  comp = comp.difference(set(item_df['r_substitute'][item-1])) #excluding substitute items
  #excluding complement items from the set of substitute items
  item_df['r_substitute'][item-1] = list(set(item_df['r_substitute'][item-1]).difference(comp)) 
  comp = list(comp)
  if len(comp) > COMPLEMENT_AMOUNT:
    comp_dict = dict(zip(comp, f_popular_items(comp)))
    top_k_list = sorted(comp_dict.items(), key=lambda item: item[1])[:COMPLEMENT_AMOUNT]
    comp = [x[0] for x in top_k_list]
  item_df['r_complement'][item-1] = comp

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [27]:
item_meta_data = dict()
for idx in range(len(item_df)):
  item_meta_data[idx] = {
        'item_id': item_df[0][idx],
        'i_category': item_df['l2_category'][idx],
        'r_complement': item_df['r_complement'][idx],
        'r_substitute': item_df['r_substitute'][idx],
    }

In [28]:
item_meta_df = pd.DataFrame.from_dict(item_meta_data, orient='index')
item_meta_df = item_meta_df[['item_id', 'i_category', 'r_complement', 'r_substitute']]
item_meta_df.head()

Unnamed: 0,item_id,i_category,r_complement,r_substitute
0,1,146,"[2459, 242, 997, 1011, 1698, 547, 2197, 1069, ...","[3232, 899, 3235, 2695, 1992, 1993, 1808, 1809..."
1,2,116,"[242, 447, 547, 999, 562, 998, 494, 1064, 927,...","[1828, 1829, 1637, 1675, 1711, 849, 2036, 54, ..."
2,3,208,"[2459, 242, 447, 547, 1324, 562, 494, 1360, 72...","[258, 1027, 1030, 7, 2186, 2061, 1298, 2198, 4..."
3,4,186,"[1540, 5, 9, 3082, 2061, 1550, 1551, 1554, 206...","[896, 1925, 1932, 526, 3088, 1042, 2707, 2068,..."
4,5,177,"[1011, 550, 562, 998, 1360, 1022, 589, 2748, 1...","[1280, 2305, 128, 1792, 2315, 1035, 909, 910, ..."


In [30]:
# Save results

item_meta_df.to_csv(os.path.join(RAW_PATH + '_Chorus', 'item_meta.csv'), sep='\t', index=False)

In [31]:
set(out_df.item_id) == set(item_meta_df.item_id)

True

In [32]:
f = np.vectorize(lambda x, y: len(set(x).intersection(set(y))))
f(item_meta_df.r_complement, item_meta_df.r_substitute).max()	

0