In [1]:
import os
import re
import zipfile
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
DATASET = 'ml-1m'  
RAW_PATH = os.path.join('./', DATASET)

RANDOM_SEED = 0
NEG_ITEMS = 99

# Load Data

1. Load interaction data and item metadata
2. Filter out items with less than 5 interactions
3. Calculate basic statistics

In [3]:
# Download data if not exists

if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)
if not os.path.exists(os.path.join(RAW_PATH, DATASET + '.zip')):
    print('Downloading data into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://files.grouplens.org/datasets/movielens/{}.zip'
        .format(RAW_PATH, DATASET), shell=True)

Downloading data into ./ml-1m


In [4]:
!unzip './ml-1m/ml-1m.zip'

Archive:  ./ml-1m/ml-1m.zip
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [5]:
meta_df = pd.read_table('./ml-1m/movies.dat', sep='::', header=None, encoding='ISO-8859-1')
data_df = pd.read_table('./ml-1m/ratings.dat', sep='::', header=None, encoding='ISO-8859-1')

  return func(*args, **kwargs)


In [6]:
data_df.columns = ['user_id', 'item_id', 'label', 'time']
data_df.head()

Unnamed: 0,user_id,item_id,label,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [7]:
genres = [
    'i_Action', 'i_Adventure', 'i_Animation', "i_Children's", 'i_Comedy', 'i_Crime', 
    'i_Documentary', 'i_Drama', 'i_Fantasy', 'i_Film-Noir', 'i_Horror', 'i_Musical', 
    'i_Mystery', 'i_Romance', 'i_Sci-Fi', 'i_Thriller', 'i_War', 'i_Western', 'i_Other'
]

item_df = meta_df.copy()
item_df.columns = ['item_id', 'title', 'genre']
genre_dict = dict()
for g in genres:
    genre_dict[g] = np.zeros(len(item_df), dtype=np.int32)
item_genre = item_df['genre'].apply(lambda x: x.split('|')).values
for idx, genre_lst in enumerate(item_genre):
    for g in genre_lst:
        genre_dict['i_' + g][idx] = 1
for g in genres:
    item_df[g] = genre_dict[g]
item_df = item_df.drop(columns=['genre'])
item_df.head()

Unnamed: 0,item_id,title,i_Action,i_Adventure,i_Animation,i_Children's,i_Comedy,i_Crime,i_Documentary,i_Drama,...,i_Film-Noir,i_Horror,i_Musical,i_Mystery,i_Romance,i_Sci-Fi,i_Thriller,i_War,i_Western,i_Other
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Only retain users and items with at least 5 associated interactions

print('Filter before:', len(data_df))
filter_before = -1
while filter_before != len(data_df):
    filter_before = len(data_df)
    for stage in ['user_id', 'item_id']:
        val_cnt = data_df[stage].value_counts()
        cnt_df = pd.DataFrame({stage: val_cnt.index, 'cnt': val_cnt.values})
        data_df = pd.merge(data_df, cnt_df, on=stage, how='left')
        data_df = data_df[data_df['cnt'] >= 5].drop(columns=['cnt'])
print('Filter after:', len(data_df))

item_df = item_df[item_df['item_id'].isin(data_df['item_id'])]  # remove unuseful metadata

Filter before: 1000209
Filter after: 999611


### Statistics

In [9]:
# Gather statistics

n_users = data_df['user_id'].value_counts().size
n_items = data_df['item_id'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['time'].min()
max_time = data_df['time'].max()

In [10]:
# Print statistics

time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 6040
# Items: 3416
# Interactions: 999611
Time Span: 2000-04-25/2003-02-28


# Build Dataset

### Interaction data

In [11]:
np.random.seed(RANDOM_SEED)

In [12]:
# Leave only user_id, item_id and time columns

out_df = data_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df.sort_values(by=['time', 'user_id'], kind='mergesort', inplace=True)
out_df = out_df.reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,858,956703932
1,6040,593,956703954
2,6040,2384,956703954
3,6040,1961,956703977
4,6040,2019,956703977


In [13]:
# Reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,721,956703932
1,6040,550,956703954
2,6040,2021,956703954
3,6040,1631,956703977
4,6040,1689,956703977


In [14]:
# Leave one out spliting

clicked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    clicked_item_set[user_id] = set(seq_df['item_id'].values.tolist())
    
def generate_dev_test(data_df):
    result_dfs = []
    n_items = data_df['item_id'].value_counts().size
    for idx in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            for j in range(len(neg_items[i])):
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [15]:
# Train, validation, test split

leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(leave_df.index)

[test_df, dev_df], data_df = generate_dev_test(data_df)
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(987531, 6040, 6040)

In [16]:
train_df.head()

Unnamed: 0,user_id,item_id,time
0,6040,721,956703932
1,6040,550,956703954
2,6040,2021,956703954
3,6040,1631,956703977
4,6040,1689,956703977


In [17]:
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
756,6034,2577,956712388,"[2733, 2608, 1654, 3265, 836, 764, 1732, 1034,..."
967,6035,2722,956713640,"[1168, 1685, 3300, 3003, 2840, 2768, 2958, 374..."
1027,6033,1412,956714112,"[2938, 1646, 844, 2745, 1553, 3225, 926, 2198,..."
1278,6038,985,956717204,"[2085, 1793, 1484, 2339, 198, 895, 1974, 1279,..."
1519,6031,341,956718589,"[3349, 3397, 1559, 1252, 2939, 596, 648, 574, ..."


In [18]:
# Save results

train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)

### Item Metadata

In [19]:
item_df['item_id'] = item_df['item_id'].apply(lambda x: item2id[x])


item_df['i_year'] = item_df['title'].apply(lambda x: int(re.match('.+\((\d{4})\)$', x).group(1)))
item_df = item_df.drop(columns=['title'])
seps = [1900, 1940, 1950, 1960, 1970, 1980, 1985] + list(range(1990, int(item_df['i_year'].max() + 2)))
year_dict = {}
for i, sep in enumerate(seps[:-1]):
    for j in range(seps[i], seps[i + 1]):
        year_dict[j] = i + 1
item_df['i_year'] = item_df['i_year'].apply(lambda x: year_dict[x] if x > 0 else 0)
    
item_df.head()

Unnamed: 0,item_id,i_Action,i_Adventure,i_Animation,i_Children's,i_Comedy,i_Crime,i_Documentary,i_Drama,i_Fantasy,...,i_Horror,i_Musical,i_Mystery,i_Romance,i_Sci-Fi,i_Thriller,i_War,i_Western,i_Other,i_year
0,1,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13
1,2,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,13
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,13
3,4,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,13
4,5,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13


In [20]:
# Save results

item_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)