# Preprocessing for the LastFM dataset

In [1]:
import numpy as np
import pandas as pd

import time
from calendar import monthrange
from datetime import datetime, timedelta

Convert the original TSV data into DataFrames, and sort them by timestamp

In [2]:
df_plays = pd.read_csv('../../data/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv',
     delimiter='\t',
     header=None,
     names=('userid', 'timestamp', 'artist-id', 'artist-name', 'track-id', 'track-name'))

df_users = pd.read_csv('../../data/lastfm-dataset-1K/userid-profile.tsv',
     delimiter='\t')

df_lastfm = pd.merge(df_plays, df_users, how='inner', left_on='userid', right_on='#id').drop(
    ['#id', 'artist-id', 'artist-name', 'track-name', 'registered'], axis=1).sort_values(
    by='timestamp').reset_index(drop=True)

In [3]:
df_lastfm.head()

Unnamed: 0,userid,timestamp,track-id,gender,age,country
0,user_000391,2005-02-14T00:00:07Z,6b4977f4-3c7a-492a-af61-1e877fa66f52,f,,Germany
1,user_000871,2005-02-14T00:00:38Z,9ecc2ab3-7294-43ad-bdeb-f51388a7a6e0,,,Argentina
2,user_000709,2005-02-14T00:01:44Z,1d0f1ea5-0a92-4578-a7e7-3f2a7129da61,m,,Canada
3,user_000285,2005-02-14T00:02:10Z,46909ba9-46c7-461e-a2ef-280eacd550e4,f,23.0,United States
4,user_000142,2005-02-14T00:02:40Z,14025355-94c2-4e9b-b63f-c16cab9e8086,,,Norway


Obtain a subset of the dataset similary to:

- [Fast Incremental Matrix Factorization for Recommendation with Positive-only Feedback](http://link.springer.com/chapter/10.1007%2F978-3-319-08786-3_41)
- [Forgetting Methods for Incremental Matrix Factorization in Recommender Systems](http://dl.acm.org/citation.cfm?id=2695820)

In [4]:
# 8 month from the first observation
# df_lastfm_600k = df_lastfm[df_lastfm['timestamp'] < '2005-10-18T00:32:59Z']
df_lastfm_600k = df_lastfm.head(666400)

# remove NaN track_id events
df_lastfm_600k = df_lastfm_600k.loc[pd.notnull(df_lastfm_600k['track-id'])].reset_index(drop=True)

# remove unique occurence of <user, item> pairs
df_lastfm_600k = df_lastfm_600k.loc[df_lastfm_600k.duplicated(
    ['userid', 'track-id'], keep=False)].reset_index(drop=True)

df_lastfm_600k = df_lastfm_600k.head(493063)

user_ids = list(set(df_lastfm_600k['userid']))
item_ids = list(set(df_lastfm_600k['track-id']))

df_lastfm_600k.shape, len(user_ids), len(item_ids)

((493063, 6), 164, 65013)

In [5]:
df_lastfm_600k.tail()

Unnamed: 0,userid,timestamp,track-id,gender,age,country
493058,user_000280,2005-10-17T23:50:12Z,29f4dcff-6462-43fa-8c5f-37b3c7322afb,,32.0,United Kingdom
493059,user_000966,2005-10-17T23:50:43Z,5f0be7d5-3dc8-4800-9de9-20443348dd6e,,,United States
493060,user_000273,2005-10-17T23:51:26Z,24c8706a-e05b-4e87-9c11-78d75ed88cf3,f,19.0,United States
493061,user_000833,2005-10-17T23:51:58Z,c954912b-00a3-485f-8130-c592930a5942,f,,United States
493062,user_000958,2005-10-17T23:53:02Z,3d026c3c-625d-4d9f-8e37-3a115501c1b4,m,,Mexico


Check the sparsity. It must be 99.11% according to the iMF paper.

In [8]:
A = np.zeros((len(user_ids), len(item_ids)))

for i, row in df_lastfm_600k.iterrows():
    u_index = user_ids.index(row['userid'])
    i_index = item_ids.index(row['track-id'])
    A[u_index, i_index] += 1
    
(A.size - np.count_nonzero(A)) / float(A.size) * 100.

99.11031864921574

Fill NaN contextual variables. Dummy values are set to:

- sex: male
- age: avg. of the samples
- country: United States

In [6]:
df_lastfm_600k['gender'] = df_lastfm_600k['gender'].fillna('m')
df_lastfm_600k['age'] = df_lastfm_600k['age'].fillna(np.mean(df_lastfm_600k['age']))
df_lastfm_600k['country'] = df_lastfm_600k['country'].fillna('United States')

In [7]:
df_lastfm_600k.tail()

Unnamed: 0,userid,timestamp,track-id,gender,age,country
493058,user_000280,2005-10-17T23:50:12Z,29f4dcff-6462-43fa-8c5f-37b3c7322afb,m,32.0,United Kingdom
493059,user_000966,2005-10-17T23:50:43Z,5f0be7d5-3dc8-4800-9de9-20443348dd6e,m,27.730935,United States
493060,user_000273,2005-10-17T23:51:26Z,24c8706a-e05b-4e87-9c11-78d75ed88cf3,f,19.0,United States
493061,user_000833,2005-10-17T23:51:58Z,c954912b-00a3-485f-8130-c592930a5942,f,27.730935,United States
493062,user_000958,2005-10-17T23:53:02Z,3d026c3c-625d-4d9f-8e37-3a115501c1b4,m,27.730935,Mexico


Create features used in the model.

1. **elapsed days** from the first sample
2. **user indices** corresponds to *userid*
3. **item indices** corresponds to *track-id*

In [9]:
# compute elapsed days from the first sample
head_date = datetime.strptime(df_lastfm_600k.iloc[0]['timestamp'], "%Y-%m-%dT%H:%M:%SZ")

u_indices = []
i_indices = []
dts = []

for i, row in df_lastfm_600k.iterrows():
    u_index = user_ids.index(row['userid'])
    u_indices.append(u_index)
    
    i_index = item_ids.index(row['track-id'])
    i_indices.append(i_index)
    
    date = datetime.strptime(row['timestamp'], "%Y-%m-%dT%H:%M:%SZ")
    dt = (date - head_date).days
    dts.append(dt)
    
len(u_indices), len(i_indices), len(dts)

(493063, 493063, 493063)

In [10]:
df_lastfm_600k['u_index'] = u_indices
df_lastfm_600k['i_index'] = i_indices
df_lastfm_600k['dt'] = dts

In [11]:
df_lastfm_600k.head()

Unnamed: 0,userid,timestamp,track-id,gender,age,country,u_index,i_index,dt
0,user_000391,2005-02-14T00:00:07Z,6b4977f4-3c7a-492a-af61-1e877fa66f52,f,27.730935,Germany,10,62500,0
1,user_000871,2005-02-14T00:00:38Z,9ecc2ab3-7294-43ad-bdeb-f51388a7a6e0,m,27.730935,Argentina,62,24753,0
2,user_000709,2005-02-14T00:01:44Z,1d0f1ea5-0a92-4578-a7e7-3f2a7129da61,m,27.730935,Canada,33,60928,0
3,user_000285,2005-02-14T00:02:10Z,46909ba9-46c7-461e-a2ef-280eacd550e4,f,23.0,United States,82,10729,0
4,user_000142,2005-02-14T00:02:40Z,14025355-94c2-4e9b-b63f-c16cab9e8086,m,27.730935,Norway,20,46441,0


In [12]:
df_lastfm_600k.tail()

Unnamed: 0,userid,timestamp,track-id,gender,age,country,u_index,i_index,dt
493058,user_000280,2005-10-17T23:50:12Z,29f4dcff-6462-43fa-8c5f-37b3c7322afb,m,32.0,United Kingdom,86,42695,245
493059,user_000966,2005-10-17T23:50:43Z,5f0be7d5-3dc8-4800-9de9-20443348dd6e,m,27.730935,United States,90,2383,245
493060,user_000273,2005-10-17T23:51:26Z,24c8706a-e05b-4e87-9c11-78d75ed88cf3,f,19.0,United States,132,44570,245
493061,user_000833,2005-10-17T23:51:58Z,c954912b-00a3-485f-8130-c592930a5942,f,27.730935,United States,148,54009,245
493062,user_000958,2005-10-17T23:53:02Z,3d026c3c-625d-4d9f-8e37-3a115501c1b4,m,27.730935,Mexico,105,4289,245


Export the DataFrame into an intermediate TSV file.

In [14]:
df_lastfm_600k.to_csv('../../data/lastfm-dataset-1K/lastfm-600k.tsv', sep='\t', index=False)