# Preprocessing for the LastFM dataset

In [1]:
import numpy as np
import pandas as pd

import time
from calendar import monthrange
from datetime import datetime, timedelta

Load the original TSV files into DataFrames, merge the play events with the user profiles, and sort the result by timestamp.

In [2]:
# Play-event log: tab-separated with no header row, so column names are supplied here.
df_plays = pd.read_csv(
    '../../../data/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv',
    sep='\t',
    header=None,
    names=['userid', 'timestamp', 'artist-id', 'artist-name', 'track-id', 'track-name'],
)

# User profiles: tab-separated, column names taken from the file's own header row.
df_users = pd.read_csv('../../../data/lastfm-dataset-1K/userid-profile.tsv', sep='\t')

# Attach the profile to every play event, drop the columns that are not used
# afterwards, and put the events in timestamp order with a fresh 0..n-1 index.
df_lastfm = (
    df_plays
    .merge(df_users, how='inner', left_on='userid', right_on='#id')
    .drop(['#id', 'artist-id', 'artist-name', 'track-name', 'registered'], axis=1)
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)

In [3]:
# Preview the first five rows of the merged, time-sorted play log.
df_lastfm.head(5)

Unnamed: 0,userid,timestamp,track-id,gender,age,country
0,user_000391,2005-02-14T00:00:07Z,6b4977f4-3c7a-492a-af61-1e877fa66f52,f,,Germany
1,user_000871,2005-02-14T00:00:38Z,9ecc2ab3-7294-43ad-bdeb-f51388a7a6e0,,,Argentina
2,user_000709,2005-02-14T00:01:44Z,1d0f1ea5-0a92-4578-a7e7-3f2a7129da61,m,,Canada
3,user_000285,2005-02-14T00:02:10Z,46909ba9-46c7-461e-a2ef-280eacd550e4,f,23.0,United States
4,user_000142,2005-02-14T00:02:40Z,14025355-94c2-4e9b-b63f-c16cab9e8086,,,Norway


Obtain a subset of the dataset, similarly to:

- [Fast Incremental Matrix Factorization for Recommendation with Positive-only Feedback](http://link.springer.com/chapter/10.1007%2F978-3-319-08786-3_41)
- [Forgetting Methods for Incremental Matrix Factorization in Recommender Systems](http://dl.acm.org/citation.cfm?id=2695820)

In [29]:
# Keep the events from the first observation up to (but excluding) 2005-05-15.
# Timestamps are compared as strings; for the fixed-width ISO-8601 form shown in
# the preview above, string order matches chronological order.
df_lastfm_3m = df_lastfm.loc[df_lastfm['timestamp'].lt('2005-05-15T00:00:00Z')]
df_lastfm_3m.shape

(177196, 6)

In [30]:
# Discard play events whose track id is missing, then renumber the rows from 0.
df_lastfm_3m = df_lastfm_3m.dropna(subset=['track-id']).reset_index(drop=True)
df_lastfm_3m.shape

(159798, 6)

In [31]:
# Preview the last five rows of the filtered subset.
df_lastfm_3m.tail(5)

Unnamed: 0,userid,timestamp,track-id,gender,age,country
159793,user_000298,2005-05-14T23:57:15Z,3dc94d29-3f33-4032-8772-e599f081d085,m,28.0,Argentina
159794,user_000870,2005-05-14T23:58:42Z,e8ba000e-6304-473c-b6a7-49010b1433d7,m,,United Kingdom
159795,user_000293,2005-05-14T23:59:25Z,b83f61e2-a566-49cc-b8dd-9ad5a0caddb2,,39.0,United Kingdom
159796,user_000298,2005-05-14T23:59:45Z,22ea4524-3dcb-44e6-a1bc-4edfd25ad988,m,28.0,Argentina
159797,user_000142,2005-05-14T23:59:51Z,c0d44a5d-f84b-4d4b-babc-0f3937ef6edb,,,Norway


In [67]:
# Distinct users, tracks and countries that occur in the subset
# (each as an unordered list built from a set of the column's values).
user_ids, item_ids, countries = (
    list(set(df_lastfm_3m[column]))
    for column in ('userid', 'track-id', 'country')
)

len(user_ids), len(item_ids), len(countries)

(92, 51072, 16)

Fill NaN variables.

- gender: male (`m`)
- age: mean of the known ages over all play events in the subset
- country: United States

In [33]:
# Impute missing profile fields, one default per column:
#   gender  -> 'm'
#   age     -> mean of the non-missing ages (computed before filling)
#   country -> 'United States'
df_lastfm_3m = df_lastfm_3m.fillna({
    'gender': 'm',
    'age': df_lastfm_3m['age'].mean(),
    'country': 'United States',
})

df_lastfm_3m.tail()

Unnamed: 0,userid,timestamp,track-id,gender,age,country
159793,user_000298,2005-05-14T23:57:15Z,3dc94d29-3f33-4032-8772-e599f081d085,m,28.0,Argentina
159794,user_000870,2005-05-14T23:58:42Z,e8ba000e-6304-473c-b6a7-49010b1433d7,m,27.216032,United Kingdom
159795,user_000293,2005-05-14T23:59:25Z,b83f61e2-a566-49cc-b8dd-9ad5a0caddb2,m,39.0,United Kingdom
159796,user_000298,2005-05-14T23:59:45Z,22ea4524-3dcb-44e6-a1bc-4edfd25ad988,m,28.0,Argentina
159797,user_000142,2005-05-14T23:59:51Z,c0d44a5d-f84b-4d4b-babc-0f3937ef6edb,m,27.216032,Norway


In [34]:
# Encode gender as a float indicator: 'm' -> 1.0, anything else ('f') -> 0.0.
# An explicit comparison is used instead of pd.get_dummies(...)['m'] because
# the dtype get_dummies returns depends on the pandas version (bool on current
# releases), and the ['m'] lookup raises KeyError when no row is 'm'.
df_lastfm_3m['gender'] = df_lastfm_3m['gender'].eq('m').astype(float)
df_lastfm_3m.tail()

Unnamed: 0,userid,timestamp,track-id,gender,age,country
159793,user_000298,2005-05-14T23:57:15Z,3dc94d29-3f33-4032-8772-e599f081d085,1.0,28.0,Argentina
159794,user_000870,2005-05-14T23:58:42Z,e8ba000e-6304-473c-b6a7-49010b1433d7,1.0,27.216032,United Kingdom
159795,user_000293,2005-05-14T23:59:25Z,b83f61e2-a566-49cc-b8dd-9ad5a0caddb2,1.0,39.0,United Kingdom
159796,user_000298,2005-05-14T23:59:45Z,22ea4524-3dcb-44e6-a1bc-4edfd25ad988,1.0,28.0,Argentina
159797,user_000142,2005-05-14T23:59:51Z,c0d44a5d-f84b-4d4b-babc-0f3937ef6edb,1.0,27.216032,Norway


In [40]:
# Standardize age to zero mean and unit variance, using the population
# standard deviation (ddof=0).
df_lastfm_3m['age'] = (
    df_lastfm_3m['age']
    .sub(df_lastfm_3m['age'].mean())
    .div(df_lastfm_3m['age'].std(ddof=0))
)
df_lastfm_3m.tail()

Unnamed: 0,userid,timestamp,track-id,gender,age,country
159793,user_000298,2005-05-14T23:57:15Z,3dc94d29-3f33-4032-8772-e599f081d085,1.0,0.2998332,Argentina
159794,user_000870,2005-05-14T23:58:42Z,e8ba000e-6304-473c-b6a7-49010b1433d7,1.0,-1.358756e-15,United Kingdom
159795,user_000293,2005-05-14T23:59:25Z,b83f61e2-a566-49cc-b8dd-9ad5a0caddb2,1.0,4.506848,United Kingdom
159796,user_000298,2005-05-14T23:59:45Z,22ea4524-3dcb-44e6-a1bc-4edfd25ad988,1.0,0.2998332,Argentina
159797,user_000142,2005-05-14T23:59:51Z,c0d44a5d-f84b-4d4b-babc-0f3937ef6edb,1.0,-1.358756e-15,Norway


Create features used in the model.

1. **elapsed days** from the first sample
2. **user indices** corresponding to *userid*
3. **item indices** corresponding to *track-id*
4. **time of day**, normalized to [0.0, 1.0]

In [63]:
# Reference instants on the current day, used to turn a timestamp into a
# time-of-day value.
now = datetime.now()
# Start of the current day (00:00:00.000000).
midnight = datetime(now.year, now.month, now.day)

# Whole seconds between midnight and 23:59:59 of the same day; `.seconds`
# ignores the microsecond component.
max_sec = (midnight.replace(hour=23, minute=59, second=59, microsecond=59) - midnight).seconds

In [72]:
# Build the per-event model features in row (chronological) order:
#   u_indices / i_indices : integer ids for users / tracks, numbered in order
#                           of first appearance (matching user_ids / track_ids)
#   dts                   : whole 24-hour periods elapsed since the first event
#   times                 : time of day normalized to [0.0, 1.0]
timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

# compute elapsed days from the first sample
head_date = datetime.strptime(df_lastfm_3m.iloc[0]['timestamp'], timestamp_format)

# 23:59:59 expressed in seconds (86399); divisor for the time-of-day feature.
last_sec_of_day = 23 * 3600 + 59 * 60 + 59

user_ids = []
track_ids = []

# id -> position in the lists above. A dict lookup replaces the per-row
# `x in list` / `list.index(x)` scans, which are linear in the number of ids.
user_pos = {}
track_pos = {}

u_indices = []
i_indices = []
dts = []
times = []

for userid, track_id, timestamp in zip(
        df_lastfm_3m['userid'], df_lastfm_3m['track-id'], df_lastfm_3m['timestamp']):
    if userid not in user_pos:
        user_pos[userid] = len(user_ids)
        user_ids.append(userid)
    u_indices.append(user_pos[userid])

    if track_id not in track_pos:
        track_pos[track_id] = len(track_ids)
        track_ids.append(track_id)
    i_indices.append(track_pos[track_id])

    date = datetime.strptime(timestamp, timestamp_format)
    dts.append((date - head_date).days)

    # normalized time in a day [0.0, 1.0]
    # Computed directly from the clock fields, and stored under a name that
    # does not shadow the `time` module imported at the top of the notebook.
    time_of_day = (date.hour * 3600 + date.minute * 60 + date.second) / float(last_sec_of_day)
    times.append(time_of_day)

len(u_indices), len(i_indices), len(dts), len(times)

(159798, 159798, 159798, 159798)

In [73]:
# Attach the per-event feature lists as new columns (same row order as the frame).
df_lastfm_3m = df_lastfm_3m.assign(
    u_index=u_indices,
    i_index=i_indices,
    dt=dts,
    time=times,
)

In [74]:
# Inspect the first five rows, including the u_index / i_index / dt / time columns.
df_lastfm_3m.head(5)

Unnamed: 0,userid,timestamp,track-id,gender,age,country,u_index,i_index,dt,time
0,user_000391,2005-02-14T00:00:07Z,6b4977f4-3c7a-492a-af61-1e877fa66f52,0.0,-1.358756e-15,Germany,0,0,0,8.1e-05
1,user_000871,2005-02-14T00:00:38Z,9ecc2ab3-7294-43ad-bdeb-f51388a7a6e0,1.0,-1.358756e-15,Argentina,1,1,0,0.00044
2,user_000709,2005-02-14T00:01:44Z,1d0f1ea5-0a92-4578-a7e7-3f2a7129da61,1.0,-1.358756e-15,Canada,2,2,0,0.001204
3,user_000285,2005-02-14T00:02:10Z,46909ba9-46c7-461e-a2ef-280eacd550e4,0.0,-1.612447,United States,3,3,0,0.001505
4,user_000142,2005-02-14T00:02:40Z,14025355-94c2-4e9b-b63f-c16cab9e8086,1.0,-1.358756e-15,Norway,4,4,0,0.001852


In [75]:
# Inspect the last five rows, including the u_index / i_index / dt / time columns.
df_lastfm_3m.tail(5)

Unnamed: 0,userid,timestamp,track-id,gender,age,country,u_index,i_index,dt,time
159793,user_000298,2005-05-14T23:57:15Z,3dc94d29-3f33-4032-8772-e599f081d085,1.0,0.2998332,Argentina,10,6855,89,0.998102
159794,user_000870,2005-05-14T23:58:42Z,e8ba000e-6304-473c-b6a7-49010b1433d7,1.0,-1.358756e-15,United Kingdom,36,51071,89,0.999109
159795,user_000293,2005-05-14T23:59:25Z,b83f61e2-a566-49cc-b8dd-9ad5a0caddb2,1.0,4.506848,United Kingdom,29,13963,89,0.999606
159796,user_000298,2005-05-14T23:59:45Z,22ea4524-3dcb-44e6-a1bc-4edfd25ad988,1.0,0.2998332,Argentina,10,1564,89,0.999838
159797,user_000142,2005-05-14T23:59:51Z,c0d44a5d-f84b-4d4b-babc-0f3937ef6edb,1.0,-1.358756e-15,Norway,4,48519,89,0.999907


Export the DataFrame into an intermediate TSV file.

In [76]:
# Write the preprocessed events as a tab-separated file, without the row index.
df_lastfm_3m.to_csv(
    'lastfm.tsv',
    index=False,
    sep='\t',
)