In [1]:
%cd ../

/Users/hoangle/Projects/two-tower


In [2]:
import numpy as np
import polars as pl
import pandas as pd

from utils import Paths, Dictionary

# Read dataset

In [11]:
path = "data/raw/ml-1m/ml-1m.inter"

interactions_raw = pl.read_csv(path, separator='\t')
interactions_raw.head()

user_id:token,item_id:token,rating:float,timestamp:float
i64,i64,i64,i64
1,1193,5,978300760
1,661,3,978302109
1,914,3,978301968
1,3408,4,978300275
1,2355,5,978824291


In [13]:
path = "data/raw/ml-1m/ml-1m.item"

items_raw = pl.read_csv(path, separator='\t')
items_raw.head()

item_id:token,movie_title:token_seq,release_year:token,genre:token_seq
i64,str,i64,str
1,"""Toy Story""",1995,"""Animation Children's Comedy"""
2,"""Jumanji""",1995,"""Adventure Children's Fantasy"""
3,"""Grumpier Old Men""",1995,"""Comedy Romance"""
4,"""Waiting to Exhale""",1995,"""Comedy Drama"""
5,"""Father of the Bride Part II""",1995,"""Comedy"""


# Process

## Pre-process

In [5]:
cols = {
    'user_id:token': 'user_id', 
    'product_id:token': 'product_id',
    'timestamp:float': 'timestamp'
}

interactions = interactions_raw.select(cols.keys()).rename(cols)

interactions.head()

user_id,product_id,timestamp
i64,i64,i64
0,328100,1514332800
1,328100,1508112000
2,35140,1515024000
3,35140,1515024000
4,725280,1512259200


In [6]:
cols = {
    'product_id:token': 'product_id',
    'genres:token_seq': 'genres',
    'price:float': 'price'
}

items = items_raw.select(cols.keys()).rename(cols)
items.head()

product_id,genres,price
f64,str,str
643980.0,"""[Free to Play, Indie, RPG, Str…","""Free To Play"""
670290.0,"""[Casual, Free to Play, Indie, …","""Free to Play"""
767400.0,"""[Action, Adventure, Casual]""","""0.99"""
773570.0,,"""2.99"""
772540.0,"""[Action, Adventure, Simulation…","""3.99"""


In [7]:
# Filter out null rows
items = items.drop_nulls()

# Convert column `price`
items = items.with_columns(
    pl.when(pl.col('price').str.to_lowercase().str.contains('free'))
    .then(0.0)
    .when(pl.col("price").str.contains("[a-z]+"))
    .then(np.nan)
    .otherwise(pl.col('price'))
    .cast(pl.Float32)
)

items.head()

product_id,genres,price,literal
f64,str,str,f32
643980.0,"""[Free to Play, Indie, RPG, Str…","""Free To Play""",0.0
670290.0,"""[Casual, Free to Play, Indie, …","""Free to Play""",0.0
767400.0,"""[Action, Adventure, Casual]""","""0.99""",0.99
772540.0,"""[Action, Adventure, Simulation…","""3.99""",3.99
774276.0,"""[Free to Play, Indie, Simulati…","""9.99""",9.99


In [9]:
items.with_columns(pl.col('genres').str.split(', ').alias('list')).head()

product_id,genres,price,literal,list
f64,str,str,f32,list[str]
643980.0,"""[Free to Play, Indie, RPG, Str…","""Free To Play""",0.0,"[""[Free to Play"", ""Indie"", … ""Strategy]""]"
670290.0,"""[Casual, Free to Play, Indie, …","""Free to Play""",0.0,"[""[Casual"", ""Free to Play"", … ""Sports]""]"
767400.0,"""[Action, Adventure, Casual]""","""0.99""",0.99,"[""[Action"", ""Adventure"", ""Casual]""]"
772540.0,"""[Action, Adventure, Simulation…","""3.99""",3.99,"[""[Action"", ""Adventure"", ""Simulation]""]"
774276.0,"""[Free to Play, Indie, Simulati…","""9.99""",9.99,"[""[Free to Play"", ""Indie"", … ""Sports]""]"


## Encode

### Encode genres

In [8]:
genres = []

for item in items.itertuples():
    s = item.genres

    s = s[1:len(s) - 1]
    genres.extend(s.split(', '))

map_genre = Dictionary(set(genres))

map_genre.item2id('RPG')

20

In [9]:
def _f_encode(s: str):
    s = s[1:len(s) - 1]
    item_genres = np.int32([map_genre.item2id(genre) for genre in s.split(', ')])

    return item_genres

items['genres'] = items['genres'].map(_f_encode)

items.head()

Unnamed: 0,product_id,genres,price
0,643980,"[19, 6, 20, 14]",0.0
1,670290,"[7, 19, 6, 8, 17]",0.0
2,767400,"[1, 4, 7]",0.99
4,772540,"[1, 4, 8]",3.99
5,774276,"[19, 6, 8, 17]",9.99


## Save dataframe `items`

In [None]:
items.to

# Create fact

In [10]:
fact = (
    interactions
    .merge(items, on='product_id', how='inner')
)

# Only keep nan-free records
fact = fact[~fact.isna().any(axis=1)]

## Remove duplicate
fact = fact.groupby(['user_id', 'product_id']).first().reset_index()

# Apply 5-fore filtering
K = 5

user_count = fact.groupby('user_id')['timestamp'].count()
user_ids_valid = user_count[user_count >= K].index
fact = fact[fact['user_id'].isin(user_ids_valid)]

product_count = fact.groupby('product_id')['timestamp'].count()
product_ids_valid = product_count[product_count >= K].index
fact = fact[fact['product_id'].isin(product_ids_valid)]

fact.head()

Unnamed: 0,user_id,product_id,timestamp,genres,price
20,13,620,1511308800,"[1, 4]",19.99
21,13,35140,1514937600,"[1, 4]",19.99
22,13,220200,1412294400,"[6, 8]",39.990002
23,13,230410,1451520000,"[1, 19]",0.0
24,13,237870,1398643200,"[1, 4, 6, 20, 8]",14.99


### Encode user_id and product_id to indices starting from 0

In [11]:
map_user = Dictionary(fact['user_id'].unique())
map_product = Dictionary(fact['product_id'].unique())

fact['user_id'] = fact['user_id'].map(map_user._map_item2id)
fact['product_id'] = fact['product_id'].map(map_product._map_item2id)

fact.head()

Unnamed: 0,user_id,product_id,timestamp,genres,price
20,0,0,1511308800,"[1, 4]",19.99
21,0,1,1514937600,"[1, 4]",19.99
22,0,2,1412294400,"[6, 8]",39.990002
23,0,3,1451520000,"[1, 19]",0.0
24,0,4,1398643200,"[1, 4, 6, 20, 8]",14.99


# Split train-val-test

## With LOO

In [12]:
fact = fact.sort_values(by='timestamp')

In [13]:
test = fact.groupby('user_id').last().reset_index()
test.head()

Unnamed: 0,user_id,product_id,timestamp,genres,price
0,0,1,1514937600,"[1, 4]",19.99
1,1,9,1489190400,"[1, 8]",19.99
2,2,38,1515110400,"[1, 14]",59.990002
3,3,44,1515110400,"[1, 4, 6]",2.99
4,4,82,1514592000,"[4, 6]",19.99


In [14]:
trainval = (
    fact
    .merge(test, on=['user_id', 'product_id'], how="outer", indicator=True)
    .query("_merge == 'left_only'")
    .rename(columns={
        'timestamp_x': 'timestamp',
        'genres_x': 'genres',
        'price_x': 'price'
    })
    .drop(columns=['timestamp_y', 'genres_y', 'price_y', '_merge'])
)

val = fact.groupby('user_id').last().reset_index()
train = (
    trainval
    .merge(val, on=['user_id', 'product_id'], how="outer", indicator=True)
    .query("_merge == 'left_only'")
    .rename(columns={
        'timestamp_x': 'timestamp',
        'genres_x': 'genres',
        'price_x': 'price'
    })
    .drop(columns=['timestamp_y', 'genres_y', 'price_y', '_merge'])
)

# Save splits

In [16]:
paths = Paths("steam", 'loo')

train.to_parquet(paths.train(), index=False)
val.to_parquet(paths.val(), index=False)
test.to_parquet(paths.test(), index=False)

In [17]:
len(map_genre)

22

In [18]:
len(map_product)

4008

In [19]:
len(map_user)

25569