In [1]:
import pandas as pd
# Load the raw ratings table. The id columns are read explicitly as strings:
# some product ids are purely numeric (e.g. "0205616461") and would lose their
# leading zeros if pandas inferred an integer dtype for them.
df = pd.read_csv(
    "ratings_Beauty.csv",
    dtype={"UserId": str, "ProductId": str},
)
df

Unnamed: 0,UserId,ProductId,Rating,Timestamp
0,A39HTATAQ9V7YF,0205616461,5.0,1369699200
1,A3JM6GV9MNOF9X,0558925278,3.0,1355443200
2,A1Z513UWSAAO0F,0558925278,5.0,1404691200
3,A1WMRR494NWEWV,0733001998,4.0,1382572800
4,A3IAAVS479H7M7,0737104473,1.0,1274227200
...,...,...,...,...
2023065,A3DEHKPFANB8VA,B00LORWRJA,5.0,1405296000
2023066,A3DEHKPFANB8VA,B00LOS7MEE,5.0,1405296000
2023067,AG9TJLJUN5OM3,B00LP2YB8E,5.0,1405382400
2023068,AYBIB14QOI9PC,B00LPVG6V0,5.0,1405555200


In [2]:
# Column names of the ratings table, defined once so the indexing cell can
# refer to them symbolically.
USER_COL = "UserId"
ITEM_COL = "ProductId"
RATING_COL = "Rating"
# NOTE(review): empty placeholder — no cell visible here reads PREDICTION_COL;
# confirm the intended column name before anything depends on it.
PREDICTION_COL = ""

In [3]:
from pandas.api.types import CategoricalDtype
from scipy import sparse

# Fail early with a readable message: a missing id cannot be placed in the
# sorted category list built below (sorting / category creation would raise
# an unrelated-looking error instead).
id_nulls = df[[USER_COL, ITEM_COL]].isna().sum()
if id_nulls.any():
    raise ValueError(
        f"Cannot index ratings with missing ids:\n{id_nulls[id_nulls > 0]}"
    )

users = df[USER_COL].unique()
items = df[ITEM_COL].unique()
shape = (len(users), len(items))

# Create dense integer indices for users and products. The categories are
# sorted, so code k corresponds to the k-th id in sorted order.
user_cat = CategoricalDtype(categories=sorted(users), ordered=True)
item_cat = CategoricalDtype(categories=sorted(items), ordered=True)
df['UserIndex'] = df[USER_COL].astype(user_cat).cat.codes
df['ItemIndex'] = df[ITEM_COL].astype(item_cat).cat.codes

# Lookup tables from the integer index back to the original id.
# verify_integrity=True raises if an index value maps to more than one id.
user_map = (
    df[["UserIndex", USER_COL]]
    .drop_duplicates()
    .sort_values(['UserIndex'])
    .set_index('UserIndex', verify_integrity=True)
)
item_map = (
    df[["ItemIndex", ITEM_COL]]
    .drop_duplicates()
    .sort_values(['ItemIndex'])
    .set_index('ItemIndex', verify_integrity=True)
)

# Conversion via COO matrix: rows are users, columns are products.
# NOTE: if the same (user, product) pair occurs more than once, the COO -> CSR
# conversion sums the ratings of the repeated entries.
coo = sparse.coo_matrix(
    (df[RATING_COL], (df['UserIndex'], df['ItemIndex'])),
    shape=shape,
)
csr = coo.tocsr()

In [4]:
# Re-display the frame to inspect the UserIndex / ItemIndex columns that the
# indexing cell added to it.
df

Unnamed: 0,UserId,ProductId,Rating,Timestamp,UserIndex,ItemIndex
0,A39HTATAQ9V7YF,0205616461,5.0,1369699200,725046,0
1,A3JM6GV9MNOF9X,0558925278,3.0,1355443200,814606,1
2,A1Z513UWSAAO0F,0558925278,5.0,1404691200,313101,1
3,A1WMRR494NWEWV,0733001998,4.0,1382572800,291075,2
4,A3IAAVS479H7M7,0737104473,1.0,1274227200,802842,3
...,...,...,...,...,...,...
2023065,A3DEHKPFANB8VA,B00LORWRJA,5.0,1405296000,759765,249269
2023066,A3DEHKPFANB8VA,B00LOS7MEE,5.0,1405296000,759765,249270
2023067,AG9TJLJUN5OM3,B00LP2YB8E,5.0,1405382400,1035084,249271
2023068,AYBIB14QOI9PC,B00LPVG6V0,5.0,1405555200,1195527,249272


In [2]:
from utils import pandas_df_to_csr

In [5]:
# Run the conversion through the helper imported from utils.
# NOTE(review): presumably returns (user lookup, item lookup, CSR rating
# matrix) — confirm against utils.pandas_df_to_csr; only the third value is
# inspected here.
user_index, item_index, ratings_csr = pandas_df_to_csr(df)
ratings_csr

<1210271x249274 sparse matrix of type '<class 'numpy.float64'>'
	with 2023070 stored elements in Compressed Sparse Row format>

In [49]:
# Lookup table from the dense UserIndex back to the original user id.
# USER_COL is used rather than a literal so this stays in step with the column
# constants, as the indexing cell already does.
user_map = (
    df[["UserIndex", USER_COL]]
    .drop_duplicates()
    .sort_values(["UserIndex"])
    .set_index("UserIndex", verify_integrity=True)
)
user_map

Unnamed: 0_level_0,UserId
UserIndex,Unnamed: 1_level_1
0,A00008821J0F472NDY6A2
1,A000186437REL8X2RW8UW
2,A0002574WYJMBWKNCPY8
3,A00029263J863WSR0TDRS
4,A00031961JI1CBNV98TW
...,...
1210266,AZZZLZXCEE4WK
1210267,AZZZMSZI9LKE6
1210268,AZZZO4QCZROW8
1210269,AZZZRS1YZ8HVP


In [27]:
# Step 1 of building the lookup: keep one row per distinct
# (UserIndex, user id) pair. The original row labels of df are retained.
# USER_COL replaces the hard-coded column name for consistency with the
# column constants.
user_map = df[["UserIndex", USER_COL]].drop_duplicates()
user_map

Unnamed: 0,UserIndex,UserId
0,725046,A39HTATAQ9V7YF
1,814606,A3JM6GV9MNOF9X
2,313101,A1Z513UWSAAO0F
3,291075,A1WMRR494NWEWV
4,802842,A3IAAVS479H7M7
...,...,...
2023054,1012502,ADQ41IJPQW2TN
2023058,254698,A1SJD7QDROVPCC
2023059,1030275,AFPRQT3V8C1U1
2023061,249628,A1RYQPQ01T5D5R


In [29]:
# Step 2 of building the lookup: order the rows by their dense user index.
user_map = user_map.sort_values(by="UserIndex")
user_map

Unnamed: 0,UserIndex,UserId
1609862,0,A00008821J0F472NDY6A2
299576,1,A000186437REL8X2RW8UW
1791795,2,A0002574WYJMBWKNCPY8
1557570,3,A00029263J863WSR0TDRS
898429,4,A00031961JI1CBNV98TW
...,...,...
1051117,1210266,AZZZLZXCEE4WK
1075015,1210267,AZZZMSZI9LKE6
1768894,1210268,AZZZO4QCZROW8
1169932,1210269,AZZZRS1YZ8HVP


In [30]:
# Step 3 of building the lookup: promote UserIndex to the frame's index.
# verify_integrity=True raises if any index value appears more than once.
# The guard keeps the cell re-runnable: once UserIndex is the index it is no
# longer a column, and calling set_index again would raise KeyError.
if "UserIndex" in user_map.columns:
    user_map = user_map.set_index("UserIndex", verify_integrity=True)
user_map

Unnamed: 0_level_0,UserId
UserIndex,Unnamed: 1_level_1
0,A00008821J0F472NDY6A2
1,A000186437REL8X2RW8UW
2,A0002574WYJMBWKNCPY8
3,A00029263J863WSR0TDRS
4,A00031961JI1CBNV98TW
...,...
1210266,AZZZLZXCEE4WK
1210267,AZZZMSZI9LKE6
1210268,AZZZO4QCZROW8
1210269,AZZZRS1YZ8HVP


In [45]:
# Label-based lookup: fetch the user ids stored under index labels 0 through 4.
user_map.loc[range(5)]

Unnamed: 0_level_0,UserId
UserIndex,Unnamed: 1_level_1
0,A00008821J0F472NDY6A2
1,A000186437REL8X2RW8UW
2,A0002574WYJMBWKNCPY8
3,A00029263J863WSR0TDRS
4,A00031961JI1CBNV98TW
