In [4]:
import pandas as pd
import numpy as np
import os
import findspark
import pyspark

In [5]:
col = np.load(os.path.join(os.path.join(os.getcwd(), '../models/als_ml_train_20210310_084446/col.npy')))
col

array([[ 2.4088502e-02,  4.5487039e-02, -5.9732427e-03, ...,
         3.9603140e-02, -1.2920763e-01,  3.1931356e-02],
       [ 5.6173459e-02, -3.1872738e-02, -1.8654661e-02, ...,
         4.1457918e-02,  7.9145906e-03, -8.8962384e-02],
       [ 3.6168084e-02,  5.5262349e-03,  2.6637517e-02, ...,
         7.8163475e-02, -1.2771778e-01,  7.8532314e-03],
       ...,
       [ 3.4128662e-02, -1.2621130e-02,  1.8723119e-02, ...,
         6.5758708e-03, -1.1179015e-02, -1.4638690e-02],
       [ 2.6132176e-03,  2.0419287e-03,  2.0713739e-03, ...,
        -4.4840756e-03, -3.8591241e-03, -8.2134764e-05],
       [ 3.5262550e-03, -4.0169805e-03,  4.5639514e-03, ...,
        -6.4203939e-03, -1.6353601e-03, -2.8254604e-03]], dtype=float32)

# Pre-processing Data
- Import from csv and convert to df
- Drop unused columns and filter invalid rows
- Make interaction array
- Split data to train and test sets
- Convert training set to sparse matrix R

In [6]:
# import data and convert to df
path = os.path.join(os.getcwd(), '../data/u.purchase.csv')

names = ['user_id','username','item_id','product_name','status']
df = pd.read_csv(path, ',', names=names, engine='python', skiprows=1) # skip header row
df.head()

Unnamed: 0,user_id,username,item_id,product_name,status
0,6,Andy Fajar Handika,9463,Nasi Goreng Selimut,VOID BY USER
1,6,Andy Fajar Handika,3663,Nasi Jenggo Ikan Cakalang,VOID BY USER
2,6,Andy Fajar Handika,5930,Nasi Kebuli Sapi,VOID BY USER
3,12,Thomas Dian,6845,Nasi Rames Ayam Bumbu Bali,SUCCESS
4,12,Thomas Dian,7233,Nasi Dori Asam Manis,PAYMENT EXPIRED


In [7]:
# remove unused columns
df.drop(['username', 'product_name'], axis=1, inplace=True)

# only keep SUCCESS status
df = df.drop(df[df.status!='SUCCESS'].index)
df.drop(['status'], axis=1, inplace=True)
df.head()

Unnamed: 0,user_id,item_id
3,12,6845
9,12,6271
12,76,7120
13,76,7121
14,76,7122


In [8]:
# get unique users and items
users = df.user_id.unique()
items = df.item_id.unique()

In [9]:
# Make interaction list [u, i, r_ui]
interactions = np.array([[0,0,0]])

for u in users:
    user_purchases = df[df.user_id==u].item_id
    unique, counts = np.unique(user_purchases, return_counts=True)
    u_interactions = np.array(list(zip(np.zeros(len(counts), dtype=int), unique, counts)))
    for index, u_in in enumerate(u_interactions):
        # change item to indexes and add user index
        _, i, r_ui = u_in
        i_i = np.where(items==i)[0]
        u_i = np.where(users==u)[0]
        u_interactions[index] = [u_i, i_i, r_ui]
    # add to interactions array
    interactions = np.vstack([interactions, u_interactions]) 

np.delete(interactions, 0)
interactions

array([[   0,    0,    0],
       [   0,    1,    1],
       [   0,    0,    1],
       ...,
       [9383,  105,    1],
       [9384,  585,    1],
       [9385,  125,    1]])

In [10]:
# split training
np.random.seed(0)

def train_test_split(interactions, ts_ratio):
    shuffled = np.random.permutation(interactions)
    ts_length = round(ts_ratio * len(interactions))
    ts_interactions = shuffled[:ts_length]
    tr_interactions = shuffled[ts_length:]
    return tr_interactions, ts_interactions

tr_interactions, ts_interactions = train_test_split(interactions, 0.1)

In [11]:
# Make R (r_ui matrix)
from scipy import sparse
u_tr, i_tr, r_tr = zip(*tr_interactions)
R = sparse.csr_matrix((r_tr, (u_tr, i_tr)), shape=(len(users), len(items)))
R.toarray()

array([[1, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
# Make ts (r_ui matrix)

u_ts, i_ts, r_ts = zip(*ts_interactions)
ts = sparse.csr_matrix((r_ts, (u_ts, i_ts)), shape=(len(users), len(items)))
ts.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
sparcity = R.count_nonzero() / (len(users)*len(items))
sparcity*100

0.259190269509105

# Create Model
- Init weights (confidence matrix)
- train()
    1. Load params (stored in /data/x.json)
    2. Iterate num of iterations
    3. Alternate solving row and columns
    4. Return row and column matrices


In [14]:
from scipy.sparse.linalg import spsolve

def implicit_als(R, _alpha=40, iters=10, _lambda=0.1, latent_factors=10):
    C = R * _alpha
    n_users, n_items = R.shape

    # init random X and Y matrices with normal distribution
    X = sparse.csr_matrix(np.random.normal(size=(n_users, latent_factors)))
    Y = sparse.csr_matrix(np.random.normal(size=(n_items, latent_factors)))

    # compute I, and lambda * I
    I = sparse.eye(latent_factors)

    X_I = sparse.eye(n_users)
    Y_I = sparse.eye(n_items)
    l_I = _lambda * I

    # TRAIN ITERATIONS
    for i in range(iters):
        print(f"iteration {i+1} of {iters}")

        yTy = Y.T.dot(Y)
        xTx = X.T.dot(X)

        # loop all users
        for u in range(n_users):
            u_row = C[u,:].toarray()[0]

            # calculate preference p(u) binary values
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0

            # calculate Cu and CuI
            CuI = sparse.diags(u_row)
            Cu = CuI + Y_I

            yT_CuI_y = Y.T.dot(CuI).dot(Y)
            yT_Cu_pu = Y.T.dot(Cu).dot(p_u.T)
            X[u] = spsolve(yTy + yT_CuI_y + l_I, yT_Cu_pu) # change user row to optimised
        
        for i in range(n_items):
            i_row = C[:,i].T.toarray()[0]

            p_i = i_row.copy()
            p_i[p_i != 0] = 1.0

            CiI = sparse.diags(i_row)
            Ci = CiI + X_I

            xT_Ci_x = X.T.dot(CiI).dot(X)
            xT_Ci_pi = X.T.dot(Ci).dot(p_i.T)
            Y[i] = spsolve(xTx + xT_Ci_x + l_I, xT_Ci_pi)
    return X, Y


In [15]:
# X, Y = implicit_als(R, _alpha=15, iters=10, _lambda=0.1, latent_factors=10)

In [16]:
import implicit

model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.2, iterations=10)

# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 20
data_conf = (R.T * alpha_val).astype('double')

model.fit(data_conf)

100%|██████████| 10/10 [00:02<00:00,  4.92it/s]


In [17]:
import math

def rmse(pred, ts):
    ts = ts.toarray()
    mse = 0
    for u in range(ts.shape[0]):
        error = ts[u] - pred[u]
        mse += np.sum(error**2)
        mse /= ts.shape[1]
    mse /= ts.shape[0]
    rmse = math.sqrt(mse)
    return rmse

u_ts, i_ts, r_ts = zip(*ts_interactions)
ts = sparse.csr_matrix((r_ts, (u_ts, i_ts)), shape=(len(users), len(items)))
len(r_tr)

In [18]:
rmse(norm_pred, ts)

NameError: name 'norm_pred' is not defined

In [90]:
from sklearn.preprocessing import normalize

pred = np.dot(model.user_factors, model.item_factors.T)
norm_pred = normalize(pred, axis=1)

In [100]:
norm_pred.shape

(9386, 5522)

In [91]:

names = ['user_id','username','item_id','product_name','status']
df_new = pd.read_csv(path, ',', names=names, engine='python', skiprows=1) # skip header row
df_new.drop(['user_id', 'username', 'status'], axis=1, inplace=True)
df_new = df_new.drop_duplicates()
item_names = df_new.to_numpy()

In [79]:
np.where(item_names[:,0]==14936)[0]

array([], dtype=int64)

In [96]:
os.path.join("/hyperparams", "/default.json")

'/default.json'

In [1]:
def recommend(user_id, k):
    user_i = np.where(users==user_id)[0]
    user_pred = norm_pred[user_i][0]
    recommendations = []
    for _ in range(k):
        max_i = np.argmax(user_pred)
        user_pred = np.delete(user_pred, max_i)
        item_id = items[max_i]
        item_name = item_names[np.where(item_names[:,0]==item_id)[0][0]][1]
        recommendations.append(item_name)
    return recommendations

In [2]:
recommend(144, 10)

NameError: name 'np' is not defined

In [None]:
from sklearn.preprocessing import normalize

normed_matrix = normalize(matrix, axis=1, norm='l1')

In [121]:
recommended = model.recommend(68072, R)

IndexError: index 68072 is out of bounds for axis 0 with size 9386

In [28]:
from pyspark.mllib.recommendation import ALS
from pyspark import SparkContent
sc = SparkContext("local", "First App")

NameError: name 'SparkContext' is not defined

In [25]:
rdd = sc.parallelize(tr_interactions)

In [27]:
model = ALS.trainImplicit(tr_df, rank=5, lambda_=0.01, alpha=1.0, iterations=5)

TypeError: Ratings should be represented by either an RDD or a DataFrame, but got <class 'pandas.core.frame.DataFrame'>.