# Import the necessary libraries

In [13]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix
import math

Note that tensorflow.contrib is only supported by 1.x version of tensorflow. 
To avoid import error install lower version of the libray

In [3]:
!pip install tensorflow==1.15



In [3]:
import tensorflow as tf
from tensorflow.contrib.factorization.python.ops import factorization_ops
print('Tensorlow version: {}'.format(tf.__version__))

Tensorlow version: 1.15.0


# Download the 100k movielens data set

In [None]:
!curl -O 'http://files.grouplens.org/datasets/movielens/ml-100k.zip'

In [None]:
# unzip the file
!unzip ml-100k.zip

In [10]:
# create a directory "data" and copy the user rating data "u.data" into it

!mkdir -p data
!cp ml-100k/u.data data/

In [12]:
# get the list of files in the working directory and take a look at the user rating dataset

!ls
!echo ">>u.data"
!head data/u.data

data  ml-100k  ml-100k.zip  readme.md  wals-engine.ipynb  wals.ipynb
>>u.data
196	242	3	881250949
186	302	3	891717742
22	377	1	878887116
244	51	2	880606923
166	346	1	886397596
298	474	4	884182806
115	265	2	881171488
253	465	5	891628467
305	451	3	886324817
6	86	3	883603013


# Data preprocessing

In [4]:
input_file = 'data/u.data'
headers = ['user_id', 'item_id', 'rating', 'timestamp']
header_row = None
ratings_df = pd.read_csv(input_file,
                         sep='\t',
                         names=headers,
                         header=header_row,
                         dtype={
                           'user_id': np.int32,
                           'item_id': np.int32,
                           'rating': np.float32,
                           'timestamp': np.int32,
                         })

In [5]:
def n_unique(colnm):
    """returns the number of unique values in a given column"""
    
    n = len(np.unique(ratings_df[colnm]))
    return(n)

n_users = n_unique("user_id")    
n_items = n_unique("item_id") 

print("Total number of users:",n_users)
print("Total number of items:",n_items)

print("ratings_df.shape =",ratings_df.shape)
ratings_df.head()

Total number of users: 943
Total number of items: 1682
ratings_df.shape = (100000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [6]:
ratings_df.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125754,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [7]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    100000 non-null  int32  
 1   item_id    100000 non-null  int32  
 2   rating     100000 non-null  float32
 3   timestamp  100000 non-null  int32  
dtypes: float32(1), int32(3)
memory usage: 1.5 MB


In [8]:
ratings = ratings_df[['user_id', 'item_id', 'rating']].values
ratings[:,0] -= 1
ratings[:,1] -= 1

print(ratings.shape,ratings.dtype)
for i in [0,1]:
    print("start indexing at:",np.min(ratings[:,i]),"end indexint at:",np.max(ratings[:,i]))


(100000, 3) float64
start indexing at: 0.0 end indexint at: 942.0
start indexing at: 0.0 end indexint at: 1681.0


# Create sparse train and test datasets

In [9]:
def _create_sparse_train_and_test(ratings, n_users, n_items):
    
    """Given ratings, create sparse matrices for train and test sets.
    Args:
      ratings:  list of ratings tuples  (u, i, r)
      n_users:  number of users
      n_items:  number of items
      
    Returns:
       train, test sparse matrices in scipy coo_matrix format.
       
    """
    print("Ratings shape: {}, minimum rating: {}, number of users: {}, number of items: {}".
          format(ratings.shape,ratings.min(),n_users,n_items))

    # pick a test size
    test_set_size = int(len(ratings) * TEST_SET_RATIO)
    print('Test set size:{}'.format(test_set_size))
    
    # select indexes randomly for the test set
    np.random.seed(1)
    test_set_idx = np.random.choice(range(len(ratings)),
                                    size=test_set_size, replace=False)
    test_set_idx = sorted(test_set_idx)

    # sift ratings into train and test sets
    ts_ratings = ratings[test_set_idx]
    tr_ratings = np.delete(ratings, test_set_idx, axis=0)
    
    # create training and test matrices as coo_matrix's
    u_tr, i_tr, r_tr = zip(*tr_ratings)
    tr_sparse = coo_matrix((r_tr, (u_tr, i_tr)), shape=(n_users, n_items))

    u_ts, i_ts, r_ts = zip(*ts_ratings)
    test_sparse = coo_matrix((r_ts, (u_ts, i_ts)), shape=(n_users, n_items))

    return tr_sparse, test_sparse

In [10]:
## 10% of the data will be test set.
TEST_SET_RATIO = 0.1
tr_sparse, test_sparse = _create_sparse_train_and_test(ratings, n_users, n_items)

print("\nTrain sparse matrix dimension: {}".format(tr_sparse.shape))
print("Number of train sparse matix rows: {}".format(tr_sparse.row.shape))
print("Number of train sparse matix colunms: {}\n".format(tr_sparse.col.shape))

print("Test sparse matrix dimension: {}".format(test_sparse.shape))
print("Number of test sparse matix rows: {}".format(test_sparse.row.shape))
print("Number of test sparse matix colunms: {}\n".format(test_sparse.col.shape))

Ratings shape: (100000, 3), minimum rating: 0.0, number of users: 943, number of items: 1682
Test set size:10000

Train sparse matrix dimension: (943, 1682)
Number of train sparse matix rows: (90000,)
Number of train sparse matix colunms: (90000,)

Test sparse matrix dimension: (943, 1682)
Number of test sparse matix rows: (10000,)
Number of test sparse matix colunms: (10000,)



### Make weights function

Linear and log ratings
Notice that both weights are inversely related to the number of users rating the jth item, 
so that the weights down weights the item that have large number of observed ratings. 
The following code visualizes these weights for each item.

In [11]:
def make_wts(data, wt_type, obs_wt, feature_wt_exp, axis):
    """Generate observed item weights.
      Args:
        data:             coo_matrix of ratings data
        wt_type:          weight type, LOG_RATINGS or LINEAR_RATINGS
        obs_wt:           linear weight factor
        feature_wt_exp:   logarithmic weight factor
        axis:             axis to make weights for, 1=rows/users, 0=cols/items
      Returns:
        vector of weights for cols (items) or rows (users)
    """
    assert wt_type in ["LOG_RATINGS","LINEAR_RATINGS"]
    # recipricol of sum of number of items across rows (if axis is 0)
    frac = np.array(1.0/(data > 0.0).sum(axis))
    
    # filter any invalid entries i.e unrated movies
    frac[np.ma.masked_invalid(frac).mask] = 0.0
    
    # normalize weights according to assumed distribution of ratings
    if wt_type == "LOG_RATINGS":
        wts = np.array(np.power(frac, feature_wt_exp)).flatten()
    elif wt_type == "LINEAR_RATINGS":
        wts = np.array(obs_wt * frac).flatten()
    
    # check again for any numerically unstable entries
    assert np.isfinite(wts).sum() == wts.shape[0]
    return wts

# Model training

#### Defining the tensorflow graph

In [12]:
def define_graph(data,PARAMS):
    graph = tf.Graph()
    with graph.as_default():
        

        input_tensor = tf.SparseTensor(indices=np.array([data.row, data.col]).T,
                                       values=(data.data).astype(np.float32),
                                       dense_shape=data.shape)


        row_wts = None
        col_wts = None
        num_rows = data.shape[0]
        num_cols = data.shape[1]

        # initialize the weights 
        if PARAMS["wt_type"] in ["LOG_RATINGS","LINEAR_RATINGS"]:
            row_wts = np.ones(num_rows)
            col_wts = make_wts(data, 
                               PARAMS["wt_type"], 
                               PARAMS['feature_wt_factor'],
                               PARAMS['feature_wt_exp'],axis=0)
            
            #initalize the WALS model instance

        model = factorization_ops.WALSModel(num_rows, num_cols, PARAMS["latent_factors"],
                                            unobserved_weight=PARAMS["unobs_weight"],
                                            regularization=PARAMS["regularization"],
                                            row_weights=row_wts,
                                            col_weights=col_wts)

        return(graph,model,input_tensor)

#### Create a tensorflow session to train the model

In [14]:
def train(graph,model,input_tensor,verbose=False):   
    sess = tf.Session(graph=graph)
    with graph.as_default():
        row_update_op = model.update_row_factors(sp_input=input_tensor)[1]
        col_update_op = model.update_col_factors(sp_input=input_tensor)[1]

        sess.run(model.initialize_op)
        sess.run(model.worker_init)
        for i in range(num_iterations):
            sess.run(model.row_update_prep_gramian_op)
            sess.run(model.initialize_row_update_op)
            sess.run(row_update_op)
            sess.run(model.col_update_prep_gramian_op)
            sess.run(model.initialize_col_update_op)
            sess.run(col_update_op)
            if verbose and i % 1 == 0:
                rf = sess.run(row_factor)
                print("iter",i,rf.mean(),rf.min(),sess.run(col_factor).mean())
    return sess 

#### Function to find the Root Mean Square Error

In [15]:
def get_rmse(output_row, output_col, actual):
        """Compute rmse between predicted and actual ratings.
        Args:
          output_row: evaluated numpy array of row_factor
          output_col: evaluated numpy array of col_factor
          actual: coo_matrix of actual (test) values
        Returns:
          rmse
        """
        mse = 0
        rate_preds = []
        for i in range(actual.data.shape[0]):
            row_pred = output_row[actual.row[i]]
            col_pred = output_col[actual.col[i]]
            rate_pred = np.dot(row_pred, col_pred)
            rate_preds.append(rate_pred)
            err = actual.data[i] - rate_pred
            mse += err * err
        mse /= actual.data.shape[0]
        rmse = math.sqrt(mse)
        return rmse,rate_preds

#### Initalize the hyper parameters needed for the model training

In [16]:
PARAMS = {
    'regularization': 0.01,
    'unobs_weight': .001,  
    'feature_wt_factor': 189.8,
    'feature_wt_exp': 0.08,
}
latent_factors  = [1,  2,   5, 15]
regularizations = [0.001,0.1,  5, 20,100]
wt_types = ["UNIFORM","LINEAR_RATINGS","LOG_RATINGS"]
rmse_best = np.Inf
num_iterations = 40

#### The model training

In [19]:
for wt_type in wt_types:
    print(wt_type)
    for lf in latent_factors:
        for r in regularizations:
            PARAMS["wt_type"] = wt_type
            PARAMS["latent_factors"] = lf
            PARAMS["regularizations"] = r

            graph,model,input_tensor = define_graph(tr_sparse,PARAMS)
            sess = train(graph,model,input_tensor)

            output_row = model.row_factors[0].eval(session=sess)
            output_col = model.col_factors[0].eval(session=sess)        

            rmse_train, _ = get_rmse(output_row, output_col, tr_sparse)
            rmse_test, rate_preds_test = get_rmse(output_row, output_col, test_sparse)
            print("{:10} latent factor {:3.0f}, reg {:5.1f}, rmse (train) {:5.2f}, rmse (test) {:5.2f}".format(" ",
                                                                                                               lf,
                                                                                                               r,
                                                                                                               rmse_train,
                                                                                                               rmse_test))
            if rmse_test < rmse_best:
                rmse_best = rmse_test
                out = {
#                       "rate_preds_test":rate_preds_test,
                      "rmse":rmse_test,
                      "output_row":output_row,
                      "output_col":output_col,
                      "params": PARAMS
                      }

UNIFORM
           latent factor   1, reg   0.0, rmse (train)  2.87, rmse (test)  2.89
           latent factor   1, reg   0.1, rmse (train)  2.87, rmse (test)  2.89
           latent factor   1, reg   5.0, rmse (train)  2.87, rmse (test)  2.89
           latent factor   1, reg  20.0, rmse (train)  2.87, rmse (test)  2.89
           latent factor   1, reg 100.0, rmse (train)  2.87, rmse (test)  2.88
           latent factor   2, reg   0.0, rmse (train)  2.75, rmse (test)  2.78
           latent factor   2, reg   0.1, rmse (train)  2.75, rmse (test)  2.78
           latent factor   2, reg   5.0, rmse (train)  2.75, rmse (test)  2.78
           latent factor   2, reg  20.0, rmse (train)  2.75, rmse (test)  2.78
           latent factor   2, reg 100.0, rmse (train)  2.75, rmse (test)  2.78
           latent factor   5, reg   0.0, rmse (train)  2.56, rmse (test)  2.61
           latent factor   5, reg   0.1, rmse (train)  2.57, rmse (test)  2.62
           latent factor   5, reg   5.0, rms

### Check the parameters that gave the best Root Mean Squared Error

In [26]:
out

{'rmse': 0.9464870272485855,
 'output_row': array([[0.18383189, 0.7959356 ],
        [0.32868737, 0.6887443 ],
        [0.5603745 , 0.3657162 ],
        ...,
        [0.44000283, 0.65004677],
        [0.64669406, 0.60879785],
        [0.49036273, 0.6062054 ]], dtype=float32),
 'output_col': array([[ 2.8985252 ,  4.26063   ],
        [ 2.8035667 ,  3.145607  ],
        [ 2.2716706 ,  3.2159824 ],
        ...,
        [ 1.8420905 , -0.18509825],
        [-0.35046592,  2.7877023 ],
        [-1.6071336 ,  3.0399823 ]], dtype=float32),
 'params': {'regularization': 0.01,
  'unobs_weight': 0.001,
  'feature_wt_factor': 189.8,
  'feature_wt_exp': 0.08,
  'wt_type': 'LOG_RATINGS',
  'latent_factors': 15,
  'regularizations': 100}}

# Comments

The following parameters gave the best Root mean squared error:<br>
Parameters:
- Regularization = 0.01
- unobs_weight : 0.001,
- feature_wt_factor : 189.8,
- feature_wt_exp: 0.08,
- wt_type : LOG_RATINGS,
- latent_factors : 15,
-  regularizations: 100