In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
from pandas.api.types import CategoricalDtype
import random
from sklearn import metrics

In [2]:
booking = pd.read_csv('hotel_booking_update.csv')

In [3]:
booking.head()

Unnamed: 0,consumer_id,total_including_tax,name,id,order_id
0,345,251.0,Super8 By Wyndham West Kelowna Bc,1631,6801
1,379,333.6,The Sutton Place Hotel - Edmonton,1636,6838
2,379,312.45,The Sutton Place Hotel - Edmonton,1636,6844
3,387,615.23,Hotel Alma,1639,6847
4,389,110.0,Quality Inn& Conference Centre Downtown,1641,6850


In [28]:
booking.name.value_counts()

Best Western Plus Meridian Hotel, Lloydminster    4
Hotel Signature Quebec                            3
Delta Hotels Ottawa City Centre                   3
Holiday Inn Ottawa Parliament Hill                3
Econo Lodge Inn & Suites University               3
                                                 ..
Four Points By Sheraton Vancouver Airport         1
DoubleTree by Hilton Montreal                     1
Days Inn by Wyndham Moose Jaw                     1
BEST WESTERN WARREN HOTEL                         1
HOLIDAY INN WEST                                  1
Name: name, Length: 379, dtype: int64

The most booked hotel has only been booked 4 times.

In [4]:
booking.consumer_id.value_counts()

493     22
685      5
1764     4
3020     3
718      3
        ..
2427     1
865      1
2914     1
1379     1
514      1
Name: consumer_id, Length: 368, dtype: int64

The most frequent customers has made 22 bookings And below are his (or her) bookings.   

In [5]:
booking.loc[booking['consumer_id'] == 493]

Unnamed: 0,consumer_id,total_including_tax,name,id,order_id
51,493,560.16,Podollan Inn & Spa Grande Prairie,2211,8771
193,493,114.44,Holiday Inn Express Hotel & Suites Edmonton South,2209,8769
198,493,144.4,Hampton Inn by Hilton Lloydminster,2210,8770
204,493,161.59,"Home2 Suites by Hilton Fort St. John, Fort St....",2616,10393
205,493,420.74,"Best Western Plus Meridian Hotel, Lloydminster",2619,10397
206,493,163.49,"Holiday Inn Express Hotel & Suites Edson, Edson",2622,10401
207,493,327.01,Best Western Plus Hinton Inn & Suites,2625,10405
217,493,980.94,Best Western Plus Sherwood Park Inn & Suites,2620,10398
218,493,182.76,"Podollan Inn & Spa Grande Prairie, Grande Prairie",2623,10402
219,493,248.92,"Best Western Plus Fox Creek, Fox Creek",2626,10406


In [6]:
df = booking.groupby(['consumer_id', 'name', 'id'])['total_including_tax'].sum().reset_index()
df['count'] = df['name'].map(booking['name'].value_counts())

In [10]:
hotel_lookup = df[['id', 'name']].drop_duplicates() 
hotel_lookup['id'] = hotel_lookup.id.astype(str)

In [12]:
df['consumer_id'] = df.consumer_id.astype(int) 
df = df[['id', 'count', 'consumer_id']] 
grouped_purchased = df.groupby(['consumer_id', 'id']).sum().reset_index()

In [14]:
customers = list(np.sort(grouped_purchased['consumer_id'].unique()))
hotels = list(grouped_purchased['id'].unique())
quantity = list(grouped_purchased['count']) 
rows = grouped_purchased['consumer_id'].astype(CategoricalDtype(categories = customers)).cat.codes 
cols = grouped_purchased['id'].astype(CategoricalDtype(categories = hotels)).cat.codes
purchases_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=(len(customers), len(hotels)))

In [15]:
purchases_sparse

<368x379 sparse matrix of type '<class 'numpy.int64'>'
	with 405 stored elements in Compressed Sparse Row format>

In [16]:
matrix_size = purchases_sparse.shape[0]*purchases_sparse.shape[1] 
num_purchases = len(purchases_sparse.nonzero()[0]) 
sparsity = 100*(1 - (num_purchases/matrix_size))
sparsity

99.70961913502352

I learned that in generally, for collaborative filtering to work, the maximum sparsity you could get away with would probably be around 99.5% or so. We are at over 99.7%. So we exceed this limit. So we should not expect to get decent results. 

In [17]:
def make_train(ratings, pct_test = 0.2):
    
    test_set = ratings.copy() 
    test_set[test_set != 0] = 1 
    training_set = ratings.copy()  
    nonzero_inds = training_set.nonzero() 
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) 
    random.seed(0) 
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) 
    samples = random.sample(nonzero_pairs, num_samples) 
    user_inds = [index[0] for index in samples] 
    item_inds = [index[1] for index in samples] 
    training_set[user_inds, item_inds] = 0 
    training_set.eliminate_zeros() 
    return training_set, test_set, list(set(user_inds))

In [18]:
hotel_train, hotel_test, hotel_users_altered = make_train(purchases_sparse, pct_test = 0.2)

In [21]:
def implicit_weighted_ALS(training_set, lambda_val = 0.1, alpha = 40, iterations = 10, rank_size = 20, seed = 0):
    
    conf = (alpha*training_set) 
    num_user = conf.shape[0]
    num_hotel = conf.shape[1] 
    
    rstate = np.random.RandomState(seed)
    
    X = sparse.csr_matrix(rstate.normal(size = (num_user, rank_size))) 
    Y = sparse.csr_matrix(rstate.normal(size = (num_hotel, rank_size))) 
                                                                 
    X_eye = sparse.eye(num_user)
    Y_eye = sparse.eye(num_hotel)
    lambda_eye = lambda_val * sparse.eye(rank_size) 
   
    for iter_step in range(iterations): 
        
        yTy = Y.T.dot(Y)
        xTx = X.T.dot(X)
        
        for u in range(num_user):
            conf_samp = conf[u,:].toarray() 
            pref = conf_samp.copy() 
            pref[pref != 0] = 1 
            CuI = sparse.diags(conf_samp, [0]) 
            yTCuIY = Y.T.dot(CuI).dot(Y) 
            yTCupu = Y.T.dot(CuI + Y_eye).dot(pref.T)                                                  
            X[u] = spsolve(yTy + yTCuIY + lambda_eye, yTCupu) 
            
        for i in range(num_hotel):
            conf_samp = conf[:,i].T.toarray() 
            pref = conf_samp.copy()
            pref[pref != 0] = 1 
            CiI = sparse.diags(conf_samp, [0]) 
            xTCiIX = X.T.dot(CiI).dot(X) 
            xTCiPi = X.T.dot(CiI + X_eye).dot(pref.T) 
            Y[i] = spsolve(xTx + xTCiIX + lambda_eye, xTCiPi)
            
    return X, Y.T

In [22]:
user_vecs, hotel_vecs = implicit_weighted_ALS(hotel_train, lambda_val = 0.1, alpha = 15, iterations = 1,
                                            rank_size = 20)

In [23]:
user_vecs[0,:].dot(hotel_vecs).toarray()[0,:5]

array([0., 0., 0., 0., 0.])

In [24]:
alpha = 15
user_vecs, hotel_vecs = implicit.alternating_least_squares((hotel_train*alpha).astype('double'), 
                                                          factors=20, 
                                                          regularization = 0.1, 
                                                         iterations = 50)

This method is deprecated. Please use the AlternatingLeastSquares class instead


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [25]:
def auc_score(predictions, test):
    
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)

In [26]:
def calc_mean_auc(training_set, altered_users, predictions, test_set):
    
    store_auc = [] 
    popularity_auc = [] 
    pop_hotels = np.array(test_set.sum(axis = 0)).reshape(-1) 
    hotel_vecs = predictions[1]
    for user in altered_users: 
        training_row = training_set[user,:].toarray().reshape(-1) 
        zero_inds = np.where(training_row == 0) 
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(hotel_vecs).toarray()[0,zero_inds].reshape(-1)
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        pop = pop_hotels[zero_inds] 
        store_auc.append(auc_score(pred, actual)) 
        popularity_auc.append(auc_score(pop, actual)) 
    
    return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))

In [27]:
calc_mean_auc(hotel_train, hotel_users_altered, 
              [sparse.csr_matrix(user_vecs), sparse.csr_matrix(hotel_vecs.T)], hotel_test)

(0.496, 0.51)

Our RecSys did not beat popularity. Our RecSys had a mean AUC of 0.496, while the popular hotel benchmark had a higher AUC of 0.51. So, simply put, at this moment, we are better off just recommend popular hotels to every one.