In [None]:
!pip install surprise

In [2]:
from datetime import datetime
import pandas as pd
import numpy as np
import seaborn as sns
import os
import random
import matplotlib
import matplotlib.pyplot as plt

In [3]:
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error 
import xgboost as xgb
from surprise import Reader, Dataset
from surprise import BaselineOnly
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise.model_selection import GridSearchCV

In [None]:
# Read the Datafiniti hotel reviews CSV export into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='/Datafiniti_Hotel_Reviews_Jun19.csv')
dataset

In [None]:
# Give the dotted review columns short names; `dataset` is modified in place.
dataset.rename(
    columns={
        'reviews.username': 'username',
        'reviews.rating': 'rating',
    },
    inplace=True,
)
dataset

In [None]:
# Encode each hotel's string `id` as a 1-based integer, numbered in order of
# first appearance. The mapping is looked up per row, so every review of a
# hotel receives that hotel's own number even when its reviews are not
# contiguous in the file (and the lookup is O(1) instead of a list scan).
hotel_codes = {}
hotel_id = [
    # setdefault returns the existing code, or stores and returns the next one.
    hotel_codes.setdefault(hotel, len(hotel_codes) + 1)
    for hotel in dataset['id']
]

dataset['hotel_id'] = hotel_id
dataset.head(6)

In [None]:
# Encode each username as a 1-based integer, numbered in order of first
# appearance. The mapping is looked up per row, so a user who reviews again
# later in the file gets their own number back rather than the number of
# whichever new user happened to be seen most recently.
user_codes = {}
user_id = [
    # setdefault returns the existing code, or stores and returns the next one.
    user_codes.setdefault(user, len(user_codes) + 1)
    for user in dataset['username']
]

dataset['user_id'] = user_id
dataset.head(6)

In [None]:
# Keep only the hotel/user identifier columns and the rating; row order is
# left as it is in `dataset`.
user_rating = pd.DataFrame(
    data=dataset,
    columns=['id', 'hotel_id', 'username', 'user_id', 'rating'],
)
user_rating


In [None]:
# One row per hotel: keep the first review encountered for each hotel_id.
hotel_unique = user_rating.drop_duplicates(subset='hotel_id')
hotel_unique

In [11]:
# Sequential 80/20 split: the first 80% of rows form the training set and the
# remaining rows the test set (no shuffling).
split_value = int(0.80 * len(user_rating))
train_data = user_rating.iloc[:split_value]
test_data = user_rating.iloc[split_value:]

In [None]:
# Bar chart: how many training reviews carry each rating value.
plt.figure(figsize=(12, 8))
ax = sns.countplot(data=train_data, x="rating")
ax.set_yticklabels(list(ax.get_yticks()))
ax.tick_params(labelsize=15)
ax.set_title("Count Ratings in train data", fontsize=20)
ax.set_xlabel("Ratings", fontsize=20)
ax.set_ylabel("Number of Ratings", fontsize=20)
plt.show()

In [None]:
# Number of ratings each user contributed to the training set, most active first.
no_rated_hotel_per_user = (
    train_data.groupby("user_id")["rating"]
    .count()
    .sort_values(ascending=False)
)
no_rated_hotel_per_user.head()

In [None]:
# Number of ratings each hotel received in the training set, most rated first.
no_ratings_per_hotel = (
    train_data.groupby("hotel_id")["rating"]
    .count()
    .sort_values(ascending=False)
)
no_ratings_per_hotel.head()

In [21]:

def get_user_item_sparse_matrix(df, shape=None):
    """Build a user x hotel rating matrix in CSR format.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``user_id``, ``hotel_id`` (non-negative
        integers used directly as row / column indices) and ``rating``.
    shape : tuple of int, optional
        Explicit ``(n_users, n_hotels)`` shape. When omitted, the shape is
        inferred from the largest ids present, so matrices built from
        different subsets of the data may differ in shape.

    Returns
    -------
    scipy.sparse.csr_matrix
        Entry ``[u, h]`` is the rating user ``u`` gave hotel ``h``. When the
        same user rated the same hotel several times, the mean rating is
        stored, so values are floats.
    """
    # The (data, (row, col)) constructor sums duplicate coordinates, which
    # would produce values above the rating scale; collapse duplicates to
    # their mean first.
    ratings = df.groupby(["user_id", "hotel_id"], sort=False)["rating"].mean()
    users = ratings.index.get_level_values("user_id").to_numpy()
    hotels = ratings.index.get_level_values("hotel_id").to_numpy()
    sparse_data = sparse.csr_matrix((ratings.to_numpy(), (users, hotels)), shape=shape)
    return sparse_data

In [22]:
# User x hotel rating matrix for the training split.
train_sparse_data = get_user_item_sparse_matrix(df=train_data)

In [23]:
# User x hotel rating matrix for the test split.
test_sparse_data = get_user_item_sparse_matrix(df=test_data)

In [None]:
# Mean over the non-zero entries of the training matrix: total of all stored
# ratings divided by how many entries are non-zero.
global_average_rating = (
    train_sparse_data.sum() / train_sparse_data.count_nonzero()
)
print("Global Average Rating: {}".format(global_average_rating))

In [25]:
def get_average_rating(sparse_matrix, is_user):
    """Average the non-zero ratings along one axis of a rating matrix.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix
        Rating matrix; zero entries are treated as "no rating".
    is_user : bool
        True to average per row, False to average per column.

    Returns
    -------
    dict
        Maps each row (or column) index that has at least one non-zero
        entry to the mean of its non-zero entries. Indices without any
        rating are left out.
    """
    axis = 1 if is_user else 0
    totals = sparse_matrix.sum(axis=axis).A1
    counts = (sparse_matrix != 0).sum(axis=axis).A1
    n_rows, n_cols = sparse_matrix.shape
    size = n_rows if is_user else n_cols

    average_ratings = {}
    for idx in range(size):
        # Skip indices with no ratings to avoid dividing by zero.
        if counts[idx] != 0:
            average_ratings[idx] = totals[idx] / counts[idx]
    return average_ratings

In [26]:
# Mean rating given by each user in the training matrix.
average_rating_user = get_average_rating(sparse_matrix=train_sparse_data, is_user=True)

In [30]:
# Mean rating received by each hotel in the training matrix.
avg_rating_hotel = get_average_rating(sparse_matrix=train_sparse_data, is_user=False)

In [None]:
# Compare the number of distinct users in the full data with the number of
# users that have at least one rating in the training matrix.
total_users = len(np.unique(user_rating["user_id"]))
train_users = len(average_rating_user)
uncommonUsers = total_users - train_users

print("Total no. of Users = {}".format(total_users))
print("No. of Users in train data= {}".format(train_users))
# The precision (2 decimals) is an argument of np.round, not of str.format.
print("No. of Users not present in train data = {}({}%)".format(uncommonUsers, np.round((uncommonUsers/total_users)*100, 2)))

In [None]:
# Compare the number of distinct hotels in the full data with the number of
# hotels that have at least one rating in the training matrix.
total_hotel = len(np.unique(user_rating["hotel_id"]))
train_hotel = len(avg_rating_hotel)
uncommonHotels = total_hotel - train_hotel

print("Total no. of hotels = {}".format(total_hotel))
print("No. of hotels in train data= {}".format(train_hotel))
# The precision (2 decimals) is an argument of np.round, not of str.format.
print("No. of hotels not present in train data = {}({}%)".format(uncommonHotels, np.round((uncommonHotels/total_hotel)*100, 2)))

In [None]:
# Per-review hotel lookup table: numeric hotel id, rating, and original string id.
hotel_data = pd.DataFrame(
    data=dataset,
    columns=['hotel_id', 'rating', 'id'],
)
hotel_data

In [44]:
def compute_user_similarity(sparse_matrix, limit=100):
    """Collect the top cosine similarities for the first `limit` rated users.

    The users processed are the first `limit` rows of `sparse_matrix` (in
    increasing row order) that contain at least one non-zero entry. For each
    of them the cosine similarity to every row of `sparse_matrix` is computed
    and the `limit` largest values are kept, in ascending order.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix
        User x item rating matrix, one row per user.
    limit : int, default 100
        Non-negative. Used both as the number of users processed and as the
        number of similarity values kept per user.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(sparse_matrix.shape[0], limit)``. Row ``u`` holds
        the top similarities of user ``u``; rows of users that were not
        processed stay all-zero. When the matrix has fewer than `limit` rows,
        the values are right-aligned and the leading columns stay zero.
    """
    user_matrix = sparse_matrix.tocsr()  # row indexing needs CSR
    row_index, _ = user_matrix.nonzero()
    selected = np.unique(row_index)[:limit]

    # Size the result from the input instead of a fixed (617, 100) so any
    # matrix and any `limit` work.
    n_users = user_matrix.shape[0]
    top_k = min(limit, n_users)
    similar_arr = np.zeros((n_users, limit))
    if selected.size == 0 or top_k <= 0:
        return similar_arr

    # Compare against the matrix that was passed in (not a notebook global),
    # all selected users in one call.
    sims = cosine_similarity(user_matrix[selected], user_matrix)
    # Sorting each row ascending and keeping the tail yields the top_k largest
    # similarities in ascending order.
    similar_arr[selected, limit - top_k:] = np.sort(sims, axis=1)[:, -top_k:]
    return similar_arr
similar_user_matrix  = compute_user_similarity(train_sparse_data, 100)

In [45]:
def compute_hotel_similarity_count(sparse_matrix, hotel_data, hotel_id):
    """Count the hotels that have non-zero cosine similarity to one hotel.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix
        User x hotel rating matrix; hotels are its columns.
    hotel_data : pandas.DataFrame
        Lookup table; the value in the second column of the row labelled
        `hotel_id` is returned alongside the count.
    hotel_id : int
        Column index of the hotel in `sparse_matrix`, also used as the row
        label in `hotel_data`.

    Returns
    -------
    tuple
        ``(lookup_value, count)`` where ``count`` is the number of hotels
        (the hotel itself included, if it has any rating) whose rating
        columns have non-zero cosine similarity with the given hotel.
    """
    # Only the row for `hotel_id` is needed, so compare that single hotel
    # against all hotels instead of building the full hotel x hotel matrix.
    item_matrix = sparse_matrix.T.tocsr()
    similarity_row = cosine_similarity(item_matrix[hotel_id], item_matrix, dense_output = False)
    # `.iloc[1]` makes the positional access explicit; plain `[1]` on a
    # string-labelled Series is deprecated and will become a label lookup.
    # NOTE(review): `.loc[hotel_id]` selects by index label. With the default
    # RangeIndex of the notebook's `hotel_data` this is the review at that row
    # position, and column 1 is 'rating' -- confirm this is the intended lookup.
    no_of_similar_hotels = hotel_data.loc[hotel_id].iloc[1], similarity_row.count_nonzero()
    return no_of_similar_hotels

In [None]:
# Lookup value and similar-hotel count for hotel 25 in the training matrix.
similar_hotels = compute_hotel_similarity_count(
    sparse_matrix=train_sparse_data,
    hotel_data=hotel_data,
    hotel_id=25,
)
print("Similar Hotels = {}".format(similar_hotels))