# Recomendación de hoteles.
 - Recomendar hoteles en base a la calificación de los usuarios.
 - 

In [1]:
# Cargar librerías
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds

In [2]:
# Leer la data
data = pd.read_csv('tripadvisor_hoteles_cusco.csv')
data.head()

Unnamed: 0,f1_hotel,f2_costo,f3_cliente,f4_titulo,f5_contenido,f6_calificacion,f7_fecHosped,f8_page
0,Casa Cartagena Boutique Hotel & Spa,"PEN 1,120",Wayne S,Neat little place in the center of Cusco.,Nice boutique hotel. Staff were very friendly ...,50,October 2023,1.0
1,Casa Cartagena Boutique Hotel & Spa,"PEN 1,120",Leslie W,Perfect hotel and experience,My husband and I stayed here for two nights wh...,50,September 2023,1.0
2,Casa Cartagena Boutique Hotel & Spa,"PEN 1,120",Mimi,Amazing stay at Cusco,We had a pleasant stay at this hotel for 3 nig...,50,September 2023,1.0
3,Casa Cartagena Boutique Hotel & Spa,"PEN 1,120",T0niaYVR,Amazing service and hospitality received,From the start of my stay to the end of it I f...,50,August 2023,1.0
4,Casa Cartagena Boutique Hotel & Spa,"PEN 1,120",Natacha Talbot,Breathtaking place and walkable to everything,"If you come to Cusco, this is the place to sta...",50,July 2023,1.0


In [3]:
data.nunique()

f1_hotel            1089
f2_costo             204
f3_cliente         35332
f4_titulo          33585
f5_contenido       41478
f6_calificacion        5
f7_fecHosped         236
f8_page              248
dtype: int64

Se ha extraido data de:
- 1,089 hoteles.
- 35,332 clientes únicos.
- 41,478 opiniones o reviews


In [4]:
n_custom = data.f3_cliente.unique().shape[0]
n_hotels = data.f1_hotel.unique().shape[0]

print("\n Número de clientes = " + str(n_custom) + ' | Número de hoteles = ' + 
     str(n_hotels))


 Número de clientes = 35333 | Número de hoteles = 1089


In [6]:
# Filtramos hoteles que tengan más de 2 calificación
print('Hay ', (data.f1_hotel.value_counts()>3).sum(), 'con más de un comentario')

Hay  643 con más de un comentario


In [7]:
# Filtramos solamente estos clientes
df_filter = data.f1_hotel.value_counts().reset_index()
#user_filter
hotels_f = df_filter[df_filter.f1_hotel>3]['index'].tolist()
data1 = data[data.f1_hotel.isin(hotels_f)]

In [8]:
n_custom = data1.f3_cliente.unique().shape[0]
n_hotels = data1.f1_hotel.unique().shape[0]

print("\n Número de clientes = " + str(n_custom) + ' | Número de hoteles = ' + 
     str(n_hotels))


 Número de clientes = 34727 | Número de hoteles = 643


In [9]:
def preparar_data(df):
    """
    Función que recibe el dataset de entrada y preprocesa la data para utilizarlo en el sistema de recoemndacióna aimplementar.
    """
    # Seleccionar cliente_id, hotel_id, calificación
    df = df[['f3_cliente', 'f1_hotel', 'f6_calificacion']].copy()
    # Eliminar clientes sin id
    df = df.dropna()
    # constuimos encoder y decoder, por que la data que pasemos al sistema solo acepta enteros.
    l_customs= list(set(df.f3_cliente))
    l_hotels = list(set(df.f1_hotel))
    
    # Encoder
    encoder_customs = dict(zip(l_customs, range(n_custom)))
    encoder_hotels = dict(zip(l_hotels, range(n_hotels)))
    
    # Decoder
    # Invertir el diccionario
    decoder_customs = {valor: clave for clave, valor in encoder_customs.items()}
    decoder_hotels = {valor: clave for clave, valor in encoder_hotels.items()}
    
    # Transformamos el dataset
    df['f3_cliente'] = df['f3_cliente'].map(encoder_customs)
    df['f1_hotel'] = df['f1_hotel'].map(encoder_hotels)


    
    # cambiamos de tipo de datos a todas las features del datset
    for col in df.columns:
        df[col] = df[col].astype(int)
        
    # Poner calificaciones entre 0 y 5
    #df['f6_calificacion'] = df['f6_calificacion']/10
    
    return df, encoder_customs, encoder_hotels, decoder_customs, decoder_hotels

In [10]:
data2, encoder_customs, encoder_hotels, decoder_customs, decoder_hotels = preparar_data(data1)
data2.sort_values('f3_cliente').head()

Unnamed: 0,f3_cliente,f1_hotel,f6_calificacion
14686,0,50,50
40525,0,125,50
21482,1,563,40
33,1,128,50
7524,2,72,50


In [11]:
# Crear matriz usuario  - item - matriz de similiridad
df_matriz = np.zeros((n_custom, n_hotels))

for line in data2.itertuples():
    df_matriz[line[1], line[2]] = line[3]
    
df_matriz

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
df_matriz.shape

(34727, 643)

## Calculamos similaridad entre usuarios y entre hoteles = items

In [13]:
%%time
# Calculamos las similaridades entre items
user_similarity = pairwise_distances(df_matriz, metric = 'cosine')
item_similarity = pairwise_distances(df_matriz.T, metric= 'cosine')

Wall time: 26.6 s


In [14]:
# Top 3 usuarios similares para el usuario 7
print("Usuarios similares al 7: \n",
     pd.DataFrame(user_similarity).loc
      [6,pd.DataFrame(user_similarity).loc
       [6,:]>0].sort_values(ascending = False)[0:3])

Usuarios similares al 7: 
 0        1.0
23140    1.0
23154    1.0
Name: 6, dtype: float64


In [15]:
# Top 3 item similares para el ítem 7
pd.DataFrame(item_similarity).loc[
    5, pd.DataFrame(item_similarity).loc[5,:]>0].sort_values(ascending = False)[0:3]

0      1.0
430    1.0
423    1.0
Name: 5, dtype: float64

## Predecir puntuacionees en base a similaridades

In [16]:
# Función por item basados en la predicción del ratio
def item_based_prediction(rating_matrix, similarity_matrix):
    return(rating_matrix.dot(similarity_matrix)/
           np.array([np.abs(similarity_matrix).sum(axis = 1)]))

# Función para usuarios basados en la predicción del ratio.
def user_based_prediction(rating_matrix, similarity_matrix):
    mean_user_rating = rating_matrix.mean(axis=1)
    ratings_diff = (rating_matrix - mean_user_rating[:,np.newaxis])
    return(mean_user_rating[:,np.newaxis]+similarity_matrix.dot(ratings_diff)/
          np.array([np.abs(similarity_matrix).sum(axis = 1)]).T)

In [17]:
%%time
item_based_prediction = item_based_prediction(df_matriz, item_similarity)
user_based_prediction = user_based_prediction(df_matriz, user_similarity)

Wall time: 12.7 s


In [18]:
# Calculamos el RMSE
def rmse(prediction, actual):
    prediction = prediction[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    #return(sqrt(mean_squared_error(prediction, actual)))
    return(mean_absolute_error(prediction, actual))

In [19]:
print('User-based CF RMSE:' + str(rmse(user_based_prediction, df_matriz)/10))
print('Item-based CF RMSE:' + str(rmse(item_based_prediction, df_matriz)/10))

User-based CF RMSE:4.311046644026965
Item-based CF RMSE:4.319112015009177


In [20]:
print('User-based CF RMSE:' + str(rmse(user_based_prediction, df_matriz)/10))
print('Item-based CF RMSE:' + str(rmse(item_based_prediction, df_matriz)/10))

User-based CF RMSE:4.311046644026965
Item-based CF RMSE:4.319112015009177


## Buscamos dado un hotel que otros hoteles se le recomendaría a un usuario

In [21]:
def find_top5_recomendet(y_pred, valor_buscado):
    # Predicciones para peliculas que los usuarios no han calificado aún
    # Buscar la clave correspondiente al valor
    clave_encontrada = decoder_hotels.get(valor_buscado)

    # Imprimir la clave encontrada
    if clave_encontrada is not None:
        print(f"Top 5 hoteles recomendados o similares a: {clave_encontrada}")
        
        # Prediccin para películas que el usuario 6 no ha calificado aún.
        predictions = y_pred.loc[valor_buscado-1, pd.DataFrame(df_matriz).loc[valor_buscado-1,:]==0]
        top = predictions.sort_values(ascending = False).head()
        recomendations = pd.DataFrame(data = top)
        recomendations.columns = ['Predicted Rating']
        recomendations = recomendations.reset_index()
        recomendations['hotel'] = recomendations['index'].map(decoder_hotels)
        print('*'*10)
        for i in recomendations.index:
            print(f'Opción {i+1} --> Score = {round(recomendations.loc[i,"Predicted Rating"], 6)}, --> Hotel {recomendations.loc[i,"hotel"]}')
    
    
    else:
        print(f"No se encontró ninguna clave para el valor {valor_buscado}")
        
    return recomendations

In [22]:
y_user_based = pd.DataFrame(user_based_prediction)
y_item_based = pd.DataFrame(item_based_prediction)

In [26]:
valor_buscado = 10
recomendations = find_top5_recomendet(y_user_based, valor_buscado)

Top 5 hoteles recomendados o similares a: Hostal Tu Hogar
**********
Opción 1 --> Score = 3.368263, --> Hotel Monasterio, A Belmond Hotel, Cusco
Opción 2 --> Score = 1.516748, --> Hotel LOKI Cusco
Opción 3 --> Score = 1.21048, --> Hotel Pariwana Hostel Cusco
Opción 4 --> Score = 1.170275, --> Hotel Ecopackers
Opción 5 --> Score = 1.127347, --> Hotel Tierra Viva Cusco Saphi


In [27]:
valor_buscado = 10
recomendations = find_top5_recomendet(y_item_based, valor_buscado)

Top 5 hoteles recomendados o similares a: Hostal Tu Hogar
**********
Opción 1 --> Score = 0.078035, --> Hotel Pirwa Garcilaso Hostel
Opción 2 --> Score = 0.078018, --> Hotel Rumi Wasi
Opción 3 --> Score = 0.078018, --> Hotel Pirwa Posada del Corregidor
Opción 4 --> Score = 0.078011, --> Hotel Pirwa Suecia Bed & Breakfast
Opción 5 --> Score = 0.078008, --> Hotel Pirwa Hostel Backpackers Familiar, San Blas


-------------
## Basado en SVD

In [28]:
# Código de ejemplo para sistemas de recomendación usando SVD
# Calcular el nivel de dispersión.
sparsity = round(1.0-len(data2)/float(n_custom*n_hotels),3)
print(sparsity)
print('El nivel de dispersión es: ' + str(sparsity*100) + '%')

0.998
El nivel de dispersión es: 99.8%


-------------

In [29]:
# Conseguir el svd de la matriz de entrenamiento, elegir k.
u, s, vt = svds(df_matriz, k = 10)
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u,s_diag_matrix), vt)
print('User based CF MSE: ' + str(rmse(X_pred, df_matriz)/10))

User based CF MSE: 3.206532089263736


In [30]:
y_user_based_svd = pd.DataFrame(X_pred)
#y_user_based.columns = d_hotesl.keys()
#y_user_based.index = d_customs.values()
y_user_based_svd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,633,634,635,636,637,638,639,640,641,642
0,0.000050,0.115744,-3.260170e-04,7.299563e-04,0.001519,4.290430e-04,-0.000355,0.086914,0.000117,-0.000199,...,2.143995e-04,0.064182,0.001444,0.002528,-0.000379,0.069543,0.003900,3.219362e-04,0.001340,0.000836
1,-0.000280,0.047781,-2.906757e-04,6.654487e-05,0.000275,-1.026010e-04,-0.000545,-0.000368,-0.000675,0.000226,...,7.352703e-05,0.103534,0.000515,-0.000329,-0.003156,-0.000311,-0.001214,1.932013e-02,0.043334,0.052288
2,-0.000152,0.035703,2.225547e-02,3.123651e-02,0.012873,-2.242631e-04,-0.000496,-0.000515,-0.000300,-0.001394,...,3.708496e-06,-0.004901,-0.000149,-0.000025,-0.004090,-0.000428,0.034644,-1.205687e-04,0.021933,0.019089
3,-0.000140,0.000125,-3.883039e-04,-1.295042e-05,0.039806,-1.210427e-04,0.010049,-0.000415,-0.000224,-0.000093,...,1.229178e-04,0.117545,0.000898,0.000123,0.095206,-0.000481,-0.000794,-1.265186e-04,-0.000497,-0.002913
4,0.000033,0.002583,2.566255e-05,1.447736e-04,0.000156,7.745387e-05,0.000087,0.000363,0.000076,0.000102,...,3.645218e-06,0.001119,0.000090,0.000090,0.001147,0.000291,0.000775,3.249006e-05,0.000106,0.000974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34722,0.000064,0.004378,-7.359800e-06,5.116929e-04,0.000211,4.042575e-04,0.000374,0.000297,0.000066,0.000822,...,8.142244e-06,0.002773,0.000319,0.000232,0.002945,0.000237,0.000828,1.170049e-04,0.000283,0.001593
34723,0.000001,0.000193,2.697742e-05,5.577424e-05,0.000018,1.416708e-05,0.000014,0.000007,0.000014,-0.000003,...,2.647866e-07,0.000064,0.000010,0.000007,0.000091,0.000005,0.000084,-2.549966e-07,0.000028,0.000066
34724,0.000006,0.000210,-9.277247e-07,-9.840736e-07,0.000003,-2.851070e-07,0.000003,0.000011,0.000023,-0.000002,...,1.938564e-07,0.000054,0.000006,0.000006,0.000055,0.000009,0.000081,-3.779771e-07,-0.000002,0.000100
34725,-0.000152,0.035703,2.225547e-02,3.123651e-02,0.012873,-2.242631e-04,-0.000496,-0.000515,-0.000300,-0.001394,...,3.708496e-06,-0.004901,-0.000149,-0.000025,-0.004090,-0.000428,0.034644,-1.205687e-04,0.021933,0.019089


In [34]:
valor_buscado = 21
recomendations = find_top5_recomendet(y_user_based_svd, valor_buscado)

Top 5 hoteles recomendados o similares a: Teatro Inka Hostal
**********
Opción 1 --> Score = 1.039714, --> Hotel Casa Cartagena Boutique Hotel & Spa
Opción 2 --> Score = 0.744439, --> Hotel Wild Rover Cusco
Opción 3 --> Score = 0.668609, --> Hotel Pariwana Hostel Cusco
Opción 4 --> Score = 0.465794, --> Hotel Ecopackers
Opción 5 --> Score = 0.419364, --> Hotel LOKI Cusco


In [35]:
for i in range(0,600,10):
    #recomendations = find_top5_recomendet(y_user_based, valor_buscado)
    #recomendations = find_top5_recomendet(y_user_based, valor_buscado)
    print(i)
    recomendations = find_top5_recomendet(y_user_based_svd, i+1)
    print()

0
Top 5 hoteles recomendados o similares a: Hotel Artsy Fartsy
**********
Opción 1 --> Score = 1.839447, --> Hotel Inkaterra La Casona
Opción 2 --> Score = 1.46105, --> Hotel Encantada Casa Boutique Spa
Opción 3 --> Score = 1.265992, --> Hotel Amaru Hostal
Opción 4 --> Score = 1.037089, --> Hotel El Mercado
Opción 5 --> Score = 0.877812, --> Hotel Eco-Hotel B&B Pension Alemana

10
Top 5 hoteles recomendados o similares a: San Francisco Cusco Hotel
**********
Opción 1 --> Score = 0.056247, --> Hotel Palacio Nazarenas, A Belmond Hotel, Cusco
Opción 2 --> Score = 0.039431, --> Hotel Xima Hotels
Opción 3 --> Score = 0.020728, --> Hotel Monasterio, A Belmond Hotel, Cusco
Opción 4 --> Score = 0.00867, --> Hotel Wild Rover Cusco
Opción 5 --> Score = 0.004858, --> Hotel Casa Cartagena Boutique Hotel & Spa

20
Top 5 hoteles recomendados o similares a: Teatro Inka Hostal
**********
Opción 1 --> Score = 1.039714, --> Hotel Casa Cartagena Boutique Hotel & Spa
Opción 2 --> Score = 0.744439, --> Hot