# Section 1 | Import Libraries and Load Data

In [1]:
# Data processing
import pandas as pd
import numpy as np
import scipy.stats

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Similarity and split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv("data\Tempat-Wisata-Toba-Preprocessing.csv")

In [3]:
data.shape

(43226, 9)

In [4]:
data.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'address', 'PlaceID',
       'Nama_tempat_wisata', 'Category', 'ReviewerId', 'Rating', 'Reviews'],
      dtype='object')

In [5]:
data = data.rename(columns={'ReviewerId': 'user_id', 'PlaceID': 'item_id', 'Rating': 'rating'})
# data['user_id'] = data['user_id'].astype(str)
# data['item_id'] = data['item_id'].astype(str)
data['rating'] = data['rating'].astype(float)
data.dropna(inplace=True)
data.head(5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,address,item_id,Nama_tempat_wisata,Category,user_id,rating,Reviews
1,1,1,"Jl. Sibola Hotang, Sibola Hotangsas, Kec. Bali...",0,PASIR PUTIH LUMBAN BULBUL,Wisata Bahari,1.11909e+20,5.0,bagus
3,3,3,"Jl. Sibola Hotang, Sibola Hotangsas, Kec. Bali...",0,PASIR PUTIH LUMBAN BULBUL,Wisata Bahari,1.13072e+20,5.0,sangat menyenagkan
4,4,4,"Jl. Sibola Hotang, Sibola Hotangsas, Kec. Bali...",0,PASIR PUTIH LUMBAN BULBUL,Wisata Bahari,1.06173e+20,5.0,bebas foto dimana aja cuma 2k
6,6,6,"Jl. Sibola Hotang, Sibola Hotangsas, Kec. Bali...",0,PASIR PUTIH LUMBAN BULBUL,Wisata Bahari,1.14239e+20,5.0,amazing pengen kesini lagi
8,8,8,"Jl. Sibola Hotang, Sibola Hotangsas, Kec. Bali...",0,PASIR PUTIH LUMBAN BULBUL,Wisata Bahari,1.04743e+20,3.0,jalan menuju lokasi perlu diperhatikan oleh pe...


In [6]:
data.shape

(22166, 9)

# Section 2 | Data Splitting

In [7]:
# Step 1: Split into 85% training+validation and 15% test
train_val_data, test_data = train_test_split(data, test_size=0.15, random_state=42)
# Step 2: Define validation size as 17% of train+validation, rounded up
val_size = int(len(train_val_data) * 0.17) + 1
# Step 3: Split 85% data into training (about 70% of original) and validation (about 15% of original)
train_data, val_data = train_test_split(train_val_data, test_size=val_size, random_state=42)
# Check sizes
print(f"Train data size: {train_data.shape}")
print(f"Validation data size: {val_data.shape}")
print(f"Test data size: {test_data.shape}")

Train data size: (15638, 9)
Validation data size: (3203, 9)
Test data size: (3325, 9)


# Section 3 | Create User-Item Matrix

In [8]:
def create_user_item_matrix(data, user_col='user_id', item_col='Nama_tempat_wisata', rating_col='rating'):
    data = data.groupby([user_col, item_col])[rating_col].mean().reset_index()
    return data.pivot(index=user_col, columns=item_col, values=rating_col).fillna(0)

# Apply the function to create matrices for each split
train_matrix = create_user_item_matrix(train_data)
val_matrix = create_user_item_matrix(val_data).reindex(columns=train_matrix.columns, fill_value=0)
test_matrix = create_user_item_matrix(test_data).reindex(columns=train_matrix.columns, fill_value=0)

In [9]:
train_matrix

Nama_tempat_wisata,Aek Rangat Pangururan,Aek Sipangolu Bakkara,Air Terjun Efrata,Air Terjun Janji,Air Terjun Pandumaan,Air Terjun Sampuran,Air Terjun Siboruon,Air Terjun Situmurun,Air Terjun Taman Eden 100,Air Terjun Tombak Pangaribuan,...,Tao Silalahi Hotel,Tarabunga,Tempat Pengasingan Soekarno,The Kaldera,Tombak Sulusulu,Tugu D.I PANJAITAN,Tugu Toga Aritonang,WF Coffee&Resto,Wisata Bukit Gibeon,Wisata Rumah Pohon
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.000030e+20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.000040e+20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.000090e+20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.000100e+20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.000120e+20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1.184390e+20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.184410e+20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.184430e+20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.184440e+20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Section 4 | Define RMSE, MAE, MAP, Precision, & Recall Calculation Function 

In [10]:
def compute_rmse(true_matrix, approx_matrix):
    return np.sqrt(mean_squared_error(true_matrix.ravel(), approx_matrix.ravel()))

In [11]:
def compute_map(true_matrix, pred_matrix, threshold=3):
    map_score = 0
    n_users = true_matrix.shape[0]
    
    for user_idx in range(n_users):
        true_ratings = true_matrix[user_idx] >= threshold  # Relevant items based on threshold
        pred_ratings = pred_matrix[user_idx]
        
        # Sort items by predicted rating (descending)
        sorted_items = np.argsort(pred_ratings)[::-1]
        
        # Precision at each relevant item
        precision_at_k = []
        relevant_items = 0
        for i, item_idx in enumerate(sorted_items):
            if true_ratings[item_idx]:
                relevant_items += 1
                precision_at_k.append(relevant_items / (i + 1))
        
        if precision_at_k:
            map_score += np.mean(precision_at_k)
    
    return map_score / n_users

In [12]:
def compute_precision_all_users(true_matrix, pred_matrix, threshold=3):
    precision_all_users = 0
    n_users = true_matrix.shape[0]
    
    for user_idx in range(n_users):
        true_ratings = true_matrix[user_idx] >= threshold  # Relevant items based on threshold
        pred_ratings = pred_matrix[user_idx] >= threshold  # Predicted items based on threshold
        
        # Compute Precision: the proportion of relevant items among predicted items
        relevant_predicted = np.sum(true_ratings & pred_ratings)  # True positives
        predicted_items = np.sum(pred_ratings)  # Total predicted items
        
        if predicted_items > 0:
            precision_all_users += relevant_predicted / predicted_items
        else:
            precision_all_users += 0
    
    return precision_all_users / n_users

In [13]:
def compute_recall_all_users(true_matrix, pred_matrix, threshold=3):
    recall_all_users = 0
    n_users = true_matrix.shape[0]
    
    for user_idx in range(n_users):
        true_ratings = true_matrix[user_idx] >= threshold  # Relevant items based on threshold
        pred_ratings = pred_matrix[user_idx] >= threshold  # Predicted items based on threshold
        
        # Compute Recall: the proportion of relevant items that are predicted
        relevant_predicted = np.sum(true_ratings & pred_ratings)  # True positives
        total_relevant_items = np.sum(true_ratings)  # Total relevant items
        
        if total_relevant_items > 0:
            recall_all_users += relevant_predicted / total_relevant_items
        else:
            recall_all_users += 0
    
    return recall_all_users / n_users

In [14]:
from sklearn.metrics import mean_absolute_error

def compute_mae(true_matrix, approx_matrix):
    return mean_absolute_error(true_matrix.ravel(), approx_matrix.ravel())

# Section 5 | UBCF

In [15]:
user_similarity = cosine_similarity(train_matrix.values)
print("User similarity matrix computed.")

User similarity matrix computed.


In [16]:
user_similarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.45083482],
       [0.        , 1.        , 0.        , ..., 0.        , 0.65094455,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.65094455, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.45083482, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

## RSME

In [17]:
train_pred_matrix = np.dot(user_similarity, train_matrix.values) / np.abs(user_similarity).sum(axis=1, keepdims=True)
train_rmse = compute_rmse(train_matrix.values, train_pred_matrix)
print(f"Training RMSE (UBCF): {train_rmse:.4f}")

Training RMSE (UBCF): 0.3477


In [18]:
val_matrix_aligned = val_matrix.reindex(index=train_matrix.index, fill_value=0)

val_pred_matrix = np.dot(user_similarity, val_matrix_aligned.values) / np.abs(user_similarity).sum(axis=1, keepdims=True)

val_rmse = compute_rmse(val_matrix_aligned.values, val_pred_matrix)
print(f"Validation RMSE (UBCF): {val_rmse:.4f}")

Validation RMSE (UBCF): 0.2275


In [19]:
test_matrix_aligned = test_matrix.reindex(index=train_matrix.index, fill_value=0)

test_pred_matrix = np.dot(user_similarity, test_matrix_aligned.values) / np.abs(user_similarity).sum(axis=1, keepdims=True)

test_rmse = compute_rmse(test_matrix_aligned.values, test_pred_matrix)
print(f"Test RMSE (UBCF): {test_rmse:.4f}")

Test RMSE (UBCF): 0.2276


## MAE

In [20]:
train_mae = compute_mae(train_matrix.values, train_pred_matrix)
val_mae = compute_mae(val_matrix_aligned.values, val_pred_matrix)
test_mae = compute_mae(test_matrix_aligned.values, test_pred_matrix)

In [21]:
print(f"Training MAE (UBCF): {train_mae:.4f}")
print(f"Validation MAE (UBCF): {val_mae:.4f}")
print(f"Test MAE (UBCF): {test_mae:.4f}")

Training MAE (UBCF): 0.0845
Validation MAE (UBCF): 0.0237
Test MAE (UBCF): 0.0237


## MAP

In [22]:
train_map = compute_map(train_matrix.values, train_pred_matrix)
val_map = compute_map(val_matrix_aligned.values, val_pred_matrix)
test_map = compute_map(test_matrix_aligned.values, test_pred_matrix)

In [23]:
print(f"Training MAP (UBCF): {train_map:.4f}")
print(f"Validation MAP (UBCF): {val_map:.4f}")
print(f"Test MAP (UBCF): {test_map:.4f}")

Training MAP (UBCF): 0.9646
Validation MAP (UBCF): 0.0529
Test MAP (UBCF): 0.0534


## Precision

In [24]:
train_precision = compute_precision_all_users(train_matrix.values, train_pred_matrix)
val_precision = compute_precision_all_users(val_matrix_aligned.values, val_pred_matrix)
test_precision = compute_precision_all_users(test_matrix_aligned.values, test_pred_matrix)

In [25]:
print(f"Training Precision (UBCF): {train_precision:.4f}")
print(f"Validation Precision (UBCF): {val_precision:.4f}")
print(f"Test Precision (UBCF): {test_precision:.4f}")

Training Precision (UBCF): 0.7260
Validation Precision (UBCF): 0.0000
Test Precision (UBCF): 0.0001


## Recall

In [26]:
train_recall = compute_recall_all_users(train_matrix.values, train_pred_matrix)
val_recall = compute_recall_all_users(val_matrix_aligned.values, val_pred_matrix)
test_recall = compute_recall_all_users(test_matrix_aligned.values, test_pred_matrix)

In [27]:
print(f"Training Recall (UBCF): {train_recall:.4f}")
print(f"Validation Recall (UBCF): {val_recall:.4f}")
print(f"Test Recall (UBCF): {test_recall:.4f}")

Training Recall (UBCF): 0.6483
Validation Recall (UBCF): 0.0000
Test Recall (UBCF): 0.0001


# Section 6 | SVD

In [28]:
from sklearn.decomposition import TruncatedSVD

In [29]:
n_factors = 20
svd = TruncatedSVD(n_components=n_factors, random_state=42)
U_train = svd.fit_transform(train_matrix)
Sigma = svd.components_

## RMSE

In [30]:
train_approx_matrix = np.dot(U_train, Sigma)
train_rmse = compute_rmse(train_matrix.values, train_approx_matrix)
print(f"Training RMSE: {train_rmse:.4f}")

Training RMSE: 0.3680


In [31]:
U_val = np.dot(val_matrix.values, Sigma.T)
val_approx_matrix = np.dot(U_val, Sigma)
val_rmse = compute_rmse(val_matrix.values, val_approx_matrix)
print(f"Validation RMSE: {val_rmse:.4f}")

Validation RMSE: 0.3046


In [32]:
U_test = np.dot(test_matrix.values, Sigma.T)
test_approx_matrix = np.dot(U_test, Sigma)
test_rmse = compute_rmse(test_matrix.values, test_approx_matrix)
print(f"Test RMSE: {test_rmse:.4f}")

Test RMSE: 0.3026


## MAE

In [33]:
train_mae_svd = compute_mae(train_matrix.values, train_approx_matrix)
val_mae_svd = compute_mae(val_matrix.values, val_approx_matrix)
test_mae_svd = compute_mae(test_matrix.values, test_approx_matrix)

In [34]:
print(f"Training MAE (SVD): {train_mae_svd:.4f}")
print(f"Validation MAE (SVD): {val_mae_svd:.4f}")
print(f"Test MAE (SVD): {test_mae_svd:.4f}")

Training MAE (SVD): 0.0583
Validation MAE (SVD): 0.0414
Test MAE (SVD): 0.0416


## MAP

In [35]:
train_map_svd = compute_map(train_matrix.values, train_approx_matrix)
val_map_svd = compute_map(val_matrix.values, val_approx_matrix)
test_map_svd = compute_map(test_matrix.values, test_approx_matrix)

In [36]:
print(f"Training MAP (SVD): {train_map_svd:.4f}")
print(f"Validation MAP (SVD): {val_map_svd:.4f}")
print(f"Test MAP (SVD): {test_map_svd:.4f}")

Training MAP (SVD): 0.6777
Validation MAP (SVD): 0.6553
Test MAP (SVD): 0.6497


## Precison

In [37]:
train_precision_svd = compute_precision_all_users(train_matrix.values, train_approx_matrix)
val_precision_svd = compute_precision_all_users(val_matrix.values, val_approx_matrix)
test_precision_svd = compute_precision_all_users(test_matrix.values, test_approx_matrix)

In [38]:
print(f"Training Precision (SVD): {train_precision_svd:.4f}")
print(f"Validation Precision (SVD): {val_precision_svd:.4f}")
print(f"Test Precision (SVD): {test_precision_svd:.4f}")

Training Precision (SVD): 0.7022
Validation Precision (SVD): 0.6217
Test Precision (SVD): 0.6089


## Recall

In [39]:
train_recall_svd = compute_recall_all_users(train_matrix.values, train_approx_matrix)
val_recall_svd = compute_recall_all_users(val_matrix.values, val_approx_matrix)
test_recall_svd = compute_recall_all_users(test_matrix.values, test_approx_matrix)

In [40]:
print(f"Training Recall (SVD): {train_recall_svd:.4f}")
print(f"Validation Recall (SVD): {val_recall_svd:.4f}")
print(f"Test Recall (SVD): {test_recall_svd:.4f}")

Training Recall (SVD): 0.5833
Validation Recall (SVD): 0.5817
Test Recall (SVD): 0.5740


# Section 7 | BERT

# Visualisasi