In [8]:
# Import Libraries
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load Dataset
data = pd.read_csv("/content/Tempat-Wisata-Toba-Preprocessing.csv")

In [9]:
# Data Cleaning
data_cleaned = data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
data_cleaned['Reviews'] = data_cleaned['Reviews'].fillna('No review')
data_cleaned = data_cleaned[data_cleaned['Rating'].between(1, 5)]
data_cleaned = data_cleaned.drop_duplicates()
data_cleaned['Nama_tempat_wisata'] = data_cleaned['Nama_tempat_wisata'].str.title()
data_cleaned['Reviews'] = data_cleaned['Reviews'].str.lower()
data_cleaned['Category'] = data_cleaned['Category'].str.title()
data_cleaned['ReviewerId'] = data_cleaned['ReviewerId'].astype(str)

In [10]:
# Group Data by Reviewer and Place
data_grouped = data_cleaned.groupby(['Nama_tempat_wisata', 'ReviewerId'], as_index=False)['Rating'].mean()

In [11]:
# Pivot to Create User-Item Matrix
df = data_grouped.pivot(index='Nama_tempat_wisata', columns='ReviewerId', values='Rating').fillna(0)

In [12]:
# Sparsity Filtering
user_interactions = df.astype(bool).sum(axis=1)
df_filtered = df.loc[user_interactions[user_interactions >= 5].index]

place_reviews = df_filtered.astype(bool).sum(axis=0)
df_filtered = df_filtered.loc[:, place_reviews[place_reviews >= 10].index]

# Check if the filtered dataset is empty
if df_filtered.empty:
    raise ValueError("Filtered dataset is empty. Adjust the sparsity filter criteria.")
else:
    print(f"Filtered data shape: {df_filtered.shape}")

Filtered data shape: (97, 188)


In [13]:
# Normalize Data
df_normalized = df_filtered.subtract(df_filtered.mean(axis=1), axis=0).fillna(0)

In [14]:
# Split Data into Train, Validation, and Test Sets
train_data, temp_data = train_test_split(df_normalized, test_size=0.3, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print(f"Train Data Shape: {train_data.shape}")
print(f"Validation Data Shape: {validation_data.shape}")
print(f"Test Data Shape: {test_data.shape}")

Train Data Shape: (67, 188)
Validation Data Shape: (15, 188)
Test Data Shape: (15, 188)


In [15]:
# Train KNN Model
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(train_data)

In [16]:
# Function to calculate Precision, Recall, and MAP
def precision_at_k(recommended_items, actual_items, k=5):
    recommended_set = set(recommended_items[:k])
    actual_set = set(actual_items)
    true_positives = len(recommended_set.intersection(actual_set))
    return true_positives / k if k > 0 else 0

def recall_at_k(recommended_items, actual_items, k=5):
    recommended_set = set(recommended_items[:k])
    actual_set = set(actual_items)
    true_positives = len(recommended_set.intersection(actual_set))
    return true_positives / len(actual_set) if len(actual_set) > 0 else 0

def mean_average_precision(recommended_items, actual_items, k=5):
    if len(actual_items) == 0:
        return 0.0
    average_precision = 0.0
    for i in range(min(k , len(recommended_items))):
        if recommended_items[i] in actual_items:
            average_precision += precision_at_k(recommended_items, actual_items, i + 1)
    return average_precision / min(k, len(recommended_items)) if min(k, len(recommended_items)) > 0 else 0

In [17]:
# Recommendation Function
def recommend_items_knn(df, reviewer_id, n_neighbors=5, n_recommendations=5):
    if reviewer_id not in df.columns:
        raise ValueError(f"ReviewerId {reviewer_id} not found in data.")

    user_ratings = df[reviewer_id]
    rated_items = user_ratings[user_ratings > 0].index.tolist()
    unrated_items = df.index.difference(rated_items)

    recommendations = []
    for item in unrated_items:
        try:
            item_vector = pd.DataFrame([df.loc[item]], columns=df.columns)
            distances, indices = knn_model.kneighbors(item_vector, n_neighbors=n_neighbors)

            similar_items = df.index[indices.flatten()]
            similar_scores = distances.flatten()

            weighted_sum = 0
            similarity_sum = 0
            for sim_item, score in zip(similar_items, similar_scores):
                if sim_item in rated_items:
                    rating = df.loc[sim_item, reviewer_id]
                    weighted_sum += rating * (1 - score)
                    similarity_sum += (1 - score)

            predicted_rating = weighted_sum / similarity_sum if similarity_sum > 0 else global_avg
            recommendations.append((item, predicted_rating))
        except:
            continue

    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    return recommendations[:n_recommendations]

# Test Recommendation Function
reviewer_id = df_filtered.columns[0]  # Ambil Reviewer ID pertama sebagai contoh
recommended_items = recommend_items_knn(df_filtered, reviewer_id, n_neighbors=5, n_recommendations=5)

print("Recommendations for Reviewer:", reviewer_id)
for item, rating in recommended_items:
    print(f"{item}, Rating: {rating:.2f}")

Recommendations for Reviewer: 1.00016e+20
Air Terjun Siboruon, Rating: 5.00
Batu Gantung, Rating: 5.00
Bukit Senyum Motung, Rating: 5.00
Desa Meat, Rating: 5.00
Istana Kaldera Unesco Geopark Danau Toba, Rating: 5.00


In [None]:
# Validation: Predict Ratings and Evaluate
global_avg = train_data[train_data > 0].mean().mean()

predicted_ratings = []
actual_ratings = []
recommended_items_list = []

for item in validation_data.index:
    for reviewer_id in validation_data.columns:
        actual_rating = validation_data.loc[item, reviewer_id]
        if actual_rating > 0:
            try:
                item_vector = pd.DataFrame([train_data.loc[item]], columns=train_data.columns)
                distances, indices = knn_model.kneighbors(item_vector, n_neighbors=5)

                similar_items = train_data.index[indices.flatten()]
                similar_scores = distances.flatten()

                weighted_sum = 0
                similarity_sum = 0
                for sim_item, score in zip(similar_items, similar_scores):
                    if reviewer_id in train_data.columns and sim_item in train_data.index:
                        rating = train_data.loc[sim_item, reviewer_id]
                        weighted_sum += rating * (1 - score)
                        similarity_sum += (1 - score)

                predicted_rating = weighted_sum / similarity_sum if similarity_sum > 0 else global_avg
                predicted_rating = max(1, min(5, predicted_rating))
            except:
                predicted_rating = global_avg

            predicted_ratings.append(predicted_rating)
            actual_ratings.append(actual_rating)

            # Collect recommended items for precision and recall calculation
            recommended_items = [item for item, _ in recommend_items_knn(df_filtered, reviewer_id, n_neighbors=5, n_recommendations=5)]
            recommended_items_list.append(recommended_items)

# Calculate Validation Metrics
mse_val = mean_squared_error(actual_ratings, predicted_ratings)
rmse_val = np.sqrt(mse_val)
mae_val = mean_absolute_error(actual_ratings, predicted_ratings)

# Calculate Precision, Recall, and MAP
precision_scores = [precision_at_k(recommended_items, actual_ratings, k=5) for recommended_items, actual_rating in zip(recommended_items_list, actual_ratings)]
recall_scores = [recall_at_k(recommended_items, actual_ratings, k=5) for recommended_items, actual_rating in zip(recommended_items_list, actual_ratings)]
map_score = np.mean([mean_average_precision(recommended_items, actual_ratings, k=5) for recommended_items, actual_rating in zip(recommended_items_list, actual_ratings)])

print(f"Validation RMSE: {rmse_val:.2f}")
print(f"Validation MAE: {mae_val:.2f}")
print(f"Validation Precision@5: {np.mean(precision_scores):.2f}")
print(f"Validation Recall@5: {np.mean(recall_scores):.2f}")
print(f"Validation MAP@5: {map_score:.2f}")

# Test: Final Evaluation
predicted_ratings_test = []
actual_ratings_test = []
recommended_items_list_test = []

for item in test_data.index:
    for reviewer_id in test_data.columns:
        actual_rating = test_data.loc[item, reviewer_id]
        if actual_rating > 0:
            try:
                item_vector = pd.DataFrame([train_data.loc[item]], columns=train_data.columns)
                distances, indices = knn_model.kneighbors(item_vector, n_neighbors=5)

                similar_items = train_data.index[indices.flatten()]
                similar_scores = distances.flatten()

                weighted_sum = 0
                similarity_sum = 0
                for sim_item, score in zip(similar_items, similar_scores):
                    if reviewer_id in train_data.columns and sim_item in train_data.index:
                        rating = train_data.loc[sim_item, reviewer_id]
                        weighted_sum += rating * (1 - score)
                        similarity_sum += (1 - score)

                predicted_rating = weighted_sum / similarity_sum if similarity_sum > 0 else global_avg
                predicted_rating = max(1, min(5, predicted_rating))
            except:
                predicted_rating = global_avg

            predicted_ratings_test.append(predicted_rating)
            actual_ratings_test.append(actual_rating)

            # Collect recommended items for precision and recall calculation
            recommended_items = [item for item, _ in recommend_items_knn(df_filtered, reviewer_id, n_neighbors=5, n_recommendations=5)]
            recommended_items_list_test.append(recommended_items)

# Calculate Test Metrics
mse_test = mean_squared_error(actual_ratings_test, predicted_ratings_test)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(actual_ratings_test, predicted_ratings_test)

# Calculate Precision, Recall, and MAP for Test Set
precision_scores_test = []
recall_scores_test = []
map_scores_test = []

# Get a list of all actual items the user interacted with
# Use 'Nama_tempat_wisata' instead of 'item' to get the places the user reviewed
all_actual_items = data_cleaned[data_cleaned['ReviewerId'] == reviewer_id]['Nama_tempat_wisata'].tolist()

for recommended_items in recommended_items_list_test:
    # Use all_actual_items instead of actual_rating for precision, recall, and MAP calculation
    precision = precision_at_k(recommended_items, all_actual_items, k=5)
    recall = recall_at_k(recommended_items, all_actual_items, k=5)
    map_score = mean_average_precision(recommended_items, all_actual_items, k=5)

    precision_scores_test.append(precision)
    recall_scores_test.append(recall)
    map_scores_test.append(map_score)

print(f"Test RMSE: {rmse_test:.2f}")
print(f"Test MAE: {mae_test:.2f}")
print(f"Test Precision@5: {np.mean(precision_scores_test):.2f}")
print(f"Test Recall@5: {np.mean(recall_scores_test):.2f}")
print(f"Test MAP@5: {np.mean(map_scores_test):.2f}")

Validation RMSE: 0.95
Validation MAE: 0.69
Validation Precision@5: 0.00
Validation Recall@5: 0.00
Validation MAP@5: 0.00
