In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import random
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [21]:
# Load the cafe dataset
df_cafe = pd.read_csv('dataset/New_CafeJakarta.csv')

In [22]:
df_cafe.columns

Index(['cafe_id', 'name', 'description', 'website', 'featured_image',
       'main_category', 'categories', 'workday_timing', 'closed_on', 'phone',
       'address', 'review_keywords', 'link', 'feature', 'outdoor', 'Indoor',
       'RetroVintage', 'MinimalisCafe', 'IndustrialCafe', 'ModernCafe',
       'ArtCafe', 'PetCafe', 'BooksCafe', '24HoursCafe', 'MeetingCafe',
       'StudyCafe', 'goodViews', 'FamilyCafe', 'CountryFood', 'Smoking',
       'NonSmoking', 'Coffee', 'NonCoffee', 'GardenCafe'],
      dtype='object')

In [23]:
df_cafe = df_cafe.drop(['outdoor', 'Indoor',
       'RetroVintage', 'MinimalisCafe', 'IndustrialCafe', 'ModernCafe',
       'ArtCafe', 'PetCafe', 'BooksCafe', '24HoursCafe', 'MeetingCafe',
       'StudyCafe', 'goodViews', 'FamilyCafe', 'CountryFood', 'Smoking',
       'NonSmoking', 'Coffee', 'NonCoffee', 'GardenCafe'], axis=1)

In [24]:
df_cafe = df_cafe.rename(columns={"name":"cafe_name"})

In [25]:
df_cafe.columns

Index(['cafe_id', 'cafe_name', 'description', 'website', 'featured_image',
       'main_category', 'categories', 'workday_timing', 'closed_on', 'phone',
       'address', 'review_keywords', 'link', 'feature'],
      dtype='object')

In [26]:
# df_cafe["description"].fillna("Tidak tersedia", inplace=True)
# df_cafe["website"].fillna("Tidak tersedia", inplace=True)

print(df_cafe["feature"].isnull().sum())

# cafe_df["description"] = cafe_df["description"].replace("NaN", "Tidak ada deskripsi")
# df_cafe.info()

0


In [27]:
def print_cafe_feature(index):
    example = df_cafe[df_cafe.index == index][['cafe_name', 'address', 'feature']].values[0]
    if len(example) > 0:
        print('Cafe name:', example[0])
        print('Address:', example[1])
        print('Feature:', example[2])

In [28]:
print_cafe_feature(0)

Cafe name: Bakoel Koffie Cikini
Address: Bakoel Koffie Cikini, Jl. Cikini Raya No.25, RT.16/RW.1, Cikini, Kec. Menteng, Kota Jakarta Pusat, Daerah Khusus Ibukota Jakarta 10330
Feature: outdoor, indoor, vintage, art cafe, meeting cafe, study cafe, smoking, nonsmoking, coffee, noncoffee


In [29]:
print_cafe_feature(11)

Cafe name: Hause Rooftop
Address: Hause Rooftop, Md Place, Jl. Setia Budi Selatan No.7, RT.5/RW.1, Kuningan, Setia Budi, Kecamatan Setiabudi, Kota Jakarta Selatan, Daerah Khusus Ibukota Jakarta 12910
Feature: outdoor, indoor, modern cafe, meeting cafe, study cafe, good views, family cafe, smoking, nonsmoking, coffee, noncoffee, garden cafe


In [30]:
print_cafe_feature(40)

Cafe name: Blumchen Coffee
Address: Blumchen Coffee, Fairground SCBD Lot 14, Jl. Jend. Sudirman kav 52-53 No.Kav 52-53, RT.5/RW.3, Senayan, Kec. Kby. Baru, Kota Jakarta Selatan, Daerah Khusus Ibukota Jakarta 12190
Feature: indoor, vintage, art cafe, meeting cafe, family cafe, nonsmoking, coffee, noncoffee


In [32]:
df_cafe.set_index('cafe_name', inplace=True)
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(df_cafe['feature'])
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim

array([[1.        , 0.48677596, 0.21031335, ..., 0.20109829, 0.348863  ,
        0.34399495],
       [0.48677596, 1.        , 0.36182558, ..., 0.24510972, 0.23150036,
        0.37041063],
       [0.21031335, 0.36182558, 1.        , ..., 0.08057067, 0.08299038,
        0.20061869],
       ...,
       [0.20109829, 0.24510972, 0.08057067, ..., 1.        , 0.49090403,
        0.08883776],
       [0.348863  , 0.23150036, 0.08299038, ..., 0.49090403, 1.        ,
        0.17401665],
       [0.34399495, 0.37041063, 0.20061869, ..., 0.08883776, 0.17401665,
        1.        ]])

In [33]:
# Set index utama di kolom 'name'
indices = pd.Series(df_cafe.index)
indices[:50]

0                     Bakoel Koffie Cikini
1                   Anomali Coffee Menteng
2               Lucky Cat Coffee & Kitchen
3                           Anomali Coffee
4                  Giyanti Coffee Roastery
5                                 The Cafe
6                             Arborea Cafe
7                            Walking Drums
8                       Shisha Cafe Kemang
9     Monolog Coffee Company Plaza Senayan
10                           Kedai Tjikini
11                           Hause Rooftop
12                             Kopi Kalyan
13                    Langit Seduh Rooftop
14                    Saudagar Kopi Sabang
15                    1/15 Coffee, Menteng
16                                Goedkoop
17               Fami Cafe grand indonesia
18                    Pison Coffee Jakarta
19                    One Fifteenth Coffee
20                             Crematology
21                         Djournal Coffee
22                               Starbucks
23         

In [39]:
def recommend_by_name(cafe_name, cos_sim=cos_sim):
    recommended_cafe = []

    # Mengambil nama kafe berdasarkan variabel indices
    idx = indices[indices == cafe_name].index[0]

    # Membuat series berdasarkan skor kesamaan
    score_series = pd.Series(cos_sim[idx]).sort_values(ascending=False)

    # Mengambil index dan dibuat 10 baris rekomendasi terbaik
    top_10_indexes = list(score_series.iloc[1:11].index)

    for i in top_10_indexes:
        recommended_cafe.append(list(df_cafe.index)[i])

    return recommended_cafe


In [49]:
recommend_by_name("The Cat Cabin")

['Coffee D. Lites',
 '1947 Cafe Jakarta',
 'Kopi Cat Cafe by Groovy - Kemang',
 'One Fifteenth Coffee',
 'Super Cup Coffee Shop',
 '127 Cafe - Kosenda Hotel',
 "The People's Cafe Rajawali",
 'Cecemuwe Cafe and Space - Senayan',
 'Fami Cafe grand indonesia',
 'One Eighty Cafe']

In [75]:
import Levenshtein

def recommend_by_feature(query, cos_sim=cos_sim):
    recommended_cafe = []

    # Mencari indeks berdasarkan setiap kata kunci dalam kolom 'feature'
    matching_indices = set()
    
    for keyword in query.split():
        # Menentukan batas jarak Levenshtein yang diterima
        similarity_threshold = 0.4  # Sesuaikan sesuai kebutuhan

        # Memfilter kata kunci berdasarkan jarak Levenshtein
        similar_keywords = [word for word in df_cafe['feature'].unique() if Levenshtein.ratio(keyword, word) >= similarity_threshold]
        
        # Mencari indeks kafe yang memiliki setidaknya satu fitur yang cocok dengan kata kunci
        matching_indices.update(df_cafe[df_cafe['feature'].apply(lambda x: any(keyword in x for keyword in similar_keywords))].index)

    if not matching_indices:
        print(f"Tidak ada kafe yang cocok dengan kata kunci '{query}' dalam kolom 'feature'.")
        return recommended_cafe

    # Membuat series berdasarkan skor kesamaan
    score_series = pd.Series(cos_sim[list(matching_indices)]).sort_values(ascending=False)

    # Mengambil index dan dibuat 10 baris rekomendasi terbaik
    top_10_indexes = list(score_series.iloc[1:11].index)

    for i in top_10_indexes:
        recommended_cafe.append(list(df_cafe.index)[i])

    return recommended_cafe


In [57]:
df_cafe["feature"].head()

cafe_name
Bakoel Koffie Cikini          outdoor, indoor, vintage, art cafe, meeting ca...
Anomali Coffee Menteng        outdoor, indoor, minimalis cafe, modern cafe, ...
Lucky Cat Coffee & Kitchen    outdoor, indoor, minimalis cafe, 24 hours cafe...
Anomali Coffee                outdoor, indoor, industrial cafe, meeting cafe...
Giyanti Coffee Roastery       outdoor, indoor, minimalis cafe, art cafe, mee...
Name: feature, dtype: object

In [76]:
recommend_by_feature('outdoor')

Tidak ada kafe yang cocok dengan kata kunci 'outdoor' dalam kolom 'feature'.


[]