# Content-Based Filtering: Rekomendasi Perusahaan Berdasarkan Modul Pembelajaran

In [51]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import re
import random

# Location of the raw CSV (company name, address, learning module) on GitHub.
DATA_URL = (
    "https://raw.githubusercontent.com/wulannurafifah/Content-Based-Recomendation/"
    "main/Data%20Stupen%20-%20Sheet1%20(1).csv"
)

# Load the dataset and preview the first rows.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,Nama Perusahaan,Alamat,Modul Pembelajaran
0,PT Orbit Ventura Indonesia,"Veteran RI Building 15th Floor Unit Z15-002, P...",Sasaran Pengembangan Skill Coding
1,Wulan,Pemalang,Text Mining
2,Wulan,Pemalang,Machine Learning
3,PT Nurul Fikri Cipta Inovasi,Jl. Situ Indah No.116 RT. 006 RW. 010 Kel. Tug...,UI/UX


## 1. Ikhtisar

In [52]:
df.describe()

Unnamed: 0,Nama Perusahaan,Alamat,Modul Pembelajaran
count,4,4,4
unique,3,3,4
top,Wulan,Pemalang,Sasaran Pengembangan Skill Coding
freq,2,2,1


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Nama Perusahaan     4 non-null      object
 1   Alamat              4 non-null      object
 2   Modul Pembelajaran  4 non-null      object
dtypes: object(3)
memory usage: 224.0+ bytes


## 2. Deskripsi Modul Pembelajaran (Sebelum Preprocessing)

In [58]:
def print_description(index, frame=None):
    """Print the learning module, company name and address of one row.

    Parameters
    ----------
    index : label
        Index label of the row to show.
    frame : pandas.DataFrame, optional
        Table to read from. It must contain the columns 'Modul Pembelajaran',
        'Nama Perusahaan' and 'Alamat'. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The three values are printed. Nothing is printed when no row carries
        the requested index label.
    """
    data = df if frame is None else frame
    rows = data.loc[data.index == index, ['Modul Pembelajaran', 'Nama Perusahaan', 'Alamat']]

    # Check for a match *before* taking the first row; indexing an empty
    # result would raise IndexError.
    if len(rows) == 0:
        return

    example = rows.values[0]
    print(example[0])
    print('Nama:', example[1])
    print('Alamat:', example[2])

In [59]:
print_description(1)

Text Mining
Nama: Wulan
Alamat: Pemalang


In [61]:
print_description(3)

UI/UX
Nama: PT Nurul Fikri Cipta Inovasi
Alamat: Jl. Situ Indah No.116 RT. 006 RW. 010 Kel. Tugu, Kec. Cimanggis, Kota Depok, Jawa Barat


In [62]:
print_description(1)

Text Mining
Nama: Wulan
Alamat: Pemalang


## 3. Text Preprocessing

In [63]:
# Download the NLTK stop-word corpus (network access required on first run).
# NOTE(review): stop-word filtering is not active in this notebook's cleaning
# step, so this download appears to be unused - confirm before removing.
import nltk
nltk.download('stopwords')
# Punctuation that separates words: each occurrence is replaced by a space.
# Raw strings are used so that `\[`, `\]` and `\|` reach the regex engine
# unchanged instead of being parsed as (invalid) string escape sequences.
clean_spcl = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase letter, space, '#', '+' or '_' is removed.
clean_symbol = re.compile(r'[^0-9a-z #+_]')
#stopworda = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a piece of text for vectorisation.

    The input string is lowercased, every character matched by ``clean_spcl``
    is turned into a space, and every character matched by ``clean_symbol``
    is dropped. Stop words are not removed.

    text: a string

    return: the normalised string
    """
    lowered = text.lower()
    spaced = clean_spcl.sub(' ', lowered)
    return clean_symbol.sub('', spaced)
    
# Store the normalised learning-module text next to the raw column.
cleaned_modules = df['Modul Pembelajaran'].apply(clean_text)
df['modulpembelajaran_clean'] = cleaned_modules

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [64]:
df.head()

Unnamed: 0,Nama Perusahaan,Alamat,Modul Pembelajaran,modulpembelajaran_clean
0,PT Orbit Ventura Indonesia,"Veteran RI Building 15th Floor Unit Z15-002, P...",Sasaran Pengembangan Skill Coding,sasaran pengembangan skill coding
1,Wulan,Pemalang,Text Mining,text mining
2,Wulan,Pemalang,Machine Learning,machine learning
3,PT Nurul Fikri Cipta Inovasi,Jl. Situ Indah No.116 RT. 006 RW. 010 Kel. Tug...,UI/UX,ui ux


## 4. Deskripsi Modul Pembelajaran (Setelah Preprocessing)

In [65]:
# Second description view (after preprocessing)
def print_description_clean(index, frame=None):
    """Print the cleaned learning module, company name and address of one row.

    Parameters
    ----------
    index : label
        Index label of the row to show.
    frame : pandas.DataFrame, optional
        Table to read from. It must contain the columns
        'modulpembelajaran_clean', 'Nama Perusahaan' and 'Alamat'.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The three values are printed. Nothing is printed when no row carries
        the requested index label.
    """
    data = df if frame is None else frame
    rows = data.loc[data.index == index, ['modulpembelajaran_clean', 'Nama Perusahaan', 'Alamat']]

    # Check for a match *before* taking the first row; indexing an empty
    # result would raise IndexError.
    if len(rows) == 0:
        return

    example = rows.values[0]
    print(example[0])
    print('Nama:', example[1])
    print('Alamat:', example[2])

In [66]:
print_description_clean(1)

text mining
Nama: Wulan
Alamat: Pemalang


In [68]:
print_description_clean(2)

machine learning
Nama: Wulan
Alamat: Pemalang


In [70]:
print_description(1)

Text Mining
Nama: Wulan
Alamat: Pemalang


## 5. TF-IDF & Cosine Similarity

In [72]:
# Index the rows by company name so results can be reported by name.
# The guard keeps the cell re-runnable: once the column has become the index
# it no longer exists as a column, and an unconditional set_index would raise KeyError.
if df.index.name != 'Nama Perusahaan':
    df = df.set_index('Nama Perusahaan')

# TF-IDF over word 1- to 3-grams of the cleaned module text.
# min_df=1 keeps every term; scikit-learn's parameter validation rejects an
# integer min_df of 0 (an integer value must be >= 1).
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['modulpembelajaran_clean'])

# Pairwise cosine similarity between every pair of rows.
cos_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [73]:
# Lookup table from row position (0..n-1) to the company name stored in df's index.
indices = pd.Series(df.index)
indices.head(50)

0      PT Orbit Ventura Indonesia
1                           Wulan
2                           Wulan
3    PT Nurul Fikri Cipta Inovasi
Name: Nama Perusahaan, dtype: object

## 6. Modelling

In [84]:
def recommendations(name, cos_sim = cos_sim, top_n = 1):
    """Return the company names most similar to ``name``.

    Parameters
    ----------
    name : str
        Company name to look up in the notebook-level ``indices`` series.
        If the name occurs more than once, only its first row is used.
    cos_sim : array-like, optional
        Square similarity matrix whose rows/columns follow the row order of
        ``indices``. Defaults to the matrix that existed when this function
        was defined.
    top_n : int, optional
        Number of recommendations to return (default 1).

    Returns
    -------
    list
        Up to ``top_n`` entries of ``df.index``, ordered from most to least
        similar. The query row itself is never included.

    Raises
    ------
    IndexError
        If ``name`` does not occur in ``indices``.
    """
    # Row position of the first entry matching the requested name.
    matches = indices[indices == name].index
    if len(matches) == 0:
        raise IndexError(f"'{name}' was not found in the list of names")
    idx = matches[0]

    # Similarity of the query row to every row, labelled by row position.
    score_series = pd.Series(cos_sim[idx])

    # Remove the query row explicitly rather than assuming it sorts first:
    # with tied scores the query row is not guaranteed to be at position 0.
    # A stable sort keeps tied rows in their original order, so results are
    # deterministic.
    ranked = score_series.drop(idx).sort_values(ascending = False, kind = 'stable')
    top_positions = ranked.iloc[:top_n].index

    return [df.index[i] for i in top_positions]

## 7. Prediksi

In [85]:
recommendations('Wulan')

['PT Orbit Ventura Indonesia']

In [86]:
recommendations("PT Orbit Ventura Indonesia")

['Wulan']

In [87]:
recommendations("PT Nurul Fikri Cipta Inovasi")

['PT Orbit Ventura Indonesia']

# Selesai...