# Libraries

In [70]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from wordcloud import WordCloud
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm import tqdm
import nltk

# Fetch every NLTK resource this notebook asks for, each exactly once
# (the stopwords corpus used to be requested twice):
#   stopwords        -> stopword removal
#   punkt            -> tokenizing
#   wordnet, omw-1.4 -> lemmatizer data
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' when tokenizing -- confirm against the installed NLTK version.
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Imported after the download so the corpus is available on a fresh machine
from nltk.corpus import stopwords

print('Completed')

Completed


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [71]:
# Load the Indonesian slang -> formal word lookup table from the local CSV copy
# (the original note says the data came from GitHub; this cell only reads the
# file on disk), then preview the first rows.
indo_slang_word = pd.read_csv('datasets/utils/indo_slang_word.csv')
indo_slang_word.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1,wow,elongasi,0,0
1,aminn,amin,1,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0,Birthday yg keberpa kak?,abreviasi,0,0


# Preprocessing methods

## Cleaning


In [72]:
def cleaning(text):
    """Normalize one raw review into lowercase words separated by single spaces.

    Steps, in order: case folding and trimming, removal of HTML-like tags,
    ASCII punctuation replaced by spaces, any remaining non-word symbol
    deleted, digits replaced by spaces, whitespace collapsed and trimmed.

    Args:
        text: The raw review. ``None`` and float NaN (a missing value in a
            pandas column) are treated as an empty review; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    # Missing values must not crash the pipeline (NaN is the only float != itself)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Case folding and trimming
    text = str(text).lower().strip()
    # Drop HTML-like tags first, while '<' and '>' are still present
    text = re.sub('<.*?>', '', text)
    # ASCII punctuation becomes a space so "bersih,nyaman" stays two words
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Whatever non-word symbol is left (e.g. typographic quotes) is deleted
    text = re.sub(r'[^\w\s]', '', text)
    # Digits become spaces so the words around a number are not glued together
    text = re.sub(r'\d', ' ', text)
    # Collapse whitespace runs last, then trim the space a removed leading or
    # trailing digit/punctuation mark may have left behind
    return re.sub(r'\s+', ' ', text).strip()

In [73]:
# Example usage: run the cleaning step on a sample review.
# `modified_sentence` is reassigned by each later demo cell, so the demo cells
# must be executed in order, top to bottom.
original_sentence = "Pelayan tidak ramah dan Tidak bersahabat."
modified_sentence = cleaning(original_sentence)

print("Original Sentence:", original_sentence)
print("Modified Sentence:", modified_sentence)

Original Sentence: Pelayan tidak ramah dan Tidak bersahabat.
Modified Sentence: pelayan tidak ramah dan tidak bersahabat


## Negation Handling

In [74]:
def handle_negation(review):
    """Fuse each standalone "tidak" with the word that follows it.

    The negation and its target become one token with nothing in between
    (no underscore, no space), e.g. "tidak ramah" -> "tidakramah". The match
    is case-sensitive, so the text is expected to be lowercased already.

    Args:
        review: The review text as a single string.

    Returns:
        The review with every "tidak <word>" pair merged into "tidak<word>".
    """
    # \1 re-inserts the captured word directly after "tidak"; the whitespace
    # that separated the two words is part of the match and is dropped.
    return re.sub(r'\btidak\b\s+(\w+)', r'tidak\1', review)

In [75]:
# Example usage: apply negation handling to the sample sentence held in
# `modified_sentence` (set by the previous demo cell) and store the result back.
print("Original Sentence:", modified_sentence)
modified_sentence = handle_negation(modified_sentence)

print("Modified Sentence:", modified_sentence)

Original Sentence: pelayan tidak ramah dan tidak bersahabat
Modified Sentence: pelayan tidakramah dan tidakbersahabat


## Tokenization

In [76]:
from nltk.tokenize import word_tokenize

In [77]:
# Example usage: split the sample sentence into tokens.
# From here on `modified_sentence` holds a list of tokens instead of a string.
print("Original Sentence:", modified_sentence)
modified_sentence = word_tokenize(modified_sentence)

print("Modified Sentence:", modified_sentence)

Original Sentence: pelayan tidakramah dan tidakbersahabat
Modified Sentence: ['pelayan', 'tidakramah', 'dan', 'tidakbersahabat']


## Word Normalization

In [78]:
def replace_slang_word(doc, slang_word):
    """Replace slang tokens with their formal form, in place.

    A token is replaced only when the slang table maps it to exactly one
    distinct formal word; unknown tokens and ambiguous slang entries are
    left as they are. Every token is checked, including the last one.

    Args:
        doc: Mutable list of tokens for one review. It is modified in place.
        slang_word: DataFrame with a ``slang`` column and a ``formal`` column.

    Returns:
        The same ``doc`` list object, after normalization.
    """
    if not doc:
        return doc

    # One pass over the table per document instead of one pass per token
    matches = slang_word[slang_word["slang"].isin(doc)]

    # slang -> set of distinct formal spellings found for it
    formal_by_slang = {}
    for slang, formal in zip(matches["slang"], matches["formal"]):
        formal_by_slang.setdefault(slang, set()).add(formal)

    for index, token in enumerate(doc):
        candidates = formal_by_slang.get(token)
        # Only an unambiguous mapping is safe to apply
        if candidates is not None and len(candidates) == 1:
            doc[index] = next(iter(candidates))
    return doc

In [79]:
# Example usage: normalize slang tokens in the sample token list using the
# `indo_slang_word` table loaded earlier in the notebook.
print("Original Sentence:", modified_sentence)
modified_sentence = replace_slang_word(modified_sentence, indo_slang_word)

print("Modified Sentence:", modified_sentence)

Original Sentence: ['pelayan', 'tidakramah', 'dan', 'tidakbersahabat']
Modified Sentence: ['pelayan', 'tidakramah', 'dan', 'tidakbersahabat']


## Stopword removal

In [80]:
# NLTK's Indonesian stopword list, loaded once and shared by every call below
filtering = stopwords.words('indonesian')
def stopword_removal(review):
    """Return the tokens of `review` that are not Indonesian stopwords.

    Args:
        review: Iterable of tokens for one review.

    Returns:
        A new list with the non-stopword tokens, in their original order.
    """
    return [token for token in review if token not in filtering]

In [81]:
# Example usage: drop Indonesian stopwords from the sample token list.
print("Original Sentence:", modified_sentence)
modified_sentence = stopword_removal(modified_sentence)

print("Modified Sentence:", modified_sentence)

Original Sentence: ['pelayan', 'tidakramah', 'dan', 'tidakbersahabat']
Modified Sentence: ['pelayan', 'tidakramah', 'tidakbersahabat']


## Stemming

In [82]:
# Build the Sastrawi stemmer once; the `stemmer` object is reused by the demo
# cell below and by data_preprocessor.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [83]:
# Example usage: stem every remaining token of the sample with the Sastrawi stemmer.
print("Original Sentence:", modified_sentence)
modified_sentence = [stemmer.stem(word) for word in modified_sentence]

print("Modified Sentence:", modified_sentence)

Original Sentence: ['pelayan', 'tidakramah', 'tidakbersahabat']
Modified Sentence: ['layan', 'tidakramah', 'tidakbersahabat']


## Preprocessor

In [84]:
def data_preprocessor(data, label):
    """Run the full text-preprocessing pipeline on the ``review`` column.

    The steps are cleaning, negation handling, tokenizing, slang
    normalization, stopword removal and stemming; the resulting tokens are
    joined back into one space-separated string per review.

    The input frame is copied first, so the caller's DataFrame is never
    modified and calling this function again on the same raw frame gives
    the same result.

    Args:
        data: DataFrame with a ``review`` column plus the columns
            ``kebersihan``, ``linen``, ``service``, ``Gabungan`` and ``aspek``.
        label: Name of the column whose values become the ``label`` column.

    Returns:
        A new DataFrame holding the processed ``review`` column, the ``label``
        column and any column of ``data`` that is not dropped below.

    Raises:
        KeyError: If ``label`` or one of the dropped columns is missing.
    """
    # Work on a copy: the pipeline must not rewrite the caller's raw data
    data = data.copy()

    # (progress-bar description, function applied to every review), in order
    steps = [
        ("Proses Cleaning", cleaning),
        ("Proses Negation Handling", handle_negation),
        ("Proses Tokenizing", word_tokenize),
        ("Proses Word Normalizing", lambda tokens: replace_slang_word(tokens, indo_slang_word)),
        ("Proses Stopword Removal", stopword_removal),
        ("Proses Stemming", lambda tokens: [stemmer.stem(word) for word in tokens]),
    ]
    for description, step in steps:
        # tqdm.pandas must be called again to change the bar description
        tqdm.pandas(desc=description)
        data['review'] = data['review'].progress_apply(step)

    # Token lists back to plain text
    data['review'] = data['review'].apply(' '.join)

    # Copy the label before the source columns (which may include it) are dropped
    data['label'] = data[label]
    # Drop unused columns
    data = data.drop(columns=["kebersihan", "linen", "service", "Gabungan", "aspek"])

    return data

# Datasets Preprocessing

In [85]:
# Load the positive and negative review files for each of the three aspects
# (kebersihan, service, linen).
kebersihan_positive = pd.read_csv('datasets/kebersihan-positive.csv')
kebersihan_negative = pd.read_csv('datasets/kebersihan-negative.csv')

service_positive = pd.read_csv('datasets/service-positive.csv')
service_negative = pd.read_csv('datasets/service-negative.csv')

linen_positive = pd.read_csv('datasets/linen-positive.csv')
linen_negative = pd.read_csv('datasets/linen-negative.csv')


# Stack positive and negative reviews per aspect.
# pd.concat is called without ignore_index, so each combined frame keeps the
# row labels of its two source frames.
kebersihan_combined = pd.concat([kebersihan_positive, kebersihan_negative])
service_combined = pd.concat([service_positive, service_negative])
linen_combined = pd.concat([linen_positive, linen_negative])

# Tag every row with the aspect its source files belong to
kebersihan_combined['aspek'] = 'kebersihan'
service_combined['aspek'] =  'service'
linen_combined['aspek'] =  'linen'

# Stack the three aspect frames into one dataset
all_data = pd.concat([kebersihan_combined, service_combined, linen_combined])

all_data.head()

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek
0,Tempat nyaman bersih tapi tv terlalu tinggi ti...,pos,neut,neut,posneutneut,kebersihan
1,"hotelnya bersih dan nyaman, tp keyika masuk ka...",pos,neut,neut,posneutneut,kebersihan
2,"Bersih,nyaman & asri. Cuma syg prnh airnya mat...",pos,neut,neut,posneutneut,kebersihan
3,"Kamar bersih dan nyaman, sarapan minimalis, cu...",pos,neut,neut,posneutneut,kebersihan
4,bersih tempatnya. saya suka. cuma air panasnya...,pos,neut,neut,posneutneut,kebersihan


In [86]:
# Save the combined dataset as a new CSV file; index=False keeps the row
# labels out of the file.
all_data.to_csv('datasets/hotel-review.csv', index=False)

## Clustering Datasets

In [87]:
# Read the combined reviews from 'datasets/hotel-review.csv' and preview the
# first 10 rows.
df = pd.read_csv('datasets/hotel-review.csv')
df.head(10)

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek
0,Tempat nyaman bersih tapi tv terlalu tinggi ti...,pos,neut,neut,posneutneut,kebersihan
1,"hotelnya bersih dan nyaman, tp keyika masuk ka...",pos,neut,neut,posneutneut,kebersihan
2,"Bersih,nyaman & asri. Cuma syg prnh airnya mat...",pos,neut,neut,posneutneut,kebersihan
3,"Kamar bersih dan nyaman, sarapan minimalis, cu...",pos,neut,neut,posneutneut,kebersihan
4,bersih tempatnya. saya suka. cuma air panasnya...,pos,neut,neut,posneutneut,kebersihan
5,"Bersih, nyaman, tapi WiFi lola banget",pos,neut,neut,posneutneut,kebersihan
6,"Kamar bagus,rapi,bersih dan wangi tapi sayang ...",pos,neut,neut,posneutneut,kebersihan
7,"bersih, kamar yg nyaman to kurang 1 yaitu WiFi...",pos,neut,neut,posneutneut,kebersihan
8,kamar bersih namun air panas tidak teesedia,pos,neut,neut,posneutneut,kebersihan
9,"haha..keliatan kolam renang besar,gk tau buat ...",pos,neut,neut,posneutneut,kebersihan


In [88]:
# Preprocess all reviews; the 'aspek' column (aspect name) becomes the label.
df_clean = data_preprocessor(df, 'aspek')
df_clean.head()

Proses Cleaning: 100%|██████████| 894/894 [00:00<00:00, 50913.22it/s]
Proses Negation Handling: 100%|██████████| 894/894 [00:00<00:00, 297879.55it/s]
Proses Tokenizing: 100%|██████████| 894/894 [00:00<00:00, 16829.17it/s]
Proses Word Normalizing:   3%|▎         | 26/894 [00:00<00:22, 38.22it/s]

Proses Word Normalizing: 100%|██████████| 894/894 [00:10<00:00, 85.60it/s] 
Proses Stopword Removal: 100%|██████████| 894/894 [00:00<00:00, 16532.23it/s]
Proses Stemming: 100%|██████████| 894/894 [00:40<00:00, 22.07it/s] 


Unnamed: 0,review,label
0,nyaman bersih tv tidakbisa lihat,kebersihan
1,hotel bersih nyaman keyika masuk kamar bau bek...,kebersihan
2,bersih nyaman asri sayang air mati lokasi bkn ...,kebersihan
3,kamar bersih nyaman sarap minimal menu air pan...,kebersihan
4,bersih tempat suka air panas macet kmrn,kebersihan


In [89]:
# Save the preprocessed, aspect-labelled reviews to a CSV file (row labels omitted).
df_clean.to_csv('datasets/cleaned_datasets/clean_clustering_dataset.csv', index=False)

## Aspek Kebersihan

In [90]:
# Preview the combined kebersihan (cleanliness) reviews.
kebersihan_combined.head()

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek
0,Tempat nyaman bersih tapi tv terlalu tinggi ti...,pos,neut,neut,posneutneut,kebersihan
1,"hotelnya bersih dan nyaman, tp keyika masuk ka...",pos,neut,neut,posneutneut,kebersihan
2,"Bersih,nyaman & asri. Cuma syg prnh airnya mat...",pos,neut,neut,posneutneut,kebersihan
3,"Kamar bersih dan nyaman, sarapan minimalis, cu...",pos,neut,neut,posneutneut,kebersihan
4,bersih tempatnya. saya suka. cuma air panasnya...,pos,neut,neut,posneutneut,kebersihan


In [91]:
# Preprocess the kebersihan reviews; the 'kebersihan' column becomes the label.
df_kebersihan_clean = data_preprocessor(kebersihan_combined, 'kebersihan')
df_kebersihan_clean.head()

Proses Cleaning: 100%|██████████| 298/298 [00:00<00:00, 23820.37it/s]
Proses Negation Handling: 100%|██████████| 298/298 [00:00<00:00, 149242.10it/s]
Proses Tokenizing: 100%|██████████| 298/298 [00:00<00:00, 11355.73it/s]
Proses Word Normalizing:   2%|▏         | 5/298 [00:00<00:06, 44.04it/s]

Proses Word Normalizing: 100%|██████████| 298/298 [00:04<00:00, 66.85it/s]
Proses Stopword Removal: 100%|██████████| 298/298 [00:00<00:00, 12417.94it/s]
Proses Stemming: 100%|██████████| 298/298 [00:00<00:00, 31463.09it/s]


Unnamed: 0,review,label
0,nyaman bersih tv tidakbisa lihat,pos
1,hotel bersih nyaman keyika masuk kamar bau bek...,pos
2,bersih nyaman asri sayang air mati lokasi bkn ...,pos
3,kamar bersih nyaman sarap minimal menu air pan...,pos
4,bersih tempat suka air panas macet kmrn,pos


In [92]:
# Save the preprocessed kebersihan reviews to a CSV file (row labels omitted).
df_kebersihan_clean.to_csv('datasets/cleaned_datasets/cleaned-kebersihan-data.csv', index=False)

## Aspek Service

In [93]:
# Preview the combined service reviews.
service_combined.head()

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek
0,"Pelayanan ramah, staf semua ramah dan baik.",neut,neut,pos,neutneutpos,service
1,Pelayanan bagus sangat ramah.,neut,neut,pos,neutneutpos,service
2,"Layanan yang memuaskan, pelayanannya ramah.",neut,neut,pos,neutneutpos,service
3,"Pelayanan ok, tapi...",neut,neut,pos,neutneutpos,service
4,Layanan yang memuaskan bagai di rumah sendiri.,neut,neut,pos,neutneutpos,service


In [94]:
# Preprocess the service reviews; the 'service' column becomes the label.
df_service_clean = data_preprocessor(service_combined, 'service')
df_service_clean.head()

Proses Cleaning: 100%|██████████| 298/298 [00:00<00:00, 42571.61it/s]
Proses Negation Handling: 100%|██████████| 298/298 [00:00<00:00, 297737.64it/s]
Proses Tokenizing: 100%|██████████| 298/298 [00:00<00:00, 14288.85it/s]
Proses Word Normalizing: 100%|██████████| 298/298 [00:03<00:00, 91.81it/s] 
Proses Stopword Removal: 100%|██████████| 298/298 [00:00<00:00, 11867.33it/s]
Proses Stemming: 100%|██████████| 298/298 [00:00<00:00, 33120.53it/s]


Unnamed: 0,review,label
0,layan ramah staf ramah,pos
1,layan bagus ramah,pos
2,layan muas layan ramah,pos
3,layan ok,pos
4,layan muas rumah,pos


In [95]:
# Save the preprocessed service reviews to a CSV file (row labels omitted).
df_service_clean.to_csv('datasets/cleaned_datasets/cleaned-service-data.csv', index=False)

## Aspek Linen

In [96]:
# Preview the combined linen reviews.
linen_combined.head()

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek
0,"Saya senang dengan kualitas tidurnya, bantal-b...",neut,pos,neut,neutposneut,linen
1,"Meskipun ada sedikit kekurangan, namun kamar c...",neut,pos,neut,neutposneut,linen
2,"Kualitas kamarnya bagus, terutama ukuran tempa...",neut,pos,neut,neutposneut,linen
3,Kamar terjangkau dengan fasilitas makanan ring...,neut,pos,neut,neutposneut,linen
4,Kamarnya bagus dan pertama kali menggunakan Ai...,neut,pos,neut,neutposneut,linen


In [97]:
# Preprocess the linen reviews; the 'linen' column becomes the label.
df_linen_clean = data_preprocessor(linen_combined, 'linen')
df_linen_clean.head()

Proses Cleaning: 100%|██████████| 298/298 [00:00<00:00, 42577.41it/s]
Proses Negation Handling: 100%|██████████| 298/298 [00:00<00:00, 297454.21it/s]
Proses Tokenizing: 100%|██████████| 298/298 [00:00<00:00, 16476.66it/s]
Proses Word Normalizing: 100%|██████████| 298/298 [00:03<00:00, 94.12it/s] 
Proses Stopword Removal: 100%|██████████| 298/298 [00:00<00:00, 17598.07it/s]
Proses Stemming: 100%|██████████| 298/298 [00:00<00:00, 38977.85it/s]


Unnamed: 0,review,label
0,senang kualitas tidur bantal bantal lembut tid...,pos
1,kurang kamar luas kasur empuk,pos
2,kualitas kamar bagus ukur tidur fasilitas lengkap,pos
3,kamar jangkau fasilitas makan ringan sarap sen...,pos
4,kamar bagus kali airy muas makan ringan alat t...,pos


In [98]:
# Save the preprocessed linen reviews to a CSV file (row labels omitted).
df_linen_clean.to_csv('datasets/cleaned_datasets/cleaned-linen-data.csv', index=False)