# Libraries

In [50]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
from wordcloud import WordCloud
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm import tqdm
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

# Tokenizing
nltk.download('punkt')
# Stopwords removal
nltk.download('stopwords')
# Lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

print('Completed')

Completed


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\TrisnaWahyudi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [51]:
# get slang word data from github
indo_slang_word = pd.read_csv('datasets/utils/indo_slang_word.csv')
indo_slang_word.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1,wow,elongasi,0,0
1,aminn,amin,1,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0,Birthday yg keberpa kak?,abreviasi,0,0


# Preprocessing methods

## Cleaning


In [52]:
def cleaning(text):
    # Case folding
    text = text.lower()
    # Trim text
    text = text.strip()
    # Remove punctuations, special characters, and double whitespace
    text = re.compile('<.*?>').sub('', text)
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\\s+', ' ', text)  # Corrected line
    # Number removal
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    # Remove number and whitespaces
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text

## Tokenization

In [53]:
from nltk.tokenize import word_tokenize

## Word Normalization

In [54]:
def replace_slang_word(doc,slang_word):
    for index in range(0,len(doc)-1):
        index_slang = slang_word.slang==doc[index]
        formal = list(set(slang_word[index_slang].formal))
        if len(formal)==1:
            doc[index]=formal[0]
    return doc

## Stopword removal

In [55]:
filtering = stopwords.words('indonesian')
def stopword_removal(review):

    x = []
    data = []
    def myFunc(x):
      if x in filtering:
        return False
      else:
        return True

    fit = filter(myFunc, review)
    for x in fit:
      data.append(x)

    return data
  


## Stemming

In [56]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Menerapkan stemming pada setiap kata dalam daftar tokens
tqdm.pandas(desc="Proses Stemming")

## Preprocessor

In [57]:
def data_preprocessor(data, label):
  tqdm.pandas(desc="Proses Cleaning")
  data['review'] = data['review'].progress_apply(lambda x: cleaning(x))
  
  tqdm.pandas(desc="Proses Tokenizing")
  data['review'] = data['review'].progress_apply(lambda x: word_tokenize(x))
  
  tqdm.pandas(desc="Proses Word Normalizing")
  data['review'] = data['review'].progress_apply(lambda x: replace_slang_word(x, indo_slang_word))
  
  tqdm.pandas(desc="Proses Stopword Removal")
  data['review'] = data['review'].progress_apply(stopword_removal)
  
  tqdm.pandas(desc="Proses Stemming")
  data['review'] = data['review'].progress_apply(lambda x: [stemmer.stem(word) for word in x])
  data["review"] = data["review"].apply(' '.join)
  
  # drop unused coloumns
  data['label'] = data[label]
  data = data.drop(["kebersihan", "linen", "service", "Gabungan", "aspek"], axis=1)
  
  return data

# Datasets Preprocessing

In [58]:
kebersihan_positive = pd.read_csv('datasets/kebersihan-positive.csv')
kebersihan_negative = pd.read_csv('datasets/kebersihan-negative.csv')

service_positive = pd.read_csv('datasets/service-positive.csv')
service_negative = pd.read_csv('datasets/service-negative.csv')

linen_positive = pd.read_csv('datasets/linen-positive.csv')
linen_negative = pd.read_csv('datasets/linen-negative.csv')


# Combine dataframes
kebersihan_combined = pd.concat([kebersihan_positive, kebersihan_negative])
service_combined = pd.concat([service_positive, service_negative])
linen_combined = pd.concat([linen_positive, linen_negative])

# aspect labeling
kebersihan_combined['aspek'] = 'kebersihan'
service_combined['aspek'] =  'service'
linen_combined['aspek'] =  'linen'

# Combine all dataframes
all_data = pd.concat([kebersihan_combined, service_combined, linen_combined])

all_data.head()

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek
0,Tempat nyaman bersih tapi tv terlalu tinggi ti...,pos,neut,neut,posneutneut,kebersihan
1,"hotelnya bersih dan nyaman, tp keyika masuk ka...",pos,neut,neut,posneutneut,kebersihan
2,"Bersih,nyaman & asri. Cuma syg prnh airnya mat...",pos,neut,neut,posneutneut,kebersihan
3,"Kamar bersih dan nyaman, sarapan minimalis, cu...",pos,neut,neut,posneutneut,kebersihan
4,bersih tempatnya. saya suka. cuma air panasnya...,pos,neut,neut,posneutneut,kebersihan


In [59]:
# save combined dataset as new csv file
all_data.to_csv('datasets/hotel-review.csv', index=False)

## Clustering Datasets

In [60]:
df = pd.read_csv('datasets/hotel-review.csv')
df.head(10)

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek
0,Tempat nyaman bersih tapi tv terlalu tinggi ti...,pos,neut,neut,posneutneut,kebersihan
1,"hotelnya bersih dan nyaman, tp keyika masuk ka...",pos,neut,neut,posneutneut,kebersihan
2,"Bersih,nyaman & asri. Cuma syg prnh airnya mat...",pos,neut,neut,posneutneut,kebersihan
3,"Kamar bersih dan nyaman, sarapan minimalis, cu...",pos,neut,neut,posneutneut,kebersihan
4,bersih tempatnya. saya suka. cuma air panasnya...,pos,neut,neut,posneutneut,kebersihan
5,"Bersih, nyaman, tapi WiFi lola banget",pos,neut,neut,posneutneut,kebersihan
6,"Kamar bagus,rapi,bersih dan wangi tapi sayang ...",pos,neut,neut,posneutneut,kebersihan
7,"bersih, kamar yg nyaman to kurang 1 yaitu WiFi...",pos,neut,neut,posneutneut,kebersihan
8,kamar bersih namun air panas tidak teesedia,pos,neut,neut,posneutneut,kebersihan
9,"haha..keliatan kolam renang besar,gk tau buat ...",pos,neut,neut,posneutneut,kebersihan


In [61]:
df_clean = data_preprocessor(df, 'aspek')
df_clean.head()

Proses Cleaning: 100%|██████████| 894/894 [00:00<00:00, 45775.59it/s]
Proses Tokenizing: 100%|██████████| 894/894 [00:00<00:00, 15683.64it/s]
Proses Word Normalizing: 100%|██████████| 894/894 [00:13<00:00, 63.93it/s]
Proses Stopword Removal: 100%|██████████| 894/894 [00:00<00:00, 12303.56it/s]
Proses Stemming: 100%|██████████| 894/894 [00:52<00:00, 16.99it/s] 


Unnamed: 0,review,label
0,nyaman bersih tv lihat,kebersihan
1,hotel bersih nyaman keyika masuk kamar bau bek...,kebersihan
2,bersih nyaman asri sayang air mati lokasi bkn ...,kebersihan
3,kamar bersih nyaman sarap minimal menu air pan...,kebersihan
4,bersih tempat suka air panas macet kmrn,kebersihan


In [62]:
# save data to csv file
df_clean.to_csv('datasets/cleaned_datasets/clean_clustering_dataset.csv', index=False)

## Aspek Kebersihan

In [75]:
kebersihan_combined.head()

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek,label
0,nyaman bersih tv lihat,pos,neut,neut,posneutneut,kebersihan,pos
1,hotel bersih nyaman keyika masuk kamar bau bek...,pos,neut,neut,posneutneut,kebersihan,pos
2,bersih nyaman asri sayang air mati lokasi bkn ...,pos,neut,neut,posneutneut,kebersihan,pos
3,kamar bersih nyaman sarap minimal menu air pan...,pos,neut,neut,posneutneut,kebersihan,pos
4,bersih tempat suka air panas macet kmrn,pos,neut,neut,posneutneut,kebersihan,pos


In [76]:
df_kebersihan_clean = data_preprocessor(kebersihan_combined, 'kebersihan')
df_kebersihan_clean.head()

TypeError: data_preprocessor() got an unexpected keyword argument 'index'

In [77]:
df_kebersihan_clean.to_csv('datasets/cleaned_datasets/cleaned-kebersihan-data.csv', index=False)

## Aspek Service

In [68]:
service_combined.head()

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek
0,"Pelayanan ramah, namun air panas shower perlu ...",neut,neut,pos,neutneutpos,service
1,staff semua ramah dan baik. namun lokasi sang...,neut,neut,pos,neutneutpos,service
2,"(+) kamar cukup luas, staff ramah, sarapan ena...",neut,neut,pos,neutneutpos,service
3,pelayanan bagus sangat ramah. cuma ac kurang d...,neut,neut,pos,neutneutpos,service
4,"pelayanan baik, kondisi kamar baik, cuma tolon...",neut,neut,pos,neutneutpos,service


In [69]:
df_service_clean = data_preprocessor(service_combined, 'service')
df_service_clean.head()

Proses Cleaning: 100%|██████████| 298/298 [00:00<00:00, 33098.60it/s]
Proses Tokenizing: 100%|██████████| 298/298 [00:00<00:00, 14195.21it/s]
Proses Word Normalizing: 100%|██████████| 298/298 [00:04<00:00, 59.97it/s]
Proses Stopword Removal: 100%|██████████| 298/298 [00:00<00:00, 11457.64it/s]
Proses Stemming: 100%|██████████| 298/298 [00:00<00:00, 29882.67it/s]


Unnamed: 0,review,label
0,layan ramah air panas shower baik,pos
1,staff ramah lokasi harga mahal fasilitas minim...,pos
2,kamar luas staff ramah sarap enak air kamar ma...,pos
3,layan bagus ramah ac dingin channel tv lengkap,pos
4,layan kondisi kamar tolong perhati kamar mandi...,pos


In [78]:
df_service_clean.to_csv('datasets/cleaned_datasets/cleaned-service-data.csv', index=False)

## Aspek Linen

In [71]:
linen_combined.head()

Unnamed: 0,review,kebersihan,linen,service,Gabungan,aspek
0,Ada harga ada rupa.. Tanpa makan pagi tapi dap...,neut,pos,neut,neutposneut,linen
1,bantalnya enak.. kamarnya dingin.. keset ga ad...,neut,pos,neut,neutposneut,linen
2,"tampilan luar boleh biasa tapi kamarnya luas, ...",neut,pos,neut,neutposneut,linen
3,"Kamar terjangkau, tersedia snack dan makan pag...",neut,pos,neut,neutposneut,linen
4,good.. ada snack di stiap kamar. cm syg pas dp...,neut,pos,neut,neutposneut,linen


In [72]:
df_linen_clean = data_preprocessor(linen_combined, 'linen')
df_linen_clean.head()

Proses Cleaning: 100%|██████████| 298/298 [00:00<00:00, 29832.03it/s]
Proses Tokenizing: 100%|██████████| 298/298 [00:00<00:00, 14909.26it/s]
Proses Word Normalizing: 100%|██████████| 298/298 [00:04<00:00, 65.10it/s]
Proses Stopword Removal: 100%|██████████| 298/298 [00:00<00:00, 11054.73it/s]
Proses Stemming: 100%|██████████| 298/298 [00:00<00:00, 26946.27it/s]


Unnamed: 0,review,label
0,harga rupa makan pagi snack kurang air panas f...,pos
1,bantal enak kamar dingin keset nha,pos
2,tampil kamar luas bed empuk fasilitas lengkap ...,pos
3,kamar jangkau sedia snack makan pagi kamar bau...,pos
4,good snack kamar sayang pas kamar ac nya dingi...,pos


In [79]:
df_linen_clean.to_csv('datasets/cleaned_datasets/cleaned-linen-data.csv', index=False)