In [1]:
import pandas as pd
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

## Import Data

In [2]:
# Directory that holds the intermediate review CSVs (trailing slash required,
# because file names are appended to it directly).
DF_PATH = '../data/interim/'

# One frame per product category, each read from its own CSV in DF_PATH.
df_food = pd.read_csv(f'{DF_PATH}0_food_labelled_resampled.csv')
df_electronic = pd.read_csv(f'{DF_PATH}0_electronic_labelled_resampled.csv')
df_fashion = pd.read_csv(f'{DF_PATH}0_fashion_labelled_resampled.csv')

In [3]:
# Preview the first rows of the food reviews frame.
df_food.head()

Unnamed: 0,reviews,label
0,Harga:sedang\nKualitas:sedang\nRasa:blm tau\n\...,1
1,"Bagusss bangettttt ,pengemasan aman bangett re...",1
2,Kualitas:bsgus\nHarga:standar\n\nPaketny dah D...,1
3,Bumbunya mantappppppppppppppppppppppppppppppp ...,1
4,Harga:ok\nRasa:coklat\nKualitas:ok\n\nTerima k...,1


In [4]:
# Preview the first rows of the electronics reviews frame.
df_electronic.head()

Unnamed: 0,reviews,label
0,Harga murah speaker lumayan kenceng bsa karoke...,1
1,"Terima kasih barang sudah sampai, pengiriman s...",1
2,"Mantap barang nya pengiriman cepat,seller rama...",1
3,Mantap paketnya sdh sampai sesuai pesanan,1
4,"Terimakasih kak seller dan shoppe, barangnya ...",1


In [5]:
# Preview the first rows of the fashion reviews frame.
df_fashion.head()

Unnamed: 0,reviews,label
0,"Alhamdulillah jilbab sya sdh sampai, kualitas ...",1
1,Bahan nya suka bgt adem dan lembut sangat memu...,1
2,Hoodie yang tebal dan bagus,1
3,Respon penjual ramah baik. Pengiriman lumayan ...,1
4,"bagussss bangettt, bahann tebal, pengiriman ce...",1


## Clean Reviews

In [6]:
# Stemmer used by clean_text_1 to reduce words to their base form.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stop-word list used by clean_text_1 to filter tokens after stemming.
sastrawi = StopWordRemoverFactory()
stopword = sastrawi.get_stop_words()

### Lowercase, Trim Whitespace, Stem, & Remove Stopwords

In [7]:
def clean_text_1(text):
    """Normalise one raw review: lowercase, trim, stem, and drop stopwords.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN
        that pandas uses for an empty CSV cell, or None) is treated as an
        empty review.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, or '' when nothing
        is left (or the input was not a string).
    """
    # Missing reviews reach Series.apply() as float NaN / None; without this
    # guard text.lower() raises AttributeError and aborts the whole column.
    if not isinstance(text, str):
        return ''

    text = text.lower()  # lowercase the whole review
    # strip() only trims leading/trailing whitespace; interior newlines are
    # collapsed further down by split()/join.
    text = text.strip()
    text = stemmer.stem(text)  # reduce affixed words to their root form
    # Tokenise on whitespace, drop stopwords, re-join with single spaces.
    text = ' '.join(word for word in text.split() if word not in stopword)
    return text

In [9]:
# Add a 'clean_review_v1' column to each category frame by running
# clean_text_1 over its raw 'reviews' column, announcing progress as we go.
for message, frame in (
    ('Cleaning Food Reviews...', df_food),
    ('Cleaning Electronic Reviews...', df_electronic),
    ('Cleaning Fashion Reviews...', df_fashion),
):
    print(message)
    frame['clean_review_v1'] = frame['reviews'].apply(clean_text_1)
print('Cleaning Data Done.')


Cleaning Food Reviews...
Cleaning Electronic Reviews...
Cleaning Fashion Reviews...
Cleaning Data Done.


In [11]:
# Persist the frames (including the new clean_review_v1 column) next to the
# input files. The directory comes from DF_PATH rather than being repeated
# as a literal, so loading and saving cannot drift apart.
# NOTE(review): the frames still carry the 'Unnamed: 0' column visible in the
# head() previews, so it is written to these files as well.
df_food.to_csv(DF_PATH + '1_food_cleaned_v1.csv', index=False)
df_electronic.to_csv(DF_PATH + '1_electronic_cleaned_v1.csv', index=False)
df_fashion.to_csv(DF_PATH + '1_fashion_cleaned_v1.csv', index=False)

### Remove Short Words (≤ 3 characters) & Overlong Random-Letter Words (≥ 15 characters)

In [21]:
def clean_text_2(text, min_len=4, max_len=14):
    """Drop tokens whose length falls outside ``min_len``..``max_len``.

    With the defaults this keeps words of 4 to 14 characters, removing very
    short tokens and very long ones such as heavily elongated words.

    Parameters
    ----------
    text : str
        Whitespace-separated text. Any non-string value (for example the
        float NaN pandas produces for an empty CSV cell, or None) is treated
        as empty text.
    min_len : int, optional
        Smallest token length to keep (inclusive). Default 4.
    max_len : int, optional
        Largest token length to keep (inclusive). Default 14.

    Returns
    -------
    str
        The kept tokens joined by single spaces, or '' if none remain.
    """
    # An empty cleaned review becomes NaN after a CSV round-trip; calling
    # .split() on that float would raise AttributeError.
    if not isinstance(text, str):
        return ''
    return ' '.join(
        word for word in text.split() if min_len <= len(word) <= max_len
    )

In [23]:
# Add a 'clean_review_v2' column to each category frame by running
# clean_text_2 over the already-cleaned 'clean_review_v1' column.
for message, frame in (
    ('Cleaning Food Reviews...', df_food),
    ('Cleaning Electronic Reviews...', df_electronic),
    ('Cleaning Fashion Reviews...', df_fashion),
):
    print(message)
    frame['clean_review_v2'] = frame['clean_review_v1'].apply(clean_text_2)
print('Cleaning Data Done.')

Cleaning Food Reviews...
Cleaning Electronic Reviews...
Cleaning Fashion Reviews...
Cleaning Data Done.


In [24]:
# Show the food frame so the raw reviews can be compared with both cleaned columns.
df_food

Unnamed: 0,reviews,label,clean_review_v1,clean_review_v2
0,Harga:sedang\nKualitas:sedang\nRasa:blm tau\n\...,1,harga sedang kualitas sedang rasa blm tau pack...,harga sedang kualitas sedang rasa packing rapi...
1,"Bagusss bangettttt ,pengemasan aman bangett re...",1,bagusss bangettttt emas aman bangett recomend ...,bagusss bangettttt emas aman bangett recomend ...
2,Kualitas:bsgus\nHarga:standar\n\nPaketny dah D...,1,kualitas bsgus harga standar paketny dah daten...,kualitas bsgus harga standar paketny dateng se...
3,Bumbunya mantappppppppppppppppppppppppppppppp ...,1,bumbu mantappppppppppppppppppppppppppppppp lah...,bumbu lahhhhhh
4,Harga:ok\nRasa:coklat\nKualitas:ok\n\nTerima k...,1,harga rasa coklat kualitas terima kasih banyak...,harga rasa coklat kualitas terima kasih banyak...
...,...,...,...,...
10417,"Mengapa tidak ada plastik bungkusan susu nya, ...",0,plastik bungkus susu nya padahal mau jual,plastik bungkus susu padahal jual
10418,bocor nih,0,bocor nih,bocor
10419,"PELIT AMAT AMA KARDUS, BARANG BOCOR SEMUA PAS ...",0,pelit ama kardus barang bocor semua pas sampe ...,pelit kardus barang bocor semua sampe packing ...
10420,LAMA,0,lama,lama


In [25]:
# Inspect the second-pass cleaned column on its own.
df_food.clean_review_v2

0        harga sedang kualitas sedang rasa packing rapi...
1        bagusss bangettttt emas aman bangett recomend ...
2        kualitas bsgus harga standar paketny dateng se...
3                                           bumbu lahhhhhh
4        harga rasa coklat kualitas terima kasih banyak...
                               ...                        
10417                    plastik bungkus susu padahal jual
10418                                                bocor
10419    pelit kardus barang bocor semua sampe packing ...
10420                                                 lama
10421    kualitas packing jelek buble lembar pdhl beli ...
Name: clean_review_v2, Length: 10422, dtype: object