In [39]:
import re
import pandas as pd
import matplotlib.pyplot as plt
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

## Import Data

In [3]:
DF_PATH = '../data/interim/'
df_food = pd.read_csv(DF_PATH + '0_food_labelled_resampled.csv')
df_electronic = pd.read_csv(DF_PATH + '0_electronic_labelled_resampled.csv')
df_fashion = pd.read_csv(DF_PATH + '0_fashion_labelled_resampled.csv')

In [3]:
df_food.head()

Unnamed: 0,reviews,label
0,Harga:sedang\nKualitas:sedang\nRasa:blm tau\n\...,1
1,"Bagusss bangettttt ,pengemasan aman bangett re...",1
2,Kualitas:bsgus\nHarga:standar\n\nPaketny dah D...,1
3,Bumbunya mantappppppppppppppppppppppppppppppp ...,1
4,Harga:ok\nRasa:coklat\nKualitas:ok\n\nTerima k...,1


In [4]:
df_electronic.head()

Unnamed: 0,reviews,label
0,Harga murah speaker lumayan kenceng bsa karoke...,1
1,"Terima kasih barang sudah sampai, pengiriman s...",1
2,"Mantap barang nya pengiriman cepat,seller rama...",1
3,Mantap paketnya sdh sampai sesuai pesanan,1
4,"Terimakasih kak seller dan shoppe, barangnya ...",1


In [5]:
df_fashion.head()

Unnamed: 0,reviews,label
0,"Alhamdulillah jilbab sya sdh sampai, kualitas ...",1
1,Bahan nya suka bgt adem dan lembut sangat memu...,1
2,Hoodie yang tebal dan bagus,1
3,Respon penjual ramah baik. Pengiriman lumayan ...,1
4,"bagussss bangettt, bahann tebal, pengiriman ce...",1


## Clean Reviews

In [6]:
sastrawi = StopWordRemoverFactory()
stopword = sastrawi.get_stop_words()
factory = StemmerFactory()
stemmer = factory.create_stemmer()

### Lower, Remove Newline, Stemming, & Remove Stopword

In [7]:
def clean_text_1(text):
    text = text.lower() # lower all sentence
    text = text.strip() # remove newline
    text = stemmer.stem(text) # Stemming kata berimbuhan menjadi kata dasar
    text = ' '.join(word for word in text.split() if word not in stopword)
    return text

In [9]:
print('Cleaning Food Reviews...')
df_food['clean_review_v1'] = df_food['reviews'].apply(clean_text_1)  
print('Cleaning Electronic Reviews...')
df_electronic['clean_review_v1'] = df_electronic['reviews'].apply(clean_text_1) 
print('Cleaning Fashion Reviews...') 
df_fashion['clean_review_v1'] = df_fashion['reviews'].apply(clean_text_1)
print('Cleaning Data Done.')  


Cleaning Food Reviews...
Cleaning Electronic Reviews...
Cleaning Fashion Reviews...
Cleaning Data Done.


In [11]:
df_food.to_csv('../data/interim/1_food_cleaned_v1.csv', index = False)
df_electronic.to_csv('../data/interim/1_electronic_cleaned_v1.csv', index = False)
df_fashion.to_csv('../data/interim/1_fashion_cleaned_v1.csv', index = False)

### Cleaning Character Repetition

In [58]:
df_food_v1 = pd.read_csv(DF_PATH + '1_food_cleaned_v1.csv')
df_electronic_v1 = pd.read_csv(DF_PATH + '1_electronic_cleaned_v1.csv')
df_fashion_v1 = pd.read_csv(DF_PATH + '1_fashion_cleaned_v1.csv')

In [59]:
def replace_two_or_more(s):
    #look for 2 or more repetitions of character
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL) 
    return pattern.sub(r"\1", s)

In [60]:
def clean_text_2(text):
    text = ' '.join(replace_two_or_more(word) for word in str(text).split())
    return text

In [61]:
print('Cleaning Food Reviews...')
df_food_v1['clean_review_v2'] = df_food_v1['clean_review_v1'].apply(clean_text_2)  
print('Cleaning Electronic Reviews...')
df_electronic_v1['clean_review_v2'] = df_electronic_v1['clean_review_v1'].apply(clean_text_2) 
print('Cleaning Fashion Reviews...') 
df_fashion_v1['clean_review_v2'] = df_fashion_v1['clean_review_v1'].apply(clean_text_2)
print('Cleaning Data Done.')  

Cleaning Food Reviews...
Cleaning Electronic Reviews...
Cleaning Fashion Reviews...
Cleaning Data Done.


In [62]:
df_fashion_v1

Unnamed: 0,reviews,label,clean_review_v1,clean_review_v2
0,"Alhamdulillah jilbab sya sdh sampai, kualitas ...",1,alhamdulillah jilbab sya sdh kualitas nya bagu...,alhamdulilah jilbab sya sdh kualitas nya bagus...
1,Bahan nya suka bgt adem dan lembut sangat memu...,1,bahan nya suka bgt adem lembut sangat muas sekali,bahan nya suka bgt adem lembut sangat muas sekali
2,Hoodie yang tebal dan bagus,1,hoodie tebal bagus,hodie tebal bagus
3,Respon penjual ramah baik. Pengiriman lumayan ...,1,respon jual ramah baik kirim lumayan cepat har...,respon jual ramah baik kirim lumayan cepat har...
4,"bagussss bangettt, bahann tebal, pengiriman ce...",1,bagussss bangettt bahann tebal kirim cepatt me...,bagus banget bahan tebal kirim cepat memuaskan...
...,...,...,...,...
113075,Tasnya jelek..respon penjual sangat lama utk m...,0,tas jelek respon jual sangat lama utk urus pak...,tas jelek respon jual sangat lama utk urus pak...
113076,Kecewa bangetttttt\nPertama kali gua dapat bar...,0,kecewa bangetttttt pertama kali gua barang rus...,kecewa banget pertama kali gua barang rusak la...
113077,Bagus tapi pengiriman lama banget kemasan kura...,0,bagus kirim lama banget kemas kurang aman cuma...,bagus kirim lama banget kemas kurang aman cuma...
113078,Kaku dan kecil,0,kaku kecil,kaku kecil


### Menghapus Kata yang memiliki frekuensi <= 3

In [63]:
def count_word(df):
    counter = df.clean_review_v2.str.split(expand=True).stack().value_counts().reset_index()
    counter.columns = ['Word', 'Frequency'] 
    return counter

In [64]:
df_food_freq = count_word(df_food_v1)
food_freq_mino = list(df_food_freq[df_food_freq['Frequency'] <= 3]['Word'])

df_electronic_freq = count_word(df_electronic_v1)
electronic_freq_mino = list(df_electronic_freq[df_electronic_freq['Frequency'] <= 3]['Word'])

df_fashion_freq = count_word(df_fashion_v1)
fashion_freq_mino = list(df_fashion_freq[df_fashion_freq['Frequency'] <= 3]['Word'])

In [65]:
fashion_freq_mino

['raden',
 'binjai',
 '10x10',
 'muncung',
 'ridha',
 'ancam',
 'testinya',
 'ujuranya',
 'keq',
 'dahlah',
 'tenda',
 'roma',
 'voly',
 'thre',
 'peckinganya',
 'ahmad',
 'obl',
 'kahdvsbdks',
 'bajau',
 'bep',
 'mudh',
 'costumers',
 'mantulity',
 '39-40',
 'lcin',
 'shofie',
 'wasalamu',
 'y20',
 'smartphone',
 'urai',
 'ngurusnya',
 'tele',
 'tinginy',
 'srius',
 'checking',
 '230',
 'deui',
 'kncingnya',
 'e-comerce',
 'hv',
 '4harian',
 'zx',
 'cekik',
 'lahb',
 'keceqa',
 'pojok',
 'bhnx',
 'sebenerny',
 'sndr',
 'dipermasalahin',
 'kupon',
 'dhaldhd',
 'atasny',
 'ngulot',
 'serum',
 'brang2',
 'pg',
 'stardart',
 'nyantol',
 'aldjdhvajhdbd',
 'budog',
 'embara',
 'asah',
 'peact',
 'heheheheheh',
 'bluetoth',
 'kepleset',
 'msti',
 'sangatsangat',
 'orisinal',
 'semuany',
 '8k',
 'po-nya',
 '175cm',
 'sts',
 'adan',
 '20pcs',
 'gasampe',
 'nominus',
 'kantonya',
 'ganyampe',
 'banjarmasin',
 'gbl',
 'abisin',
 'things',
 'bsrang',
 'warnahnya',
 'talk',
 'dicuekin',
 'telur',


In [66]:
def clean_text_3(df, stopwords):
    list_text = []
    for text in df.clean_review_v2:
        list_text.append(' '.join(word for word in str(text).split() if word not in stopwords))
    df['clean_review_v3'] = list_text
    return df

In [67]:
print('Cleaning Food Reviews...')
df_food_v1 = clean_text_3(df_food_v1, food_freq_mino)
print('Cleaning Electronic Reviews...')
df_electronic_v1 = clean_text_3(df_electronic_v1, electronic_freq_mino)
print('Cleaning Fashion Reviews...')
df_fashion_v1 = clean_text_3(df_fashion_v1, fashion_freq_mino)
print('Cleaning Done.')

Cleaning Food Reviews...
Cleaning Electronic Reviews...
Cleaning Fashion Reviews...
Cleaning Done.


In [68]:
df_food_v1

Unnamed: 0,reviews,label,clean_review_v1,clean_review_v2,clean_review_v3
0,Harga:sedang\nKualitas:sedang\nRasa:blm tau\n\...,1,harga sedang kualitas sedang rasa blm tau pack...,harga sedang kualitas sedang rasa blm tau pack...,harga sedang kualitas sedang rasa blm tau pack...
1,"Bagusss bangettttt ,pengemasan aman bangett re...",1,bagusss bangettttt emas aman bangett recomend ...,bagus banget emas aman banget recomend pokoknya,bagus banget emas aman banget recomend pokoknya
2,Kualitas:bsgus\nHarga:standar\n\nPaketny dah D...,1,kualitas bsgus harga standar paketny dah daten...,kualitas bsgus harga standar paketny dah daten...,kualitas harga standar dah dateng sesuai pesan...
3,Bumbunya mantappppppppppppppppppppppppppppppp ...,1,bumbu mantappppppppppppppppppppppppppppppp lah...,bumbu mantap lah,bumbu mantap lah
4,Harga:ok\nRasa:coklat\nKualitas:ok\n\nTerima k...,1,harga rasa coklat kualitas terima kasih banyak...,harga rasa coklat kualitas terima kasih banyak...,harga rasa coklat kualitas terima kasih banyak...
...,...,...,...,...,...
10417,"Mengapa tidak ada plastik bungkusan susu nya, ...",0,plastik bungkus susu nya padahal mau jual,plastik bungkus susu nya padahal mau jual,plastik bungkus susu nya padahal mau jual
10418,bocor nih,0,bocor nih,bocor nih,bocor nih
10419,"PELIT AMAT AMA KARDUS, BARANG BOCOR SEMUA PAS ...",0,pelit ama kardus barang bocor semua pas sampe ...,pelit ama kardus barang bocor semua pas sampe ...,pelit ama kardus barang bocor semua pas sampe ...
10420,LAMA,0,lama,lama,lama


### Menghapus kata dengan panjang 3 karakter
Kata dengan 3 karakter biasanya merupakan kata imbuhan ataupun kata sambung

In [69]:
def clean_text_4(text):
    text = ' '.join(word for word in str(text).split() if len(word) > 3)
    return text

In [70]:
print('Cleaning Food Reviews...')
df_food_v1['clean_review_v4'] = df_food_v1['clean_review_v3'].apply(clean_text_4)  
print('Cleaning Electronic Reviews...')
df_electronic_v1['clean_review_v4'] = df_electronic_v1['clean_review_v3'].apply(clean_text_4) 
print('Cleaning Fashion Reviews...') 
df_fashion_v1['clean_review_v4'] = df_fashion_v1['clean_review_v3'].apply(clean_text_4)
print('Cleaning Data Done.')  

Cleaning Food Reviews...
Cleaning Electronic Reviews...
Cleaning Fashion Reviews...
Cleaning Data Done.


In [71]:
df_food_v1

Unnamed: 0,reviews,label,clean_review_v1,clean_review_v2,clean_review_v3,clean_review_v4
0,Harga:sedang\nKualitas:sedang\nRasa:blm tau\n\...,1,harga sedang kualitas sedang rasa blm tau pack...,harga sedang kualitas sedang rasa blm tau pack...,harga sedang kualitas sedang rasa blm tau pack...,harga sedang kualitas sedang rasa packing rapi...
1,"Bagusss bangettttt ,pengemasan aman bangett re...",1,bagusss bangettttt emas aman bangett recomend ...,bagus banget emas aman banget recomend pokoknya,bagus banget emas aman banget recomend pokoknya,bagus banget emas aman banget recomend pokoknya
2,Kualitas:bsgus\nHarga:standar\n\nPaketny dah D...,1,kualitas bsgus harga standar paketny dah daten...,kualitas bsgus harga standar paketny dah daten...,kualitas harga standar dah dateng sesuai pesan...,kualitas harga standar dateng sesuai pesan lam...
3,Bumbunya mantappppppppppppppppppppppppppppppp ...,1,bumbu mantappppppppppppppppppppppppppppppp lah...,bumbu mantap lah,bumbu mantap lah,bumbu mantap
4,Harga:ok\nRasa:coklat\nKualitas:ok\n\nTerima k...,1,harga rasa coklat kualitas terima kasih banyak...,harga rasa coklat kualitas terima kasih banyak...,harga rasa coklat kualitas terima kasih banyak...,harga rasa coklat kualitas terima kasih banyak...
...,...,...,...,...,...,...
10417,"Mengapa tidak ada plastik bungkusan susu nya, ...",0,plastik bungkus susu nya padahal mau jual,plastik bungkus susu nya padahal mau jual,plastik bungkus susu nya padahal mau jual,plastik bungkus susu padahal jual
10418,bocor nih,0,bocor nih,bocor nih,bocor nih,bocor
10419,"PELIT AMAT AMA KARDUS, BARANG BOCOR SEMUA PAS ...",0,pelit ama kardus barang bocor semua pas sampe ...,pelit ama kardus barang bocor semua pas sampe ...,pelit ama kardus barang bocor semua pas sampe ...,pelit kardus barang bocor semua sampe packing ...
10420,LAMA,0,lama,lama,lama,lama


In [72]:
counter = df_fashion_v1.clean_review_v4.str.split(expand=True).stack().value_counts().reset_index()
counter.columns = ['Word', 'Frequency']
counter

Unnamed: 0,Word,Frequency
0,bagus,44681
1,sesuai,31905
2,harga,26525
3,kirim,23611
4,banget,23455
...,...,...
6446,dipesan,4
6447,prah,4
6448,gembira,4
6449,klop,4


In [73]:
df_food_v1.to_csv('../data/interim/2_food_cleaned_v2.csv', index = False)
df_electronic_v1.to_csv('../data/interim/2_electronic_cleaned_v2.csv', index = False)
df_fashion_v1.to_csv('../data/interim/2_fashion_cleaned_v2.csv', index = False)

### Menghapus Data Missing Value

In [86]:
df_food_v2 = pd.read_csv('../data/interim/2_food_cleaned_v2.csv')
df_electronic_v2 = pd.read_csv('../data/interim/2_electronic_cleaned_v2.csv')
df_fashion_v2 = pd.read_csv('../data/interim/2_fashion_cleaned_v2.csv')

In [87]:
df_food_v2 = df_food_v2[['clean_review_v4', 'label']]
df_electronic_v2 = df_electronic_v2[['clean_review_v4', 'label']]
df_fashion_v2 = df_fashion_v2[['clean_review_v4', 'label']]

#### Cek Missing Value

In [88]:
# Missing Value Data Food
df_food_v2.isna().sum()

clean_review_v4    132
label                0
dtype: int64

In [89]:
# Missing Value Data Electronic
df_electronic_v2.isna().sum()

clean_review_v4    886
label                0
dtype: int64

In [90]:
# Missing Value Data Fashion
df_fashion_v2.isna().sum()

clean_review_v4    2194
label                 0
dtype: int64

#### Hapus Missing Value

In [91]:
df_food_v2 = df_food_v2.dropna(axis=0).reset_index(drop=True)
df_electronic_v2 = df_electronic_v2.dropna(axis=0).reset_index(drop=True)
df_fashion_v2 = df_fashion_v2.dropna(axis=0).reset_index(drop=True)

#### Cek Ulang Missing Value

In [92]:
# Missing Value Data Food
df_food_v2.isna().sum()

clean_review_v4    0
label              0
dtype: int64

In [93]:
# Missing Value Data Electronic
df_electronic_v2.isna().sum()

clean_review_v4    0
label              0
dtype: int64

In [94]:
# Missing Value Data Fashion
df_fashion_v2.isna().sum()

clean_review_v4    0
label              0
dtype: int64

### Rename Kolom

In [95]:
df_food_v2 = df_food_v2.rename(columns = {'clean_review_v4' : 'reviews'})
df_electronic_v2 = df_electronic_v2.rename(columns = {'clean_review_v4' : 'reviews'})
df_fashion_v2 = df_fashion_v2.rename(columns = {'clean_review_v4' : 'reviews'})

In [96]:
df_food_v2

Unnamed: 0,reviews,label
0,harga sedang kualitas sedang rasa packing rapi...,1
1,bagus banget emas aman banget recomend pokoknya,1
2,kualitas harga standar dateng sesuai pesan lam...,1
3,bumbu mantap,1
4,harga rasa coklat kualitas terima kasih banyak...,1
...,...,...
10285,plastik bungkus susu padahal jual,0
10286,bocor,0
10287,pelit kardus barang bocor semua sampe packing ...,0
10288,lama,0


In [97]:
df_electronic_v2

Unnamed: 0,reviews,label
0,harga murah speaker lumayan kenceng karoke mantap,1
1,terima kasih barang kirim cepat mantap banget,1
2,mantap barang kirim cepat seler ramah pokok ma...,1
3,mantap paket sesuai pesan,1
4,terimakasih seler shope barang aman bagus,1
...,...,...
48831,kira pakai pakai nonton super,0
48832,semut,0
48833,baru rusak,0
48834,barang terima terima kasih,0


In [98]:
df_fashion_v2

Unnamed: 0,reviews,label
0,alhamdulilah jilbab kualitas bagus banget kecewa,1
1,bahan suka adem lembut sangat muas sekali,1
2,hodie tebal bagus,1
3,respon jual ramah baik kirim lumayan cepat har...,1
4,bagus banget bahan tebal kirim cepat memuaskan...,1
...,...,...
110881,jelek respon jual sangat lama urus paket bahan...,0
110882,kecewa banget pertama kali barang rusak lama b...,0
110883,bagus kirim lama banget kemas kurang aman cuma...,0
110884,kaku kecil,0


In [99]:
df_food_v2.to_csv('../data/interim/3_food_cleaned_v3.csv', index = False)
df_electronic_v2.to_csv('../data/interim/3_electronic_cleaned_v3.csv', index = False)
df_fashion_v2.to_csv('../data/interim/3_fashion_cleaned_v3.csv', index = False)