Use the same environment as ***analysis.ipynb***.

# Load Dataset

In [92]:
import pandas as pd

In [93]:
# Read the raw review dataset from disk and preview its first rows
raw_csv_path = './../dataset/PRDECT-ID Dataset.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Category,Product Name,Location,Price,Overall Rating,Number Sold,Total Review,Customer Rating,Customer Review,Sentiment,Emotion
0,Computers and Laptops,Wireless Keyboard i8 Mini TouchPad Mouse 2.4G ...,Jakarta Utara,53500,4.9,5449,2369,5,Alhamdulillah berfungsi dengan baik. Packaging...,Positive,Happy
1,Computers and Laptops,PAKET LISENSI WINDOWS 10 PRO DAN OFFICE 2019 O...,Kota Tangerang Selatan,72000,4.9,2359,1044,5,"barang bagus dan respon cepat, harga bersaing ...",Positive,Happy
2,Computers and Laptops,SSD Midasforce 128 Gb - Tanpa Caddy,Jakarta Barat,213000,5.0,12300,3573,5,"barang bagus, berfungsi dengan baik, seler ram...",Positive,Happy
3,Computers and Laptops,ADAPTOR CHARGER MONITOR LCD LED TV LG merek LG...,Jakarta Timur,55000,4.7,2030,672,5,bagus sesuai harapan penjual nya juga ramah. t...,Positive,Happy
4,Computers and Laptops,ADAPTOR CHARGER MONITOR LCD LED TV LG merek LG...,Jakarta Timur,55000,4.7,2030,672,5,"Barang Bagus, pengemasan Aman, dapat Berfungsi...",Positive,Happy


# Data Preparation and Cleaning

In [94]:
# Count how often each emotion occurs within each sentiment class
emotion_by_sentiment = df.groupby('Sentiment')['Emotion'].value_counts()
print('Emotion Value Count Group by Sentiment:')
print(emotion_by_sentiment)

Emotion Value Count Group by Sentiment:
Sentiment  Emotion
Negative   Sadness    1202
           Fear        920
           Anger       699
Positive   Happy      1770
           Love        809
Name: count, dtype: int64


In [95]:
# Drop unnecessary columns.
# `df` is rebound (no inplace=True) and errors='ignore' is passed so this cell
# can be re-run safely: once the columns are gone, a second execution is a
# no-op instead of raising KeyError.
df = df.drop(columns=[
    'Category', 'Product Name', 'Location', 'Price',
    'Overall Rating', 'Number Sold', 'Total Review',
    'Customer Rating', 'Emotion'
], errors='ignore')

In [96]:
# Record the (rows, columns) shape before any rows are removed
shape_before = df.shape
print('Shape before Cleaning: ', shape_before)

Shape before Cleaning:  (5400, 2)


In [97]:
# Count missing entries in every remaining column
missing_per_column = df.isna().sum()
print('Missing Values:')
print(missing_per_column)

Missing Values:
Customer Review    0
Sentiment          0
dtype: int64


In [98]:
# Check for duplicates: compute the mask once and reuse it for the count
# and for the listing of the duplicated rows
duplicate_mask = df.duplicated()
print('Before Drop Duplicates:')
print('Duplicates: ', duplicate_mask.sum())
print('Duplicate Rows:')
print(df[duplicate_mask])

Before Drop Duplicates:
Duplicates:  95
Duplicate Rows:
                                        Customer Review Sentiment
29    mantap kipasnya kenceng, barangnya berkualitas...  Positive
72    tidak berfungsi, tapi yaa sudahlah sudah ditam...  Negative
78      Sangat kecewa. Baru 4 bulan scroll sudah rusak.  Negative
189   biarkan gambar bercerita.. pesan 20 bh yg rusa...  Negative
190   Seharusnya chat dijadikan makesure barang, war...  Negative
...                                                 ...       ...
4032                             Pengiriman sangat lama  Negative
4144  Packing seadanya, bubble hanya selapis saja di...  Negative
4910                        packingnya kurang memuaskan  Negative
5173  Pengiriman cepat. Pesanan sesuai. Gak ada caca...  Positive
5278                                       bagus banget  Positive

[95 rows x 2 columns]


In [99]:
# Remove duplicated rows from the frame
df.drop_duplicates(inplace=True)

# Verify the result: the mask below should no longer flag any row
remaining_duplicates = df.duplicated()
print('After Drop Duplicates:')
print('Duplicates: ', remaining_duplicates.sum())
print('Duplicate Rows:')
print(df[remaining_duplicates])

After Drop Duplicates:
Duplicates:  0
Duplicate Rows:
Empty DataFrame
Columns: [Customer Review, Sentiment]
Index: []


In [100]:
# Record the (rows, columns) shape once duplicates have been removed
shape_after = df.shape
print('Shape after Cleaning: ', shape_after)

Shape after Cleaning:  (5305, 2)


# Label Id Dictionary

In [101]:
# Collect the distinct sentiment labels in sorted order.
# Iterating a set of strings has no stable order across interpreter sessions
# (string hashing is randomized), which would let the label <-> id mapping
# change between runs. Sorting makes the mapping reproducible.
labels = sorted(set(df['Sentiment'].tolist()))
print(f'Label Names: {labels}')

# Lookup tables between integer class ids and label names
utils = {
    'id2label': {idx: label for idx, label in enumerate(labels)},
    'label2id': {label: idx for idx, label in enumerate(labels)}
}



Label Names: ['Negative', 'Positive']


# Preprocess Text

In [102]:
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import string

In [103]:
# Sastrawi stemmer instance reused for every token
stemmer = StemmerFactory().create_stemmer()

# Sastrawi stop word list, extended in place with slang words
stopwords = StopWordRemoverFactory().get_stop_words()
stopwords += [
    'yg', 'nya', 'aja', 'dg', 'dgn', 'ga', 'gak', 'gk', 'tdk',
    'tpi', 'jg', 'krn', 'udah', 'sy', 'sdh', 'lah', 'deh', 'ngga',
]

# ASCII punctuation characters from the standard library
punctuation = string.punctuation

# Spelled-out number words to filter out
num_words = [
    'nol', 'satu', 'dua', 'tiga', 'empat', 'lima',
    'enam', 'tujuh', 'delapan', 'sembilan', 'sepuluh',
]

# Store the resources next to the label maps so they are saved together
utils.update(stopwords=stopwords, punctuation=punctuation, num_words=num_words)

print(f'Stopwords: {stopwords}')
print(f'Punctuation: {punctuation}')
print(f'Number Words: {num_words}')
print(f'Utils: {utils}')

Stopwords: ['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', 'ia', 'seperti', 'jika', 'jika', 'sehingga', 'kembali', 'dan', 'tidak', 'ini', 'karena', 'kepada', 'oleh', 'saat', 'harus', 'sementara', 'setelah', 'belum', 'kami', 'sekitar', 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 'ketika', 'adalah', 'itu', 'dalam', 'bisa', 'bahwa', 'atau', 'hanya', 'kita', 'dengan', 'akan', 'juga', 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara', 'agar', 'lain', 'anda', 'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 'lagi', 'maka', 'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'sebelum', 'sesudah', 'supaya', 'guna', 'kah', 'pun', 'sampai', 'sedangkan', 'selagi', 'sementara', 'tetapi', 'apakah', 'kecuali', 'sebab', 'selain', 'seolah', 'seraya', 'seterusnya', 'tanpa', 'agak', 'boleh', 'dapat', 'dsb', 'dst', 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'tapi', 'ingin', 'juga', 'nggak', 'mari', 'nanti', 'me

In [104]:
def preprocess_text(sentence):
    """Normalize one review into a space-separated string of stemmed tokens.

    The sentence is lower-cased and split with ``word_tokenize``. A token is
    kept only when it is not found in ``punctuation``, is not listed in
    ``num_words``, is purely alphabetic and is not in ``stopwords``. The
    stopword check is applied to the unstemmed token; every kept token is
    then passed through ``stemmer.stem``.

    Args:
        sentence: Raw review text; must provide ``.lower()``.

    Returns:
        The stemmed tokens joined by single spaces, or an empty string when
        no token is kept.
    """
    stemmed_tokens = []
    for token in word_tokenize(sentence.lower()):
        # `punctuation` is a string, so this is a substring membership test
        if token in punctuation:
            continue
        if token in num_words or not token.isalpha():
            continue
        if token in stopwords:
            continue
        stemmed_tokens.append(stemmer.stem(token))
    return ' '.join(stemmed_tokens)

In [105]:
# Build the cleaned frame: preprocessed review text plus the integer label id
df_clean = pd.DataFrame({
    'text': df['Customer Review'].apply(preprocess_text),
    'id': df['Sentiment'].map(utils['label2id']),
})

# Drop rows whose text became empty after preprocessing, then renumber
empty_text = df_clean[df_clean['text'] == '']
df_clean = df_clean.drop(index=empty_text.index).reset_index(drop=True)

df_clean.head()

Unnamed: 0,text,id
0,alhamdulillah fungsi baik packaging aman respo...,1
1,barang bagus respon cepat harga saing,1
2,barang bagus fungsi baik seler ramah kirim cepat,1
3,bagus sesuai harap jual ramah trimakasih lapak,1
4,barang bagus emas aman fungsi baik,1


# Save Cleaned DataFrame and Utils Dictionary

In [106]:
import json

# Save the dictionary as JSON next to the notebook.
# Note: JSON object keys are always strings, so the integer keys of
# utils['id2label'] are written as "0", "1", ... and come back as strings
# when the file is loaded.
with open('./utils.json', 'w') as utils_file:
    utils_file.write(json.dumps(utils))

In [107]:
# Write the cleaned text/id table to disk without the index column
cleaned_csv_path = './../dataset/cleaned_dataset.csv'
df_clean.to_csv(cleaned_csv_path, index=False)