In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns 

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

In [2]:
dataset = pd.read_csv(r'before_preprocessing.csv')
dataset.head()

Unnamed: 0,review_id,review_text,price,packaging,product,aroma
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1


In [3]:
import re 
# casefolding 

def clean(dataset):
  temp_text = []

  for txt in dataset:
    # removal of @name[mention]
    txt = re.sub(r"(?:\@|https?\://)\S+", "", txt)

    # removal of links[https://blabala.com]
    # tw = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", tw)
    txt = re.sub(r"http\S+", "", txt)

    # removal of new line
    txt = re.sub('\n', '', txt)

    # removal of RT
    txt = re.sub('RT', '', txt)

    # removal of punctuations and numbers
    txt = re.sub("[^a-zA-Z^']", " ", txt)
    txt = re.sub(" {2,}", " ", txt)

    # remove leading and trailing whitespace
    txt = txt.strip()

    # remove whitespace with a single space
    txt = re.sub(r'\s+', ' ', txt)

    # convert text to Lowercase
    text = txt.lower();
    temp_text.append(txt)
  return temp_text 

dataset['Clean_text'] = clean(dataset['review_text'])
dataset.head()

Unnamed: 0,review_id,review_text,price,packaging,product,aroma,Clean_text
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0,sunscreen termahal yang pernah gue beli ini ka...
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0,ini enak banget dipakainya enteng banget diwaj...
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1,the description is quite right produk ini eman...
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0,bisa untuk wajah dan badan dengan harga yang s...
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1,saya beli produk ini karena suka banget wangin...


In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
#TOKENISASI
def token(dataset):
  return dataset.apply(nltk.word_tokenize)

dataset['Clean_text'] = token(dataset['Clean_text'])
dataset.head()

[nltk_data] Downloading package punkt to C:\Users\Yusrifa Deta
[nltk_data]     Kirana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Yusrifa Deta
[nltk_data]     Kirana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,review_id,review_text,price,packaging,product,aroma,Clean_text
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0,"[sunscreen, termahal, yang, pernah, gue, beli,..."
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0,"[ini, enak, banget, dipakainya, enteng, banget..."
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1,"[the, description, is, quite, right, produk, i..."
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0,"[bisa, untuk, wajah, dan, badan, dengan, harga..."
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1,"[saya, beli, produk, ini, karena, suka, banget..."


In [5]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory # stemming indonesian language
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.corpus import stopwords

#Stopwordremoval

def stop_words(dataset) :
  stop_words = set(stopwords.words('indonesian','english'))
  return dataset.apply(lambda x: [item for item in x if item not in stop_words])

dataset['Clean_text'] = stop_words(dataset['Clean_text'])
dataset.head()

Unnamed: 0,review_id,review_text,price,packaging,product,aroma,Clean_text
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0,"[sunscreen, termahal, gue, beli, kayanya, but,..."
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0,"[enak, banget, dipakainya, enteng, banget, diw..."
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1,"[the, description, is, quite, right, produk, e..."
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0,"[wajah, badan, harga, affordable, dipakai, waj..."
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1,"[beli, produk, suka, banget, wanginya, jarang,..."


In [6]:
#Stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory # stemming indonesia

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem(dataset) :
    hasil = stemmer.stem(dataset)
    return hasil


dataset['Clean_text'] = dataset['Clean_text'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
dataset.head()

Unnamed: 0,review_id,review_text,price,packaging,product,aroma,Clean_text
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0,sunscreen termahal gue beli kayanya but it 's ...
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0,enak banget dipakainya enteng banget diwajah l...
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1,the description is quite right produk emang co...
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0,wajah badan harga affordable dipakai wajah oks...
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1,beli produk suka banget wanginya jarang suka p...


In [7]:
dataset['Clean_text'] = dataset.apply(lambda row : stem(row['Clean_text']), axis = 1)

In [8]:
dataset.to_csv('data_preprocessing_tanpaslangword.csv', index=False, sep= ';')