In [1]:
# Import Library
# text preprocessing
import pandas as pd
import numpy as np
import csv
import requests
import io
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re # regular expression
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory # stemming indonesian language
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.corpus import stopwords
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer # to create Bag of words
from sklearn.feature_extraction.text import TfidfVectorizer # tfid Vector 
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score # confussion matrix
from sklearn.preprocessing import LabelEncoder # to convert classes to number 
from sklearn.model_selection import train_test_split  # for splitting data 
from sklearn.metrics import accuracy_score # to calculate accuracy
from sklearn.pipeline import Pipeline
#from mlxtend.plotting import plot_confusion_matrix

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
dataset = pd.read_csv(r'before_preprocessing.csv')

In [3]:
dataset.head()

Unnamed: 0,review_id,review_text,price,packaging,product,aroma
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1


In [4]:
dataset.isnull().sum()

review_id      0
review_text    0
price          0
packaging      0
product        1
aroma          0
dtype: int64

In [5]:
dataset = dataset.fillna(method="ffill")
dataset.isnull().sum()

review_id      0
review_text    0
price          0
packaging      0
product        0
aroma          0
dtype: int64

In [6]:
dataset.head()

Unnamed: 0,review_id,review_text,price,packaging,product,aroma
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1


In [7]:
# casefolding 

def clean(dataset):
  temp_text = []

  for txt in dataset:
    # removal of @name[mention]
    txt = re.sub(r"(?:\@|https?\://)\S+", "", txt)

    # removal of links[https://blabala.com]
    # tw = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "", tw)
    txt = re.sub(r"http\S+", "", txt)

    # removal of new line
    txt = re.sub('\n', '', txt)

    # removal of RT
    txt = re.sub('RT', '', txt)

    # removal of punctuations and numbers
    txt = re.sub("[^a-zA-Z^']", " ", txt)
    txt = re.sub(" {2,}", " ", txt)

    # remove leading and trailing whitespace
    txt = txt.strip()

    # remove whitespace with a single space
    txt = re.sub(r'\s+', ' ', txt)

    # convert text to Lowercase
    text = txt.lower();
    temp_text.append(txt)
  return temp_text 

dataset['Clean_text'] = clean(dataset['review_text'])
dataset.head()

Unnamed: 0,review_id,review_text,price,packaging,product,aroma,Clean_text
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0,sunscreen termahal yang pernah gue beli ini ka...
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0,ini enak banget dipakainya enteng banget diwaj...
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1,the description is quite right produk ini eman...
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0,bisa untuk wajah dan badan dengan harga yang s...
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1,saya beli produk ini karena suka banget wangin...


In [8]:
#TOKENISASI
def token(dataset):
  return dataset.apply(nltk.word_tokenize)

dataset['Clean_text'] = token(dataset['Clean_text'])
dataset.head()


Unnamed: 0,review_id,review_text,price,packaging,product,aroma,Clean_text
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0,"[sunscreen, termahal, yang, pernah, gue, beli,..."
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0,"[ini, enak, banget, dipakainya, enteng, banget..."
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1,"[the, description, is, quite, right, produk, i..."
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0,"[bisa, untuk, wajah, dan, badan, dengan, harga..."
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1,"[saya, beli, produk, ini, karena, suka, banget..."


In [9]:
#Stopwordremoval

def stop_words(dataset) :
  stop_words = set(stopwords.words('indonesian'))
  return dataset.apply(lambda x: [item for item in x if item not in stop_words])

dataset['Clean_text'] = stop_words(dataset['Clean_text'])
dataset.head()

Unnamed: 0,review_id,review_text,price,packaging,product,aroma,Clean_text
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0,"[sunscreen, termahal, gue, beli, kayanya, but,..."
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0,"[enak, banget, dipakainya, enteng, banget, diw..."
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1,"[the, description, is, quite, right, produk, e..."
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0,"[wajah, badan, harga, affordable, dipakai, waj..."
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1,"[beli, produk, suka, banget, wanginya, jarang,..."


In [10]:
dataset['Clean_text'] = dataset['Clean_text'].apply(lambda x: " ".join(x) if isinstance(x, list) else x)
dataset.head()

Unnamed: 0,review_id,review_text,price,packaging,product,aroma,Clean_text
0,708,sunscreen termahal yang pernah gue beli ini ka...,-1,0,1.0,0,sunscreen termahal gue beli kayanya but it 's ...
1,838,"ini enak banget dipakainya, enteng banget diwa...",0,0,1.0,0,enak banget dipakainya enteng banget diwajah l...
2,1166,the description is quite right. produk ini ema...,0,0,1.0,1,the description is quite right produk emang co...
3,1374,bisa untuk wajah dan badan dengan harga yang s...,1,1,1.0,0,wajah badan harga affordable dipakai wajah oks...
4,1421,saya beli produk ini karena suka banget wangin...,-1,0,0.0,1,beli produk suka banget wanginya jarang suka p...


In [11]:
slang_word = requests.get('https://raw.githubusercontent.com/louisowen6/NLP_bahasa_resources/master/combined_slang_words.txt').text
dict_slang = eval(slang_word)

df_acronym = pd.read_csv('https://raw.githubusercontent.com/ramaprakoso/analisis-sentimen/master/kamus/acronym.txt', header=None, sep='=')
df_acronym.columns = ['Singkatan', 'kepanjangan']
df_acronym.kepanjangan = df_acronym.kepanjangan.apply(lambda x: x.strip().lower())
df_acronym.Singkatan = df_acronym.Singkatan.apply(lambda x: x.strip().lower())
dict_singkatan = pd.Series(df_acronym.kepanjangan.values,index=df_acronym.Singkatan).to_dict()

dict_clean = {**dict_singkatan, **dict_slang}

In [12]:
def preprocessing_slang_singkatan(review_text, dict_clean=dict_clean):
  regex = r"\b(?:"+"|".join(re.escape(word) for word in dict_clean) + r")\b"
  reobj = re.compile(regex, re.I)
  
  return reobj.sub(lambda x:dict_clean[x.group(0)], review_text)

In [13]:
def clean_all(x, dict_clean=dict_clean):
  x = preprocessing_slang_singkatan(x, dict_clean=dict_clean)
  return x

In [14]:
tqdm.pandas()

In [15]:
dataset['Clean_text'] = dataset.Clean_text.progress_apply(lambda x: clean_all(x, dict_clean=dict_clean))

100%|██████████| 3960/3960 [00:23<00:00, 171.74it/s]


In [16]:
a = dataset[['Clean_text']]
a

Unnamed: 0,Clean_text
0,sunscreen termahal saya beli kayaknya bentuk u...
1,enak banget dipakainya enteng banget diwajah l...
2,the description is quite right produk emang co...
3,wajah badan harga affordable dipakai wajah oks...
4,beli produk suka banget wanginya jarang suka p...
...,...
3955,penyesalan berujung sudah beli sayang uangnya ...
3956,kulit yang oily dehydrated plus sensitive prod...
3957,singkatnya produk teksturnya terang gampang me...
3958,hai pretty kulit fresh elastis kaya bayi cobai...


In [17]:
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer, CountVectorizer
tfidf_transformer = TfidfTransformer()
tfidf_vectorizer = TfidfVectorizer()
count_vectorize = CountVectorizer()

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'Lupa satu kali',
    'Lupa dua kali',
    'Lupa tiga kali',
    'Lupa lagi']

vectorizer = TfidfVectorizer()

# contoh
a = vectorizer.fit_transform(corpus)
a.toarray()

array([[0.        , 0.49248889, 0.        , 0.40264194, 0.77157901,
        0.        ],
       [0.77157901, 0.49248889, 0.        , 0.40264194, 0.        ,
        0.        ],
       [0.        , 0.49248889, 0.        , 0.40264194, 0.        ,
        0.77157901],
       [0.        , 0.        , 0.88654763, 0.46263733, 0.        ,
        0.        ]])

In [19]:
a = dataset['Clean_text']
# Count TF_IDF Vectorizer
count_vectorizer = CountVectorizer()
count_vector = count_vectorizer.fit_transform(dataset['Clean_text'])
count_vector.shape

(3960, 10604)

In [20]:
a = dataset['Clean_text']
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer = tfidf_vectorizer.fit_transform(a)
print(tfidf_vectorizer)

  (0, 9385)	0.22951448514599457
  (0, 9735)	0.20241065964446703
  (0, 8434)	0.23851851705640392
  (0, 771)	0.07643694218914435
  (0, 4397)	0.1209786893655621
  (0, 813)	0.11720540005730368
  (0, 10197)	0.09870090157641434
  (0, 9812)	0.12297513385473198
  (0, 4068)	0.19064353252227764
  (0, 9086)	0.12823110932403758
  (0, 10483)	0.1224633308709759
  (0, 8973)	0.07414854791637204
  (0, 311)	0.10906969242502612
  (0, 10437)	0.14019898138631903
  (0, 1998)	0.14907152062367354
  (0, 1414)	0.16176861020530625
  (0, 129)	0.17939346138418633
  (0, 9355)	0.18873077725945817
  (0, 9379)	0.20002785704985376
  (0, 3733)	0.18720381998245783
  (0, 9190)	0.11515154158640542
  (0, 9874)	0.16834342515692532
  (0, 6061)	0.1570893305165104
  (0, 10417)	0.13783979572297111
  (0, 1485)	0.14991014341242972
  :	:
  (3959, 8842)	0.0851408663879182
  (3959, 6066)	0.1458654393433273
  (3959, 1670)	0.08455757537671851
  (3959, 547)	0.07581407806474447
  (3959, 6849)	0.12303331019290666
  (3959, 1396)	0.17383608

In [21]:
tfidf_vectorizer.shape

(3960, 10604)

In [22]:
#seleksi fitur
import numpy as np
from scipy import stats
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor,RegressorChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import ClassifierChain
from sklearn import datasets
from sklearn.feature_selection import SelectKBest,chi2 , f_regression, SelectFromModel, SelectPercentile
from sklearn.preprocessing import LabelBinarizer
from sklearn.datasets import load_iris
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MultiLabelBinarizer

In [23]:
x = tfidf_vectorizer.copy()

In [24]:
y = np.array(dataset[['price','packaging','product','aroma']])

In [25]:
print(y)

[[-1.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  1.]
 ...
 [ 0.  0. -1. -1.]
 [ 0.  0.  1.  0.]
 [ 0.  0. -1.  0.]]


In [26]:
x.shape, y.shape

((3960, 10604), (3960, 4))

In [27]:
Y = MultiLabelBinarizer().fit_transform(y)

In [28]:
Y.shape

(3960, 3)

In [29]:
x_new = SelectKBest(chi2, k=20).fit_transform(x, Y)

In [30]:
print(x_new)

  (0, 19)	0.1224633308709759
  (4, 7)	0.3456614005109887
  (4, 6)	0.08861959437778001
  (4, 18)	0.09239193871494003
  (5, 18)	0.04853565488290948
  (7, 6)	0.15803541951793348
  (7, 19)	0.21790797007104298
  (8, 17)	0.131098882285691
  (9, 8)	0.08586127886408136
  (9, 19)	0.12595836712194688
  (11, 8)	0.08002532868819434
  (12, 6)	0.1349249691554246
  (13, 17)	0.18473551273241381
  (13, 6)	0.14410463055488715
  (13, 19)	0.1986994283802922
  (14, 17)	0.18924171511629048
  (15, 8)	0.2043845417111551
  (16, 2)	0.17993698964562324
  (16, 5)	0.11467223215396563
  (17, 0)	0.16207885687663268
  (17, 5)	0.06586088495285938
  (17, 17)	0.06680130641025007
  (18, 18)	0.09920624408959677
  (19, 5)	0.07898264715210614
  (19, 8)	0.1174723436321029
  :	:
  (3917, 1)	0.18774222155509349
  (3917, 6)	0.11247018808204348
  (3922, 9)	0.06818511942639537
  (3925, 6)	0.12268636286476017
  (3928, 6)	0.09311286841211237
  (3931, 9)	0.14306635614387703
  (3933, 16)	0.13918189190570818
  (3933, 1)	0.148138430806

In [31]:
Kbest = SelectKBest(score_func = chi2, k=100).fit(x, Y)

In [32]:
skor_chi = pd.DataFrame({'features': x_new, 'Chi2Score': Kbest.scores_, 'pValue':Kbest.pvalues_ })
skor_chi

Unnamed: 0,features,Chi2Score,pValue
0,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.090314,0.955847
1,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.272176,0.872766
2,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.097568,0.952387
3,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.309692,0.856547
4,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.098911,0.951748
5,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.142349,0.931299
6,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.265517,0.875676
7,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.057947,0.971442
8,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.152782,0.926454
9,"(0, 19)\t0.1224633308709759\n (4, 7)\t0.345...",0.105203,0.948758


In [33]:
skor_chi.shape

(10604, 3)

In [34]:
from sklearn.model_selection import train_test_split
x_new_train, x_new_test, Y_train, Y_test = train_test_split(x_new,Y, test_size=0.2, random_state=0)

In [35]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor


knn = KNeighborsClassifier(n_neighbors=11)
## Fit the model on the training data.
classifier = MultiOutputClassifier(knn, n_jobs=-1)
classifier.fit(x_new_train, Y_train)
## See how the model performs on the test data.
predictions = classifier.predict(x_new_test)

In [36]:
knn_skor=classifier.score(x_new_test,np.array(Y_test))
print(knn_skor * 100 ,'%')

69.6969696969697 %


In [37]:
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import recall_score
from sklearn.metrics import  multilabel_confusion_matrix

multilabel = multilabel_confusion_matrix(Y_test,predictions)
f1_score = metrics.f1_score(Y_test,predictions, average='macro', labels=np.unique(predictions))
precision_score = metrics.precision_score(Y_test,predictions, average='macro', labels=np.unique(predictions))
recall_score = metrics.recall_score(Y_test,predictions, average='macro', labels=np.unique(predictions))
        
print('akurasi f1 score        :', f1_score* 100 ,'%')
print('akurasi precision score :', precision_score* 100 ,'%')
print('akurasi recall score    :', recall_score* 100 ,'%')

akurasi f1 score        : 83.32830177829531 %
akurasi precision score : 91.45940537449971 %
akurasi recall score    : 78.0564263322884 %
