In [6]:
import pandas as pd
import numpy as np
import string 
import re 
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from scipy.stats import mode
import swifter
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score

In [7]:
data = pd.read_csv('Data.csv', index_col=None)
df = pd.DataFrame(data)
df

Unnamed: 0,komentar,sentimen
0,"Lebih baik dijelaskan dahulu , positif-negati...",Negatif
1,pasti ingin mengambil uang ukt mahal,Negatif
2,sepertinya berita menarik agar bagus Untuk kamu,Positif
3,nahkan lebih bagus ikut diskusinya sana kalau ada,Positif
4,UB bagus sudah meningkat,Positif
...,...,...
415,PTNBH memang keren.,Positif
416,Akhir-akhir ini sedang tren setelah lulus kuli...,Positif
417,Waktu demo yang mengangkat isu dinaikan UKT da...,Negatif
418,UB tidak tahu-menahu karena persetujuan PTNBH ...,Positif


# Case Folding

In [8]:
df['komentar'] = df['komentar'].str.casefold()
df['komentar'].head()

0    lebih  baik dijelaskan dahulu , positif-negati...
1                 pasti ingin mengambil uang ukt mahal
2      sepertinya berita menarik agar bagus untuk kamu
3    nahkan lebih bagus ikut diskusinya sana kalau ada
4                             ub bagus sudah meningkat
Name: komentar, dtype: object

# Tokenization

In [9]:
def pembersihan_kata(kata):
    kata = kata.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\'," ")
    kata = kata.encode('ascii', 'replace').decode('ascii')
    kata = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", kata).split())
    return kata.replace("http://", " ").replace("https://", " ")

df['komentar'] = df['komentar'].apply(pembersihan_kata)

def hapus_nomor(kata):
    return  re.sub(r"\d+", "", kata)

df['komentar'] = df['komentar'].apply(hapus_nomor)

def hapus_tanda_baca(kata):
    hapus = string.punctuation
    return kata.translate(str.maketrans(hapus,' '*len(hapus)))

df['komentar'] = df['komentar'].apply(hapus_tanda_baca)

def hapus_spasi_diawal_dan_akhir(kata):
    return kata.strip()

df['komentar'] = df['komentar'].apply(hapus_spasi_diawal_dan_akhir)

def hapus_multiple_spasi(kata):
    return re.sub('\s+',' ',kata)

df['komentar'] = df['komentar'].apply(hapus_multiple_spasi)

def hapus_satu_kata(kata):
    return re.sub(r"\b[a-zA-Z]\b", " ", kata)

df['komentar'] = df['komentar'].apply(hapus_satu_kata)

def diubah_tokenize(kata):
    return word_tokenize(kata)

df['komentar'] = df['komentar'].apply(diubah_tokenize)

print('Tokenizing Result : \n') 
print(df['komentar'].head(5))

Tokenizing Result : 

0    [lebih, baik, dijelaskan, dahulu, positif, neg...
1          [pasti, ingin, mengambil, uang, ukt, mahal]
2    [sepertinya, berita, menarik, agar, bagus, unt...
3    [nahkan, lebih, bagus, ikut, diskusinya, sana,...
4                        [ub, bagus, sudah, meningkat]
Name: komentar, dtype: object


# Stopwords Removal

In [10]:
list_stopwords = stopwords.words('indonesian') #Mengambil Stopwords yang ada di library 
tambahan_stopwords = pd.read_csv('kumpulan_stopwords final.txt', names=['stopwords'], header=None) #Membuat stopwords tambahan didalam file txt
list_stopwords.extend(tambahan_stopwords['stopwords'][0].split(' '))#Menambahkan stopwords secara manual
list_stopwords = set(list_stopwords) - set(('tidak','kurang','baik'))
def penghapusan_stopwords(words):
    return [word for word in words if word not in list_stopwords]
df['komentar'] = df['komentar'].apply(penghapusan_stopwords)
print(df['komentar'].head(5))

0    [baik, positif, negatifnya, ptnbh, keterangann...
1                                         [ukt, mahal]
2                                              [bagus]
3                          [nahkan, bagus, diskusinya]
4                                          [ub, bagus]
Name: komentar, dtype: object


# Stemming

In [11]:
stmfactory = StemmerFactory()
stm = stmfactory.create_stemmer()
def diubah_stemmer(txt):
    return stm.stem(txt)
hasil_kata = {}
for doc in df['komentar']:
    for txt in doc:
        if txt not in hasil_kata:
            hasil_kata[txt] = ' '
for txt in hasil_kata:
    hasil_kata[txt] = diubah_stemmer(txt)
    print(txt, ":", hasil_kata[txt])
def ambil_kata_stemmed(doc):
    return [hasil_kata[txt] for txt in doc]
df['komentar'] = df['komentar'].swifter.apply(ambil_kata_stemmed) #swifter melakukan pandas processing lebih cepat

print(df['komentar'])

baik : baik
positif : positif
negatifnya : negatif
ptnbh : ptnbh
keterangannya : terang
menggiring : giring
opini : opini
minus : minus
tidak : tidak
ptn : ptn
temen : temen
kebijakan : bijak
jumpa : jumpa
pers : pers
penjelasan : jelas
ketimbang : ketimbang
menilai : nilai
berduka : duka
status : status
kasihan : kasihan
bingung : bingung
dampaknya : dampak
terpersuasi : persuasi
paham : paham
berbuat : buat
ukt : ukt
mahal : mahal
bagus : bagus
nahkan : nahkan
diskusinya : diskusi
ub : ub
dampak : dampak
positifnya : positif
pastinya : pasti
coba : coba
berbadan : badan
murah : murah
statusnya : status
pengelolahan : pengelolahan
uangnya : uang
senang : senang
bebas : bebas
sepemikiran : pikir
postingan : postingan
fakta : fakta
studi : studi
literatur : literatur
akademis : akademis
jurnal : jurnal
ilmiah : ilmiah
tesis : tesis
disertasi : disertasi
mengindikasikan : indikasi
cenderung : cenderung
merugikan : rugi
mahasiswa : mahasiswa
menimbulkan : timbul
beragam : agam
polemik : p

Pandas Apply: 100%|██████████| 420/420 [00:00<00:00, 208301.72it/s]

merekrut : rekrut
kaum : kaum
universitasnya : universitas
menerapkan : terap
prinsip : prinsip
membayangkan : bayang
tren : tren
kedokteran : dokter
dilanjutkan : lanjut
doktor : doktor
situs : situs
web : web
persetujuan : tuju
dirancang : rancang
investor : investor
0      [baik, positif, negatif, ptnbh, terang, giring...
1                                           [ukt, mahal]
2                                                [bagus]
3                               [nahkan, bagus, diskusi]
4                                            [ub, bagus]
                             ...                        
415                                       [ptnbh, keren]
416    [tren, lulus, kuliah, dokter, lanjut, saing, d...
417    [demo, angkat, naik, ukt, ptnbh, retas, situs,...
418    [ub, tidak, nahu, tuju, ptnbh, rencana, selesa...
419                                    [ptnbh, investor]
Name: komentar, Length: 420, dtype: object





In [12]:
df.to_csv("Data Asli Sesudah Preprocessing.csv")

In [13]:
data = pd.read_csv("Data Asli Sesudah Preprocessing.csv")
df = pd.DataFrame(data)
del df['Unnamed: 0']
df.head()

Unnamed: 0,komentar,sentimen
0,"['baik', 'positif', 'negatif', 'ptnbh', 'teran...",Negatif
1,"['ukt', 'mahal']",Negatif
2,['bagus'],Positif
3,"['nahkan', 'bagus', 'diskusi']",Positif
4,"['ub', 'bagus']",Positif


In [14]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
df['sentimen'] = labelencoder.fit_transform(df['sentimen'])

In [15]:
# Sentimen Positif
df[df['sentimen'] == 1].tail()

Unnamed: 0,komentar,sentimen
414,"['ptn', 'ptnbh', 'boleh', 'saing', 'bijak', 'm...",1
415,"['ptnbh', 'keren']",1
416,"['tren', 'lulus', 'kuliah', 'dokter', 'lanjut'...",1
418,"['ub', 'tidak', 'nahu', 'tuju', 'ptnbh', 'renc...",1
419,"['ptnbh', 'investor']",1


In [16]:
# Sentimen Negatif
df[df['sentimen'] == 0].tail()

Unnamed: 0,komentar,sentimen
408,"['dekat', 'hotel', 'kuliah', 'biaya', 'keluar'...",0
409,"['status', 'ptnbh', 'kedok', 'rekrut', 'kaum',...",0
410,"['tidak', 'erti', 'maksud', 'kuliah', 'ptn', '...",0
411,"['prinsip', 'ptnbh', 'ukt', 'naik', 'korupsi',...",0
417,"['demo', 'angkat', 'naik', 'ukt', 'ptnbh', 're...",0


## TF-IDF

In [17]:
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(df['komentar'])
y = df['sentimen']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [19]:
print(X_train.shape, X_test.shape)

(336, 666) (84, 666)


In [20]:
# Mengubah X_train menjadi dataframe
dataframenames = vectorizer.get_feature_names()
datadense_X_train = X_train.todense()
datalist = datadense_X_train.tolist()
X_train = pd.DataFrame(datalist, columns=dataframenames)

In [21]:
# Mengubah X_test menjadi dataframe
dataframenames = vectorizer.get_feature_names()
datadense_X_test = X_test.todense()
datalist = datadense_X_test.tolist()
X_test = pd.DataFrame(datalist, columns=dataframenames)

# Improved KNN

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

data_cosine = cosine_similarity(X_test,X_train)
data_cosine_final = pd.DataFrame(data_cosine)

In [23]:
def total_kelas(label):
    hasilakhir = {}
    for i in label:
        hasilakhir[i] = 0
    for j in label:
        if j in hasilakhir:
            hasilakhir[j] += 1
    return hasilakhir

def nilai_n(k, label):
    maks = max(label[0],label[1])
    hasil = {}
    for i in label:
        hasil[i] = round(k*label[i]/maks)
    return hasil

def hasil_Data_Latih(data,k):
    CosSim_Data_Latih = []
    for i in range(len(X_train)):  
        CosSim_Data_Latih.append((data[i]))
    CosSim_Data_Latih.sort(reverse=True)
    return CosSim_Data_Latih[:k]

def hasil_positif(data,kbaru):
    CosSim_Positif = []
    for i in range(len(X_train)):
        if y_train.iloc[i] == 1 :
            CosSim_Positif.append((data[i]))
    CosSim_Positif.sort(reverse=True)
    return CosSim_Positif[:kbaru[1]]

def hasil_negatif(data,kbaru):
    CosSim_Negatif = []
    for i in range(len(X_train)):
        if y_train.iloc[i] == 0 :
            CosSim_Negatif.append((data[i]))
    CosSim_Negatif.sort(reverse=True)
    return CosSim_Negatif[:kbaru[0]]   

def probabilitas(data,k):
    Hasil_Prob = []
    jumlahsentimen = total_kelas(y_train)
    kbaru = nilai_n(k,jumlahsentimen)
    CosSim_Data_latih = hasil_Data_Latih(data,k)
    CosSim_Data_positif = hasil_positif(data,kbaru)
    CosSim_Data_negatif = hasil_negatif(data,kbaru)
    Prob_positif = sum(CosSim_Data_positif) / sum(CosSim_Data_latih)
    Prob_negatif = sum(CosSim_Data_negatif) / sum(CosSim_Data_latih)
    if Prob_positif > Prob_negatif :
        Hasil_Prob.append(1)
    else :
        Hasil_Prob.append(0)
    return Hasil_Prob

In [24]:
y_predict=[]
for i in range(len(data_cosine_final)):
    data=data_cosine_final.iloc[i]
    data=np.array(data)   
    nei=probabilitas(data,50)
    y_predict.append(nei)
for i in range(len(y_predict)):
    y_predict[i]=y_predict[i][0]
    
y_prediction=np.array(y_predict)

  Prob_positif = sum(CosSim_Data_positif) / sum(CosSim_Data_latih)
  Prob_negatif = sum(CosSim_Data_negatif) / sum(CosSim_Data_latih)


In [25]:
print('Akurasi Score nya dari I-KNN :', accuracy_score(y_test, y_prediction))

Akurasi Score nya dari I-KNN : 0.8690476190476191


In [26]:
conf_matrix = confusion_matrix(y_test,y_prediction)
clas_report = classification_report(y_test, y_prediction)
 
print("-----Confusion Matrix---")
print(conf_matrix)
print('\n')
print("-----Classification Report ------")
print(clas_report)
print('\n')

-----Confusion Matrix---
[[46  1]
 [10 27]]


-----Classification Report ------
              precision    recall  f1-score   support

           0       0.82      0.98      0.89        47
           1       0.96      0.73      0.83        37

    accuracy                           0.87        84
   macro avg       0.89      0.85      0.86        84
weighted avg       0.88      0.87      0.87        84





Reference
1. Stopword list nya dari Putra Pandu Adikara (link :http://hikaruyuuki.lecture.ub.ac.id/kamus-kata-dasar-dan-stopword-list-bahasa-indonesia/)