##Read Dataset


In [None]:
# Load the labelled COVID-19 tweet dataset with pandas and preview the first rows
import pandas as pd

DATASET_CSV = 'Dataset_Sentimen_Covid_19.csv'
df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,Tweet,Label,Sentimen,Waktu
0,... dapat membuat proses vaksinasi Covid-19 me...,0,Negatif,2020-11-29 09:50:01+00:00
1,'Everything happens for a good reason''. Segal...,1,Positif,2021-01-08 11:38:19+00:00
2,""" bagaimana dampak covid-19 terhadap kenaikan ...",0,Negatif,2020-11-29 06:40:45+00:00
3,"""Di indonesia doang covid 19 paling bertahan""",0,Negatif,2020-11-29 13:40:50+00:00
4,"""Indonesia jangan sampai kena gelombang 2 covi...",0,Negatif,2020-12-31 16:10:09+00:00


In [None]:
# Inspect the column names of the loaded dataset
df.columns.tolist()

['Tweet', 'Label', 'Sentimen', 'Waktu']

In [None]:
# Count the tweets and the number of tweets in each sentiment class
number_of_tweets = df['Tweet'].count()
sentiment_counts = df['Sentimen'].value_counts()

# First the number of non-null tweets, then the per-sentiment breakdown
print(number_of_tweets, sentiment_counts, sep='\n')

1302
Positif    797
Negatif    505
Name: Sentimen, dtype: int64


##Cleansing


In [None]:
#Proses cleansing
import re

def cleansing(text):
  """Strip Twitter-specific noise from a raw tweet.

  Steps, in order: URLs are replaced by a space, then @mentions, #hashtags,
  punctuation/emoticons and runs of 2-9 digits are deleted, and finally every
  run of whitespace is collapsed to a single space.

  Args:
    text: the raw tweet as a string.

  Returns:
    The cleaned string. Single digits are kept, and one leading or trailing
    space may remain because the result is not stripped.
  """
  # Raw string literals keep the regex escapes (\S, \w, \d, \s, \.) away from
  # Python's own string-escape processing, which warns about them otherwise.
  # URLs become a space so the words on either side do not fuse together.
  text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
  # @username mentions
  text = re.sub(r'@\S+', '', text)
  # #hashtags (the whole tag, not just the '#')
  text = re.sub(r'#\S+', '', text)
  # anything that is neither a word character nor whitespace
  text = re.sub(r'[^\w\s]+', '', text)
  # numbers of 2 to 9 digits; runs after punctuation removal, so "covid-19"
  # has already become "covid19" and ends up as "covid"
  text = re.sub(r'\d{2,9}', '', text)
  # collapse whitespace runs left behind by the removals above
  text = re.sub(r'\s+', ' ', text)
  return text

# Run the cleansing step on every raw tweet and preview the new column
df["cleansing"] = df["Tweet"].apply(cleansing)
df.head()

Unnamed: 0,Tweet,Label,Sentimen,Waktu,cleansing
0,... dapat membuat proses vaksinasi Covid-19 me...,0,Negatif,2020-11-29 09:50:01+00:00,dapat membuat proses vaksinasi Covid menjadi ...
1,'Everything happens for a good reason''. Segal...,1,Positif,2021-01-08 11:38:19+00:00,Everything happens for a good reason Segala se...
2,""" bagaimana dampak covid-19 terhadap kenaikan ...",0,Negatif,2020-11-29 06:40:45+00:00,bagaimana dampak covid terhadap kenaikan angk...
3,"""Di indonesia doang covid 19 paling bertahan""",0,Negatif,2020-11-29 13:40:50+00:00,Di indonesia doang covid paling bertahan
4,"""Indonesia jangan sampai kena gelombang 2 covi...",0,Negatif,2020-12-31 16:10:09+00:00,Indonesia jangan sampai kena gelombang 2 covid...


##Case Folding

In [None]:
# Case folding: lower-case the cleansed text and store it in its own column
lowercased_text = df["cleansing"].str.lower()
df["text_lower"] = lowercased_text
df.head()

Unnamed: 0,Tweet,Label,Sentimen,Waktu,cleansing,text_lower
0,... dapat membuat proses vaksinasi Covid-19 me...,0,Negatif,2020-11-29 09:50:01+00:00,dapat membuat proses vaksinasi Covid menjadi ...,dapat membuat proses vaksinasi covid menjadi ...
1,'Everything happens for a good reason''. Segal...,1,Positif,2021-01-08 11:38:19+00:00,Everything happens for a good reason Segala se...,everything happens for a good reason segala se...
2,""" bagaimana dampak covid-19 terhadap kenaikan ...",0,Negatif,2020-11-29 06:40:45+00:00,bagaimana dampak covid terhadap kenaikan angk...,bagaimana dampak covid terhadap kenaikan angk...
3,"""Di indonesia doang covid 19 paling bertahan""",0,Negatif,2020-11-29 13:40:50+00:00,Di indonesia doang covid paling bertahan,di indonesia doang covid paling bertahan
4,"""Indonesia jangan sampai kena gelombang 2 covi...",0,Negatif,2020-12-31 16:10:09+00:00,Indonesia jangan sampai kena gelombang 2 covid...,indonesia jangan sampai kena gelombang 2 covid...


##Tokenizing


In [None]:
import nltk
# Download the 'punkt' tokenizer data package (presumably required by
# nltk.word_tokenize, which the tokenizing step uses)
nltk.download('punkt')

def tokenizing(text):
  """Split `text` into a list of word tokens with NLTK's word tokenizer."""
  return nltk.word_tokenize(text)

# Tokenize every lower-cased tweet into a list of words and preview the result
df["text_tokenize"] = df["text_lower"].apply(tokenizing)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Unnamed: 0,Tweet,Label,Sentimen,Waktu,cleansing,text_lower,text_tokenize
0,... dapat membuat proses vaksinasi Covid-19 me...,0,Negatif,2020-11-29 09:50:01+00:00,dapat membuat proses vaksinasi Covid menjadi ...,dapat membuat proses vaksinasi covid menjadi ...,"[dapat, membuat, proses, vaksinasi, covid, men..."
1,'Everything happens for a good reason''. Segal...,1,Positif,2021-01-08 11:38:19+00:00,Everything happens for a good reason Segala se...,everything happens for a good reason segala se...,"[everything, happens, for, a, good, reason, se..."
2,""" bagaimana dampak covid-19 terhadap kenaikan ...",0,Negatif,2020-11-29 06:40:45+00:00,bagaimana dampak covid terhadap kenaikan angk...,bagaimana dampak covid terhadap kenaikan angk...,"[bagaimana, dampak, covid, terhadap, kenaikan,..."
3,"""Di indonesia doang covid 19 paling bertahan""",0,Negatif,2020-11-29 13:40:50+00:00,Di indonesia doang covid paling bertahan,di indonesia doang covid paling bertahan,"[di, indonesia, doang, covid, paling, bertahan]"
4,"""Indonesia jangan sampai kena gelombang 2 covi...",0,Negatif,2020-12-31 16:10:09+00:00,Indonesia jangan sampai kena gelombang 2 covid...,indonesia jangan sampai kena gelombang 2 covid...,"[indonesia, jangan, sampai, kena, gelombang, 2..."


##Normalization



In [None]:
# Build the lookup used by the normalization step from the colloquial
# Indonesian lexicon: first column -> second column.
# NOTE(review): assumes column 0 holds the slang form and column 1 its formal
# replacement - confirm against the CSV header.
normalization_word = pd.read_csv('colloquial-indonesian-lexicon.csv')
normalization_word_dict = {}

# Select the two columns by position with .iloc; indexing a row Series with a
# bare integer (row[0]) is deprecated in pandas and iterrows is slow.
for slang_term, formal_term in zip(normalization_word.iloc[:, 0],
                                   normalization_word.iloc[:, 1]):
    # setdefault keeps the first mapping when a term is listed more than once
    normalization_word_dict.setdefault(slang_term, formal_term)

def normalization(document):
    """Map each token of `document` through the normalization dictionary.

    Tokens found in `normalization_word_dict` are replaced by their mapped
    value; all other tokens are returned unchanged. Returns a new list.
    """
    lookup = normalization_word_dict.get
    return [lookup(term, term) for term in document]

# Normalize the tokens of every tweet, then join them back into one string so
# the later steps receive plain text again.
from nltk.tokenize.treebank import TreebankWordDetokenizer

df['normalization'] = df['text_tokenize'].apply(
    lambda tokens: TreebankWordDetokenizer().detokenize(normalization(tokens))
)
df.head()

Unnamed: 0,Tweet,Label,Sentimen,Waktu,cleansing,text_lower,text_tokenize,normalization
0,... dapat membuat proses vaksinasi Covid-19 me...,0,Negatif,2020-11-29 09:50:01+00:00,dapat membuat proses vaksinasi Covid menjadi ...,dapat membuat proses vaksinasi covid menjadi ...,"[dapat, membuat, proses, vaksinasi, covid, men...",dapat membuat proses vaksinasi covid menjadi t...
1,'Everything happens for a good reason''. Segal...,1,Positif,2021-01-08 11:38:19+00:00,Everything happens for a good reason Segala se...,everything happens for a good reason segala se...,"[everything, happens, for, a, good, reason, se...",everything happens for a good reason segala se...
2,""" bagaimana dampak covid-19 terhadap kenaikan ...",0,Negatif,2020-11-29 06:40:45+00:00,bagaimana dampak covid terhadap kenaikan angk...,bagaimana dampak covid terhadap kenaikan angk...,"[bagaimana, dampak, covid, terhadap, kenaikan,...",bagaimana dampak covid terhadap kenaikan angka...
3,"""Di indonesia doang covid 19 paling bertahan""",0,Negatif,2020-11-29 13:40:50+00:00,Di indonesia doang covid paling bertahan,di indonesia doang covid paling bertahan,"[di, indonesia, doang, covid, paling, bertahan]",di indonesia doang covid paling bertahan
4,"""Indonesia jangan sampai kena gelombang 2 covi...",0,Negatif,2020-12-31 16:10:09+00:00,Indonesia jangan sampai kena gelombang 2 covid...,indonesia jangan sampai kena gelombang 2 covid...,"[indonesia, jangan, sampai, kena, gelombang, 2...",indonesia jangan sampai kena gelombang 2 covid...


##Filtering

In [None]:
pip install Sastrawi

Collecting Sastrawi
[?25l  Downloading https://files.pythonhosted.org/packages/6f/4b/bab676953da3103003730b8fcdfadbdd20f333d4add10af949dd5c51e6ed/Sastrawi-1.0.1-py2.py3-none-any.whl (209kB)
[K     |█▋                              | 10kB 14.6MB/s eta 0:00:01[K     |███▏                            | 20kB 20.5MB/s eta 0:00:01[K     |████▊                           | 30kB 16.6MB/s eta 0:00:01[K     |██████▎                         | 40kB 14.4MB/s eta 0:00:01[K     |███████▉                        | 51kB 6.4MB/s eta 0:00:01[K     |█████████▍                      | 61kB 6.2MB/s eta 0:00:01[K     |███████████                     | 71kB 6.9MB/s eta 0:00:01[K     |████████████▌                   | 81kB 7.7MB/s eta 0:00:01[K     |██████████████                  | 92kB 8.2MB/s eta 0:00:01[K     |███████████████▋                | 102kB 6.8MB/s eta 0:00:01[K     |█████████████████▏              | 112kB 6.8MB/s eta 0:00:01[K     |██████████████████▊             | 122kB 6.8MB

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Shared Sastrawi stop-word remover; created on the first call to filtering()
# and reused afterwards instead of being rebuilt for every row.
_STOPWORD_REMOVER = None


def filtering(text):
  """Remove stop words from `text` with Sastrawi's stop-word remover.

  Args:
    text: the normalized tweet as a single string.

  Returns:
    Whatever the remover's `remove(text)` returns for that string.
  """
  global _STOPWORD_REMOVER
  if _STOPWORD_REMOVER is None:
    _STOPWORD_REMOVER = StopWordRemoverFactory().create_stop_word_remover()
  return _STOPWORD_REMOVER.remove(text)

# Drop stop words from every normalized tweet and preview the new column
df["text_filtering"] = df["normalization"].apply(filtering)
df.head()

Unnamed: 0,Tweet,Label,Sentimen,Waktu,cleansing,text_lower,text_tokenize,normalization,text_filtering
0,... dapat membuat proses vaksinasi Covid-19 me...,0,Negatif,2020-11-29 09:50:01+00:00,dapat membuat proses vaksinasi Covid menjadi ...,dapat membuat proses vaksinasi covid menjadi ...,"[dapat, membuat, proses, vaksinasi, covid, men...",dapat membuat proses vaksinasi covid menjadi t...,membuat proses vaksinasi covid menjadi pasti t...
1,'Everything happens for a good reason''. Segal...,1,Positif,2021-01-08 11:38:19+00:00,Everything happens for a good reason Segala se...,everything happens for a good reason segala se...,"[everything, happens, for, a, good, reason, se...",everything happens for a good reason segala se...,everything happens for a good reason segala te...
2,""" bagaimana dampak covid-19 terhadap kenaikan ...",0,Negatif,2020-11-29 06:40:45+00:00,bagaimana dampak covid terhadap kenaikan angk...,bagaimana dampak covid terhadap kenaikan angk...,"[bagaimana, dampak, covid, terhadap, kenaikan,...",bagaimana dampak covid terhadap kenaikan angka...,bagaimana dampak covid kenaikan angka kemiskin...
3,"""Di indonesia doang covid 19 paling bertahan""",0,Negatif,2020-11-29 13:40:50+00:00,Di indonesia doang covid paling bertahan,di indonesia doang covid paling bertahan,"[di, indonesia, doang, covid, paling, bertahan]",di indonesia doang covid paling bertahan,indonesia doang covid paling bertahan
4,"""Indonesia jangan sampai kena gelombang 2 covi...",0,Negatif,2020-12-31 16:10:09+00:00,Indonesia jangan sampai kena gelombang 2 covid...,indonesia jangan sampai kena gelombang 2 covid...,"[indonesia, jangan, sampai, kena, gelombang, 2...",indonesia jangan sampai kena gelombang 2 covid...,indonesia jangan kena gelombang 2 covid gelomb...


##Stemming

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Shared Sastrawi stemmer; created on the first call to stemming() and reused
# afterwards instead of being rebuilt for every row.
_STEMMER = None


def stemming(text):
  """Stem `text` with the Sastrawi stemmer.

  Args:
    text: the stop-word-filtered tweet as a single string.

  Returns:
    Whatever the stemmer's `stem(text)` returns for that string.
  """
  global _STEMMER
  if _STEMMER is None:
    _STEMMER = StemmerFactory().create_stemmer()
  return _STEMMER.stem(text)

# Stem every filtered tweet and preview the new column
df["text_stemming"] = df["text_filtering"].apply(stemming)
df.head()

Unnamed: 0,Tweet,Label,Sentimen,Waktu,cleansing,text_lower,text_tokenize,normalization,text_filtering,text_stemming
0,... dapat membuat proses vaksinasi Covid-19 me...,0,Negatif,2020-11-29 09:50:01+00:00,dapat membuat proses vaksinasi Covid menjadi ...,dapat membuat proses vaksinasi covid menjadi ...,"[dapat, membuat, proses, vaksinasi, covid, men...",dapat membuat proses vaksinasi covid menjadi t...,membuat proses vaksinasi covid menjadi pasti t...,buat proses vaksinasi covid jadi pasti tidak j...
1,'Everything happens for a good reason''. Segal...,1,Positif,2021-01-08 11:38:19+00:00,Everything happens for a good reason Segala se...,everything happens for a good reason segala se...,"[everything, happens, for, a, good, reason, se...",everything happens for a good reason segala se...,everything happens for a good reason segala te...,everything happens for a good reason segala ja...
2,""" bagaimana dampak covid-19 terhadap kenaikan ...",0,Negatif,2020-11-29 06:40:45+00:00,bagaimana dampak covid terhadap kenaikan angk...,bagaimana dampak covid terhadap kenaikan angk...,"[bagaimana, dampak, covid, terhadap, kenaikan,...",bagaimana dampak covid terhadap kenaikan angka...,bagaimana dampak covid kenaikan angka kemiskin...,bagaimana dampak covid naik angka miskin indon...
3,"""Di indonesia doang covid 19 paling bertahan""",0,Negatif,2020-11-29 13:40:50+00:00,Di indonesia doang covid paling bertahan,di indonesia doang covid paling bertahan,"[di, indonesia, doang, covid, paling, bertahan]",di indonesia doang covid paling bertahan,indonesia doang covid paling bertahan,indonesia doang covid paling tahan
4,"""Indonesia jangan sampai kena gelombang 2 covi...",0,Negatif,2020-12-31 16:10:09+00:00,Indonesia jangan sampai kena gelombang 2 covid...,indonesia jangan sampai kena gelombang 2 covid...,"[indonesia, jangan, sampai, kena, gelombang, 2...",indonesia jangan sampai kena gelombang 2 covid...,indonesia jangan kena gelombang 2 covid gelomb...,indonesia jangan kena gelombang 2 covid gelomb...


## Feature Extraction (TF-IDF)

In [None]:
# Build the bag-of-words model: one row per tweet, one column per vocabulary term.
# NOTE(review): the vocabulary is fitted on the whole dataset before the
# cross-validation and train/test split further down, so held-out tweets
# influence the features - consider fitting inside a Pipeline instead.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(df['text_stemming']).toarray()
# Select the target by name rather than by position so that adding or
# reordering columns cannot silently change what is used as the label.
y = df['Label'].values

In [None]:
# Re-weight the raw term counts with TF-IDF.
# NOTE: this cell overwrites X, so running it again without re-running the
# bag-of-words cell first would apply the TF-IDF transform a second time.
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
tfidfconverter.fit(X)
X = tfidfconverter.transform(X).toarray()

In [None]:
# Show the feature matrix X produced by the previous cells
print(X)

##Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import svm
# 10-fold cross-validation of a sigmoid-kernel SVM on the feature matrix
svm_cv_settings = dict(kernel='sigmoid', C=0.5, gamma='scale', degree=2)
clf = svm.SVC(**svm_cv_settings)
scores = cross_val_score(estimator=clf, X=X, y=y, cv=10)

In [None]:
# Per-fold scores of the SVM cross-validation, followed by their mean
print(scores)
scores.mean()

[0.70229008 0.75572519 0.82307692 0.82307692 0.81538462 0.76153846
 0.80769231 0.85384615 0.76923077 0.80769231]


0.7919553728714034

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
# 10-fold cross-validation of multinomial Naive Bayes with fit_prior disabled
nb_cv_settings = {'fit_prior': False}
clf2 = MultinomialNB(**nb_cv_settings)
scores2 = cross_val_score(estimator=clf2, X=X, y=y, cv=10)

In [None]:
# Per-fold scores of the Naive Bayes cross-validation, followed by their mean
print(scores2)
scores2.mean()

[0.70992366 0.77862595 0.81538462 0.76923077 0.79230769 0.76153846
 0.80769231 0.83846154 0.77692308 0.76923077]


0.7819318849089841

##GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
# Grid search over the SVC hyper-parameters listed in `param`.
# The estimator is bound to its own name so that the imported `svm` module is
# not overwritten by an SVC instance.
svc_estimator = svm.SVC()
param = {'C': (0.5, 1),
         'kernel':('rbf', 'linear', 'sigmoid', 'poly'),
         'gamma' : ('scale', 'auto'),
         'degree' : (2, 3, 4)
         }
grid = GridSearchCV(svc_estimator, param)
grid.fit(X,y)
# Parameter combination with the best mean cross-validated score
grid.best_params_

{'C': 0.5, 'degree': 2, 'gamma': 'scale', 'kernel': 'sigmoid'}

In [None]:
# Best score found by the grid search fitted in the previous cell
grid.best_score_

0.7819068670792808

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
# Grid search over MultinomialNB's fit_prior flag.
# The candidates must be real booleans: the strings 'False' and 'True' are both
# truthy, so a grid of strings would compare two identical models.
nb = MultinomialNB()
param = {'fit_prior' : (False, True)}
grid = GridSearchCV(nb, param)
grid.fit(X,y)
# Parameter value with the best mean cross-validated score
grid.best_params_

{'fit_prior': 'False'}

In [None]:
# Mean cross-validated score of every candidate in the grid search, followed
# by the best one. (`scores` holds the SVM cross-validation result and does
# not describe this search.)
print(grid.cv_results_['mean_test_score'])
grid.best_score_

[0.70229008 0.75572519 0.82307692 0.82307692 0.81538462 0.76153846
 0.80769231 0.85384615 0.76923077 0.80769231]


0.7089301503094607

##Split Dataset

In [None]:
# Split the features and labels into training and test sets: 40% is held out,
# and the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
split_options = {'test_size': 0.4, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

##SVM


In [None]:
from sklearn import svm

# Train the sigmoid-kernel SVM on the training split, then predict the labels
# of the test split (fit returns the fitted estimator, so the calls chain).
classifierSVM = svm.SVC(kernel='sigmoid', C=0.5, gamma='scale', degree=2)
y_pred_SVM = classifierSVM.fit(X_train, y_train).predict(X_test)

In [None]:
# Evaluate the SVM predictions on the test split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

cm_SVM = confusion_matrix(y_test, y_pred_SVM)
print("confusion matrix :")
print(cm_SVM)

acc = accuracy_score(y_test, y_pred_SVM)
pre = precision_score(y_test, y_pred_SVM)
rec = recall_score(y_test, y_pred_SVM)
f1 = f1_score(y_test, y_pred_SVM)

# Print the metrics in the same order and with the same labels as before
for metric_label, metric_value in (
    ("Accuracy : ", acc),
    ("Precision : ", pre),
    ("Recall : ", rec),
    ("Skor F1: ", f1),
):
    print(metric_label, metric_value)

confusion matrix :
[[136  51]
 [ 49 285]]
Accuracy :  0.8080614203454894
Precision :  0.8482142857142857
Recall :  0.8532934131736527
Skor F1:  0.8507462686567163


In [None]:
# Per-class precision, recall and F1 for the SVM predictions
from sklearn.metrics import classification_report

svm_report = classification_report(y_test, y_pred_SVM)
print(svm_report)

              precision    recall  f1-score   support

           0       0.74      0.73      0.73       187
           1       0.85      0.85      0.85       334

    accuracy                           0.81       521
   macro avg       0.79      0.79      0.79       521
weighted avg       0.81      0.81      0.81       521



##Prediksi SVM

In [None]:
def prediksi_SVM(text):
    """Classify one raw text with the trained SVM.

    The text goes through the same steps as the training data (cleansing,
    lower-casing, tokenizing, normalization, detokenizing, stop-word
    filtering, stemming), is vectorized with the fitted `cv` and
    `tfidfconverter`, and is then passed to `classifierSVM`.

    Prints 'Sentimen Negatif' when the predicted label is 0 and
    'Sentimen Positif' otherwise, and returns the classifier's prediction.
    """
    lowered = cleansing(text).lower()
    tokens = normalization(tokenizing(lowered))
    sentence = TreebankWordDetokenizer().detokenize(tokens)
    stemmed = stemming(filtering(sentence))
    counts = cv.transform([stemmed]).toarray()
    features = tfidfconverter.transform(counts).toarray()
    prediction = classifierSVM.predict(features)
    print('Sentimen Negatif' if prediction[0] == 0 else 'Sentimen Positif')
    return prediction

In [None]:
# Example: classify one short text with the SVM and keep the returned prediction
SVM = prediksi_SVM("korupsi bansos")

Sentimen Negatif


##Naive Bayes


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes (fit_prior disabled) on the training split,
# then predict the labels of the test split (fit returns the fitted
# estimator, so the calls chain).
classifierNB = MultinomialNB(fit_prior=False)
y_pred_NB = classifierNB.fit(X_train, y_train).predict(X_test)

In [None]:
# Evaluate the Naive Bayes predictions on the test split
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

cm_NB = confusion_matrix(y_test, y_pred_NB)
print("confusion matrix :")
print(cm_NB)

acc = accuracy_score(y_test, y_pred_NB)
pre = precision_score(y_test, y_pred_NB)
rec = recall_score(y_test, y_pred_NB)
f1 = f1_score(y_test, y_pred_NB)

# Print the metrics in the same order and with the same labels as before
for metric_label, metric_value in (
    ("Accuracy : ", acc),
    ("Precision : ", pre),
    ("Recall : ", rec),
    ("Skor F1: ", f1),
):
    print(metric_label, metric_value)

confusion matrix :
[[119  68]
 [ 37 297]]
Accuracy :  0.7984644913627639
Precision :  0.8136986301369863
Recall :  0.8892215568862275
Skor F1:  0.8497854077253219


In [None]:
# Per-class precision, recall and F1 for the Naive Bayes predictions
from sklearn.metrics import classification_report

nb_report = classification_report(y_test, y_pred_NB)
print(nb_report)

              precision    recall  f1-score   support

           0       0.76      0.64      0.69       187
           1       0.81      0.89      0.85       334

    accuracy                           0.80       521
   macro avg       0.79      0.76      0.77       521
weighted avg       0.80      0.80      0.79       521



##Prediksi Naive Bayes

In [None]:
def prediksi_NB(text):
    """Classify one raw text with the trained Naive Bayes model.

    The text goes through the same steps as the training data (cleansing,
    lower-casing, tokenizing, normalization, detokenizing, stop-word
    filtering, stemming), is vectorized with the fitted `cv` and
    `tfidfconverter`, and is then passed to `classifierNB`.

    Prints 'Sentimen Negatif' when the predicted label is 0 and
    'Sentimen Positif' otherwise, and returns the classifier's prediction.
    """
    lowered = cleansing(text).lower()
    tokens = normalization(tokenizing(lowered))
    sentence = TreebankWordDetokenizer().detokenize(tokens)
    stemmed = stemming(filtering(sentence))
    counts = cv.transform([stemmed]).toarray()
    features = tfidfconverter.transform(counts).toarray()
    prediction = classifierNB.predict(features)
    print('Sentimen Negatif' if prediction[0] == 0 else 'Sentimen Positif')
    return prediction

In [None]:
# Example: classify one short text with Naive Bayes and keep the returned prediction
Naive_Bayes = prediksi_NB("indonesia vaksin gratis")

Sentimen Positif
