In [1]:
#Membaca data menggunakan pandas
#import library
import pandas as pd
import numpy as np
import nltk
import Sastrawi
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from nltk.tag import CRFTagger
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
raw_data = pd.read_csv('dataset/train_set.csv', encoding = "Latin-1")
print(raw_data.shape)
raw_data.head()

(3462, 3)


Unnamed: 0,id,sentimen,tweet
0,1,1,oks kak semangat ya kalian kalian
1,2,0,sekarang harus kaya orang bodoh lagi bodoh sangat
2,3,1,"Begitu diumumkan lulus 100%, mereka semua suju..."
3,4,0,[USERNAME] [USERNAME] Katanya Bapak Reformasi ...
4,5,0,macet macetan perut kosong akhirnya mampir dah...


In [2]:
print ("Column Data Type Before Transformation \n", raw_data.dtypes)

Column Data Type Before Transformation 
 id           int64
sentimen     int64
tweet       object
dtype: object


In [3]:
# Fungsi Normalisasi

def normalisasi(tweet):
    normal_tw = tweet.lower() #lowercase
    normal_tw = re.sub('\s+', ' ', normal_tw) # remove extra space
    normal_tw = normal_tw.strip() #trim depan belakang
    normal_tw = re.sub(r'[^\w\s]','',normal_tw) #buang punctuation
    normal_regex = re.compile(r"(.)\1{1,}", re.IGNORECASE) #regex huruf yang berulang kaya haiiii (untuk fitur unigram)
    normal_tw = normal_regex.sub(r"\1\1", normal_tw) #buang huruf yang berulang
    return normal_tw

In [4]:
# Fungsi Remove Stopwords

def remove_stopwords(tweet):
    stopwords = pd.read_csv("dataset/stopwords.csv")
    special_list = ['username', 'url', 'sensitive-no']
    token = nltk.word_tokenize(tweet)
    token_afterremoval = []
    for k in token:
        if k not in stopwords and k not in special_list:
            token_afterremoval.append(k)
    
    str_clean = ' '.join(token_afterremoval)
    return str_clean

In [5]:
# Fungsi Stemming

def Stemming(tweet):
    token = nltk.word_tokenize(tweet)
    stem_kalimat = []
    for k in token:
        factory = StemmerFactory()
        stemmer = factory.create_stemmer()
        stem_kata = stemmer.stem(clean_tw)
        stem_kalimat.append(stem_kata)
        
    stem_kalimat_str = ' '.join(stem_kalimat)

In [6]:
# Prapemrosesan Data

def pra_pemrosesan(list_tweet):
    tweet_clean = []
    for tw in list_tweet:
        normal_tweet = normalisasi(tw)
        nosw_tweet = remove_stopwords(normal_tweet)
        #stem_tweet = Stemming(nosw_tweet)
        tweet_clean.append(nosw_tweet)
    return tweet_clean

raw_tweet = raw_data['tweet']
label = raw_data['sentimen'].tolist()

clean_tweet = pra_pemrosesan(raw_tweet)
clean_tweet[:3]

['oks kak semangat ya kalian kalian',
 'sekarang harus kaya orang bodoh lagi bodoh sangat',
 'begitu diumumkan lulus 100 mereka semua sujud syukur dan langsung mengambil bungasaat dia menghampiri langsung memeluk menciumku air mata tak kuasa kubendungmom this is my birthday present for u']

In [7]:
# Ekstraksi Bag Of Word

def EkstraksiBoW(tweet):
    unigram = CountVectorizer(ngram_range=(1,1))
    unigram_matrix = unigram.fit_transform(np.array(tweet)).todense()
    nama_fitur = unigram.get_feature_names()
    return unigram_matrix, nama_fitur, unigram

unigram_feat, feat_name, unigram_used = EkstraksiBoW(clean_tweet)
print(unigram_feat[:3])
print(feat_name[:10])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
['00', '0015', '011060039617518', '013150189591518', '02', '0217280858728086', '029', '06', '06062018', '0630']


In [8]:
# Leksikon

def EkstraksiSentimen(list_tweet):
    pos = pd.read_csv("dataset/positif_vania.txt", header=None, names=['pos'])
    list_pos = pos['pos'].tolist()
    neg = pd.read_csv("dataset/negatif_vania.txt", header=None, names=['neg'])
    list_neg = neg['neg'].tolist()
    
    fitur_sentimen_all = []
    for tweet in list_tweet:
        ##inisiasi value
        emosi = ["positif", "negatif"]
        value = [0,0]
        emosi_value = {}
        for i in range(len(emosi)):
            emosi_value[emosi[i]] = value[i]
        list_kata = tweet.split()
        for k in list_kata:
            if k in list_pos:
                emosi_value["positif"] += 1
            if k in list_neg:
                emosi_value["negatif"] += 1
        
        
        fitur_sentimen_perkalimat = list(emosi_value.values())
        fitur_sentimen_all.append(fitur_sentimen_perkalimat)
        
    return fitur_sentimen_all

sentlex_feat = EkstraksiSentimen(clean_tweet)
print(sentlex_feat[:10])

[[1, 0], [0, 3], [1, 0], [2, 0], [0, 2], [0, 2], [0, 0], [2, 0], [1, 0], [0, 0]]


In [9]:
# POSTag

def EkstraksiPOS(list_tweet):
    ct = CRFTagger()
    ct.set_model_file("dataset/all_indo_man_tag_corpus_model.crf.tagger")
    pos_feat_list = []
    count_tag = []
    for tweet in list_tweet: 
        token = nltk.word_tokenize(tweet)
        tag = ct.tag_sents([token])
        flat_tag = [item for sublist in tag for item in sublist]
        pos_count = Counter([j for i,j in flat_tag])
        pos_feat = (pos_count['JJ'], pos_count['NEG'])
        pos_feat_list.append(pos_feat)
    return pos_feat_list

postag_feat = EkstraksiPOS(clean_tweet)
print(postag_feat[:3]) 

[(0, 0), (1, 0), (0, 1)]


In [10]:
# Ortografi

def EkstraksiOrtografi(raw_tweet):
    all_orto_feat = []
    for tw in raw_tweet:
        capital_count = sum(1 for c in tw if c.isupper())
        exclamation_count = sum((1 for c in tw if c == "!"))
        word_len = len(nltk.word_tokenize(tw))
        char_len = len(tw)
        orto_feat = [capital_count, exclamation_count, word_len, char_len]
        all_orto_feat.append(orto_feat)
    return all_orto_feat

orto_feat = EkstraksiOrtografi(raw_tweet)
orto_feat[:3] 

[[0, 0, 6, 33], [0, 0, 8, 49], [4, 0, 46, 219]]

In [28]:
# Klasifikasi

feat_list = [unigram_feat, sentlex_feat, postag_feat, orto_feat]
feat_name = ["Unigram", "Sentimen", "POS", "Ortografi"]
for f, n in zip(feat_list, feat_name):
    X = f
    y = label
    scoring = ['accuracy', 'f1_macro']
    nb = MultinomialNB()
    scores = cross_validate(nb, X, y, cv=10, scoring=scoring)
    acc = np.mean(scores['test_accuracy'])
    f1 = np.mean(scores['test_f1_macro'])
    print("Jenis Fitur : ", n)
    print("Akurasi :", acc)
    print("F1-Score :", f1)
    print("---------------")

Jenis Fitur :  Unigram
Akurasi : 0.8573103896320233
F1-Score : 0.8572418531351632
---------------
Jenis Fitur :  Sentimen
Akurasi : 0.792892838700005
F1-Score : 0.7916838705373135
---------------
Jenis Fitur :  POS
Akurasi : 0.4939323016441506
F1-Score : 0.4930192614836379
---------------
Jenis Fitur :  Ortografi
Akurasi : 0.5213864503339941
F1-Score : 0.49134936369412757
---------------


In [11]:
# Leksikon Koto

def EkstraksiSentimenKoto(list_tweet):
    pos = pd.read_csv("dataset/positive_koto.tsv", delimiter='\t', index_col=False, header=None, names=['pos', 'sentimen'])
    list_pos = pos['pos'].tolist()
    list_sent_pos = pos['sentimen'].tolist()
    dicti_pos = dict(zip(list_pos, list_sent_pos))
    
    neg = pd.read_csv("dataset/negative_koto.tsv", delimiter='\t', index_col=False, header=None, names=['neg', 'sentimen'])
    list_neg = neg['neg'].tolist()
    list_sent_neg = neg['sentimen'].tolist()
    dicti_neg = dict(zip(list_neg, list_sent_neg))
    
    fitur_sentimen_all = []
    for tweet in list_tweet:
        ##inisiasi value
        emosi = ["positif", "negatif"]
        value = [0,0]
        emosi_value = {}
        for i in range(len(emosi)):
            emosi_value[emosi[i]] = value[i]
        list_kata = tweet.split()
        for k in list_kata:
            if k in dicti_pos.keys():
                emosi_value["positif"] += dicti_pos[k]
            if k in dicti_neg.keys():
                emosi_value["negatif"] += (-1 * dicti_neg[k])
        
        
        fitur_sentimen_perkalimat = list(emosi_value.values())
        fitur_sentimen_all.append(fitur_sentimen_perkalimat)
        
    return fitur_sentimen_all

sentlex_koto_feat = EkstraksiSentimenKoto(clean_tweet)
print(sentlex_koto_feat[:10])

[[8, 1], [5, 15], [27, 10], [5, 42], [7, 8], [13, 20], [11, 13], [19, 14], [32, 19], [5, 21]]


In [46]:
# Klasifikasi Logistic Regression dan Decision Tree
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

models = {}
models['Logistic Regression'] = LogisticRegression()
models['Decision Tree'] = DecisionTreeClassifier()
models['Multinomial Naive Bayes'] = MultinomialNB()
trainedModels = {}

feat_list = [unigram_feat, sentlex_feat, postag_feat, orto_feat, sentlex_koto_feat]
feat_name = ["Unigram", "Sentimen", "POS", "Ortografi", "Sentimen Koto"]
zip_feat = zip(feat_list, feat_name)
for f, n in zip_feat:
    X = f
    y = label
    scoring = ['accuracy', 'f1_macro']
    for i in models.keys():
        model = models[i]
        scores = cross_validate(model, X, y, cv=10, scoring=scoring)
        acc = np.mean(scores['test_accuracy'])
        f1 = np.mean(scores['test_f1_macro'])
        print("Jenis Model : ", i)
        print("Jenis Fitur : ", n)
        print("Akurasi :", acc)
        print("F1-Score :", f1)
        print("---------------")

Jenis Model :  Logistic Regression
Jenis Fitur :  Unigram
Akurasi : 0.8457455314754043
F1-Score : 0.8456257675854106
---------------
Jenis Model :  Decision Tree
Jenis Fitur :  Unigram
Akurasi : 0.7634472189368825
F1-Score : 0.7633470199669499
---------------
Jenis Model :  Multinomial Naive Bayes
Jenis Fitur :  Unigram
Akurasi : 0.8472022788226082
F1-Score : 0.8471433455046606
---------------
Jenis Model :  Logistic Regression
Jenis Fitur :  Sentimen
Akurasi : 0.792892838700005
F1-Score : 0.7916838705373135
---------------
Jenis Model :  Decision Tree
Jenis Fitur :  Sentimen
Akurasi : 0.7900034981926005
F1-Score : 0.7890882123922349
---------------
Jenis Model :  Multinomial Naive Bayes
Jenis Fitur :  Sentimen
Akurasi : 0.792892838700005
F1-Score : 0.7916838705373135
---------------
Jenis Model :  Logistic Regression
Jenis Fitur :  POS
Akurasi : 0.5242582998783962
F1-Score : 0.5157061608164644
---------------
Jenis Model :  Decision Tree
Jenis Fitur :  POS
Akurasi : 0.5216521463910313

In [12]:
# Klasifikasi Logistic Regression dan Decision Tree
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
test_data = pd.read_csv('dataset/test_set.csv', encoding = "Latin-1")
print(test_data.shape)

(8000, 2)


In [13]:
logResModel = LogisticRegression()
logResModel.fit(unigram_feat, label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [14]:
clean_test_tweet = pra_pemrosesan(test_data['tweet'])
unigram_feat_test = unigram_used.transform(clean_test_tweet)
unigram_res = logResModel.predict(unigram_feat_test)
df_unigram = test_data[['test_ID']].copy()
df_unigram['pred'] = unigram_res
df_unigram.to_csv('first_test_pred_unixx1.csv', sep=',', encoding='utf-8', index=False, header=False)

In [15]:
MNBUni = MultinomialNB()
MNBUni.fit(unigram_feat, label)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
unigram_res_MNB = MNBUni.predict(unigram_feat_test)
df_unigram_MNB = test_data[['test_ID']].copy()
df_unigram_MNB['pred'] = unigram_res_MNB
df_unigram_MNB.to_csv('first_test_pred_uni3_MNBxx1.csv', sep=',', encoding='utf-8', index=False, header=False)
df_unigram_MNB

Unnamed: 0,test_ID,pred
0,0,1
1,1,0
2,2,1
3,3,1
4,4,1
5,5,1
6,6,1
7,7,1
8,8,0
9,9,1


In [17]:
logResModelNew = LogisticRegression()
logResModelNew.fit(sentlex_feat, label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
sentlex_feat_test = EkstraksiSentimen(clean_test_tweet)
sentiment_res = logResModelNew.predict(sentlex_feat_test)
df_sentiment = test_data[['test_ID']].copy()
df_sentiment['pred'] = sentiment_res
df_sentiment.to_csv('first_test_pred_sentimentxx1.csv', sep=',', encoding='utf-8', index=False, header=False)
df_sentiment

Unnamed: 0,test_ID,pred
0,0,1
1,1,0
2,2,1
3,3,1
4,4,1
5,5,1
6,6,1
7,7,1
8,8,1
9,9,1
