In [548]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# 0. Read the data

In [549]:
positive = open("../positive.txt", "r").read()
negative = open("../negative.txt", "r").read()

# 1. Text-Preprocessing

Tokenizing

In [550]:
from nltk.tokenize import word_tokenize, sent_tokenize

list_words = word_tokenize(positive) + word_tokenize(negative)

Words cleaning (stopwords, stemming, POS tagging)

In [551]:
from nltk.corpus import stopwords
from string import punctuation

eng_stopwords = stopwords.words('english')

list_words = [word for word in list_words if word not in eng_stopwords]
list_words = [word for word in list_words if word not in punctuation]
list_words = [word for word in list_words if word.isalpha()]

pos tagging for NER

In [552]:
from nltk.tag import pos_tag
tagged = pos_tag(list_words) # used for ner

In [553]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

list_words = [wordnet_lemmatizer.lemmatize(word) for word in list_words]

In [554]:
from nltk.chunk import ne_chunk
ner = ne_chunk(tagged)
# ner.draw() # uncomment to see the tree

# 2. Pemanfaatan wordnet, load corpora, dsb (gaada hubungan sama AI Model)

In [555]:
from nltk.corpus import gutenberg
emma = gutenberg.raw('austen-emma.txt')

wordnet

In [556]:
from nltk.corpus import wordnet

for word in list_words:
    synsets = wordnet.synsets(word)
    if synsets:
        print(f"{word}: {synsets[0].definition()}")

watch: a small portable timepiece
really: in accordance with truth or fact or reality
good: benefit
durability: permanence by virtue of the power to resist stress or force
battery: group of guns or missile launchers operated together at one place
life: a characteristic state or mode of living
amazing: affect with wonder
I: a nonmetallic element belonging to the halogens; used especially in medicine and photography and in dyes; occurs naturally only in combination in small quantities (as in sea water or rocks)
love: a strong positive emotion of regard and affection
clarity: free from obscurity and easy to understand; the comprehensibility of clear expression
laptop: a portable computer small enough to use in your lap
screen: a white or silvered surface where pictures can be projected for viewing
sound: the particular auditory effect produced by a given cause
quality: an essential and distinguishing attribute of something or someone; --Shakespeare
headphone: electro-acoustic transducer f

# 3. Memperkecil range

In [557]:
from nltk.probability import FreqDist
fdist = FreqDist(list_words)

list_words = [word for word, count in fdist.most_common(10000)]

# 4. Labeling

In [558]:
labeled_sentences = [] 
for sentence in positive.split("\n"):
    labeled_sentences.append((sentence, "positive"))

for sentence in negative.split("\n"):
    if len(sentence) == 1:
        continue
    labeled_sentences.append((sentence, "negative"))

# 5. Dataset making

In [559]:
dataset = []
for sentence, label in labeled_sentences:
    dict = {}
    words = word_tokenize(sentence)
    for feature in list_words:
        key = feature
        value = feature in words
        dict[key] = value
    dataset.append((dict, label))

import random
random.shuffle(dataset)

train_set = dataset[:int(len(dataset) * 0.8)]
test_set = dataset[int(len(dataset) * 0.2):]

# 6. Import Naive Bayes Algorithms

In [560]:
from nltk.classify import NaiveBayesClassifier, accuracy
classifier = NaiveBayesClassifier.train(train_set)
accuracy(classifier, test_set)

0.9011494252873563

# 7. Move to pickle

In [561]:
import pickle
file = open("classifier.pickle", "wb")
pickle.dump(classifier, file)

# 8. Show most informative features

In [562]:
classifier.show_most_informative_features(5)

Most Informative Features
                    hard = True           negati : positi =      4.3 : 1.0
                 durable = True           positi : negati =      3.6 : 1.0
              everything = True           positi : negati =      3.6 : 1.0
                    fast = True           positi : negati =      3.4 : 1.0
                 tracker = True           negati : positi =      3.1 : 1.0


# 9. Input Test

In [563]:
review = input()
words = word_tokenize(review)
print(classifier.classify(FreqDist(words)))

negative
