In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.probability import FreqDist
from nltk.corpus import gutenberg
from nltk.corpus import wordnet
import string

In [43]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('gutenberg')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [3]:
def split_string(document) :
    words = word_tokenize(document)
    return words

def remove_stopwords(words) :
    ENGLISH_STOPWORDS = stopwords.words('english')
    words = [word for word in words if word.lower() not in ENGLISH_STOPWORDS]
    return words

def remove_punctuation(words) :
    words = [word for word in words if word not in string.punctuation]
    return words

def remove_br(words) :
    words = [word for word in words if word != "br"]
    words = [word for word in words if word != "</br >"]
    return words

def stem_document(words) :
    stemmer = SnowballStemmer('english')
    words_return = []
    for word in words:
        stemmed = stemmer.stem(word)
        words_return.append(stemmed)

    return words_return

def tag_words(words) :
    tagged = pos_tag(words)
    return tagged

def NER_words(tagged_words) :
    ner = ne_chunk(tagged_words)
    ner.draw()

def frequence_distribution(document) :
    fd = FreqDist(document)
    return fd

def remove_alpha(words) :
    words = [word for word in words if word.isalpha()]
    return words

def words_to_adjective(words) :
    wnl = WordNetLemmatizer()
    words_return = []
    for word in words :
        adjective = wnl.lemmatize(word, pos='r')
        words_return.append(adjective)
    return words_return

def get_pos_for_word_net(part_of_speech) :
    if part_of_speech.startswith('J'):
        return wordnet.ADJ
    elif part_of_speech.startswith('V'):
        return wordnet.VERB
    elif part_of_speech.startswith('N'):
        return wordnet.NOUN
    elif part_of_speech.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_words(words) :
    words_return = []
    pos = pos_tag(words)
    wnl = WordNetLemmatizer()
    for word, part_of_speech in pos :
        current_word = wnl.lemmatize(word, pos=get_pos_for_word_net(part_of_speech))
        words_return.append(current_word)
    return words_return

def preprocess_sentence(sentence) :
    words = split_string(sentence)
    words = remove_punctuation(words)
    words = remove_stopwords(words)
    words = remove_br(words)
    words = remove_alpha(words)
    words = lemmatize_words(words)


    return words

In [35]:
# corpora
class Corpora :
    def __init__ (self) :
        self.GUTENBERG_RAW_FILE = 'bible-kjv.txt'
        self.corpora = gutenberg.raw(self.GUTENBERG_RAW_FILE)
        self.words = split_string(self.corpora)
        self.preprocess_words()

    def preprocess_words(self) :
        self.words = remove_stopwords(self.words)
        self.words = remove_punctuation(self.words)

    def get_synset_definitions(self, definitionAmount=5) :
        currentAmount = 0
        for word in self.words :
            if(currentAmount >= definitionAmount) :
                return
            synsets = wordnet.synsets(word)
            for synset in synsets:
                print(f"{synset} : {synset.definition()}")
                for lemma in synset.lemmas() :
                    print(f"The Sysnonims : {lemma.name()}")
                    for antonym in lemma.antonyms():
                        print(f"Antonym: {antonym.name()}")
            currentAmount += 1

    def get_pos_tag(self, tagAmmount=5) :
        tagged = tag_words(self.words)
        currentAmount = 0
        for first, second in tagged :
            if(currentAmount >= tagAmmount) :
                return
            print(f"Word : {first}")
            print(f"Tag : {second}")
            currentAmount += 1

    def get_name_entity_information(self) :
        NER_words(self.words)

    def get_frequency_distribution(self) :
        freqDist = frequence_distribution(self.words)
        print(f'100 Most Commond Words :  {freqDist.most_common(100)}')

    def get_stem(self) :
        stem_document(self.words)


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd

dataset = pd.read_csv("./drive/MyDrive/IMDB Dataset.csv")

dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [7]:
reviews = dataset['review'][:500]
sentiments = dataset['sentiment'][:500]

list_words = []
labeled_sentence = []

for review, sentiment in zip(reviews, sentiments) :
    words = preprocess_sentence(review)
    list_words = list_words + words
    labeled_sentence.append((review, sentiment))


In [8]:
print(list_words)



In [9]:
import random
random.shuffle(labeled_sentence)
training_dataset = []
for sentence, label in labeled_sentence:
  dictionary = {}
  words = preprocess_sentence(sentence)
  for feature in list_words:
    dictionary[feature] = feature in words
  training_dataset.append((dictionary, label))


In [10]:
counter = int(len(training_dataset) * 0.8)
random.shuffle(training_dataset)
testing_dataset = training_dataset[counter:]
training_dataset = training_dataset[:counter]

In [11]:
from nltk.classify import NaiveBayesClassifier, accuracy

print(training_dataset)
classifier = NaiveBayesClassifier.train(training_dataset)
accuracy(classifier, testing_dataset)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



0.73

In [14]:
import pickle
FILE_NAME = 'model.pickle'

In [16]:
file = open(FILE_NAME, 'wb')

pickle.dump(classifier, file)

file.close()

In [17]:
file = open(FILE_NAME, 'rb')
loaded_classifier = pickle.load(file)
file.close()

In [20]:
print(accuracy(loaded_classifier, testing_data))

0.73


In [37]:
def corpora_menu():
    c.get_pos_tag()
    c.get_synset_definitions()
    c.get_name_entity_information()
    c.get_frequency_distribution()

def naive_bayes_menu() :
    review = input("Input IMDB Review : ")
    words = preprocess_sentence(review)
    result = loaded_classifier.classify(FreqDist(words))
    print(result)

In [45]:
while(True) :
    c = Corpora()
    print("1. Corpora Examples")
    print("2. Naive Bayes")
    print(">> ", end='')
    choice = input()

    if choice == '1' :
        corpora_menu()
    elif choice == '2' :
        naive_bayes_menu()
    else :
        break

1. Corpora Examples
2. Naive Bayes
>> 2
Input IMDB Review : Good
positive


KeyboardInterrupt: 