## Text preprocessing

In [0]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
import random
import pandas as pd

In [2]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
print(documents[1])

(['on', 'april', '12th', ',', '1912', ',', 'the', 'most', 'astonishing', 'shipwreck', 'in', 'the', 'history', 'of', 'the', 'world', 'occurred', '.', 'on', 'that', 'fateful', 'night', ',', 'the', 'titanic', 'sunk', '.', 'now', ',', 'more', 'than', '50', 'years', 'later', ',', 'a', 'film', 'has', 'been', 'made', '.', 'and', 'what', 'a', 'film', 'that', 'is', '.', 'james', 'cameron', "'", 's', 'newest', 'movie', 'is', 'a', 'landmark', 'in', 'storytelling', ',', 'emotion', ',', 'and', 'special', 'effects', '.', 'it', 'starts', 'in', 'the', 'present', ',', 'where', 'bill', 'paxton', 'and', 'his', 'band', 'of', 'scientists', 'explore', 'the', 'depths', 'of', 'the', 'ship', "'", 's', 'wreckage', '.', 'exploring', 'an', 'old', 'chest', ',', 'paxton', 'comes', 'across', 'a', 'nude', 'drawing', 'of', 'a', 'young', 'woman', '.', 'the', 'drawing', 'is', 'televised', 'and', 'the', 'woman', '(', 'gloria', 'stuart', ')', 'whose', 'portrait', 'the', 'painting', 'is', 'of', 'comes', 'forward', 'and', '

In [3]:
all_words = []
for w in movie_reviews.words():
  all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(all_words['stupid'])

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
253


## Word as feature

In [4]:
word_features = list(all_words.keys())[:3000]
print(word_features)  



In [0]:
def find_features(document):
  words = set(document)
  features = {}
  for w in word_features:
    features[w] = (w in words)
  return features

In [6]:
feature_sets = [(find_features(rev), category) for (rev, category) in documents]
print(feature_sets[1])
print(len(feature_sets))

2000


## Naive bayes classification & Saving Model

In [7]:
training_set = feature_sets[:1900]
testing_set = feature_sets[1900:]
print("Training set:", len(training_set), "\nTesting set:", len(testing_set))

Training set: 1900 
Testing set: 100


In [51]:
import pickle
try:
  with open('/content/drive/My Drive/Colab Notebooks/github/text-classification-naive-bayes/training/naive-bayes-classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)
    print('Pickle Loaded...')
except:
  classifier = nltk.NaiveBayesClassifier.train(training_set)
  with open('naive-bayes-classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)
print("Classifier accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)

Pickle Loaded...
Classifier accuracy percent:  83.0


In [52]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =      9.9 : 1.0
                  annual = True              pos : neg    =      8.9 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                 frances = True              pos : neg    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      7.3 : 1.0
              schumacher = True              neg : pos    =      7.1 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
               atrocious = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
                  regard = True              pos : neg    =      6.9 : 1.0
                  turkey = True              neg : pos    =      6.8 : 1.0

## SciKit Learn Incorporation

In [0]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [0]:
def run_classifier(name, classifier):
  nb_classifier = SklearnClassifier(classifier)
  nb_classifier.train(training_set)
  print(name, "accuracy percent:", (nltk.classify.accuracy(nb_classifier, testing_set)) * 100)

#### Multinomial Naive Bayes Classifier

In [55]:
MNB_classifier = run_classifier('MNB_classifier', MultinomialNB())

MNB_classifier accuracy percent: 82.0


#### Bernoulli Naive Bayes Classfier

In [56]:
BNB_classifier = run_classifier('BNB_classifier', MultinomialNB())

BNB_classifier accuracy percent: 82.0


#### Gaussian Naive Bayes Classfier

In [57]:
  GNB_classifier = run_classifier('GNB_classifier', MultinomialNB())

GNB_classifier accuracy percent: 82.0


#### LogisticRegression, SGDClassifier - Naive Bayes Classifier

In [59]:
LRNB_classifier = run_classifier('LRNB_classifier', LogisticRegression())
SGDNB_classifier = run_classifier('SGDNB_classifier', SGDClassifier())

LRNB_classifier accuracy percent: 83.0
SGDNB_classifier accuracy percent: 83.0


#### LinearSVC, SVC, NuSVC - Naive Bayes Classifier

In [60]:
LSVCNB_classfier = run_classifier('LSVCNB_classfier', LinearSVC())
SVCNB_classfier = run_classifier('SVCNB_classfier', SVC())
NuSVCNB_classfier = run_classifier('NuSVCNB_classfier', NuSVC())

LSVCNB_classfier accuracy percent: 80.0
SVCNB_classfier accuracy percent: 86.0
NuSVCNB_classfier accuracy percent: 87.0
