## Text preprocessing

In [1]:
import nltk
import random
import pandas as pd
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
short_pos = open(r'/content/drive/My Drive/Colab Notebooks/github/text-classification-naive-bayes/data/positive.txt', encoding="latin-1").read()
short_neg = open(r'/content/drive/My Drive/Colab Notebooks/github/text-classification-naive-bayes/data/negative.txt', encoding="latin-1").read()

documents = []

for r in short_pos.split('\n'):
  documents.append((r, 'pos'))

for r in short_neg.split('\n'):
  documents.append((r, 'neg'))

all_words = []
short_pos_words = nltk.word_tokenize(short_pos)
short_neg_words = nltk.word_tokenize(short_neg)

for w in short_pos_words:
  all_words.append(w.lower())

for w in short_neg_words:
  all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[('.', 14010), ('the', 10113), (',', 10037), ('a', 7307), ('and', 6202), ('of', 6063), ('to', 4234), ('is', 3559), ("'s", 3537), ('it', 3422), ('that', 2657), ('in', 2630), ('as', 1803), ('but', 1641), ('film', 1590)]


## Word as feature

In [3]:
word_features = list(all_words.keys())[:5000]
print(word_features[:20])  

['the', 'rock', 'is', 'destined', 'to', 'be', '21st', 'century', "'s", 'new', '``', 'conan', 'and', 'that', 'he', 'going', 'make', 'a', 'splash', 'even']


In [0]:
def find_features(document):
  words = set(nltk.word_tokenize(document))
  features = {}
  for w in word_features:
    features[w] = (w in words)
  return features

In [5]:
feature_sets = [(find_features(rev), category) for (rev, category) in documents]
random.shuffle(feature_sets)
print(feature_sets[1])
print(len(feature_sets))

({'the': True, 'rock': False, 'is': True, 'destined': False, 'to': False, 'be': False, '21st': False, 'century': False, "'s": False, 'new': False, '``': True, 'conan': False, 'and': False, 'that': False, 'he': False, 'going': False, 'make': False, 'a': True, 'splash': False, 'even': False, 'greater': False, 'than': False, 'arnold': False, 'schwarzenegger': False, ',': True, 'jean-claud': False, 'van': False, 'damme': False, 'or': False, 'steven': False, 'segal': False, '.': True, 'gorgeously': False, 'elaborate': False, 'continuation': False, 'of': True, 'lord': False, 'rings': False, 'trilogy': False, 'so': False, 'huge': False, 'column': False, 'words': False, 'can': False, 'not': False, 'adequately': False, 'describe': False, 'co-writer/director': False, 'peter': False, 'jackson': False, 'expanded': False, 'vision': False, 'j': False, 'r': False, 'tolkien': False, 'middle-earth': False, 'effective': False, 'but': True, 'too-tepid': False, 'biopic': False, 'if': False, 'you': False, 

## Naive bayes classification & Saving Model

In [6]:
training_set = feature_sets[:3000]
testing_set = feature_sets[10000:]
print("Training set:", len(training_set), "\nTesting set:", len(testing_set))

Training set: 3000 
Testing set: 664


In [7]:
import pickle
try:
  with open('/content/drive/My Drive/Colab Notebooks/github/text-classification-naive-bayes/training/naive-bayes-classifier-new.pickle', 'rb') as f:
    classifier = pickle.load(f)
    print('Pickle Loaded...')
except:
  classifier = nltk.NaiveBayesClassifier.train(training_set)
  with open('naive-bayes-classifier-new.pickle', 'wb') as f:
    pickle.dump(classifier, f)
print("Classifier accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)

Pickle Loaded...
Classifier accuracy percent:  82.53012048192771


In [8]:
classifier.show_most_informative_features(15)

Most Informative Features
              engrossing = True              pos : neg    =     21.0 : 1.0
               inventive = True              pos : neg    =     15.7 : 1.0
                    warm = True              pos : neg    =     12.2 : 1.0
               wonderful = True              pos : neg    =     11.8 : 1.0
            refreshingly = True              pos : neg    =     11.7 : 1.0
                powerful = True              pos : neg    =     11.6 : 1.0
                    ages = True              pos : neg    =     11.0 : 1.0
                provides = True              pos : neg    =     10.6 : 1.0
                touching = True              pos : neg    =     10.4 : 1.0
               realistic = True              pos : neg    =     10.3 : 1.0
                  unless = True              neg : pos    =     10.3 : 1.0
                captures = True              pos : neg    =      9.8 : 1.0
                  stupid = True              neg : pos    =      9.8 : 1.0

## SciKit Learn Incorporation

In [0]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [0]:
def run_classifier(name, classifier):
  nb_classifier = SklearnClassifier(classifier)
  nb_classifier.train(training_set)
  print(name, "accuracy percent:", (nltk.classify.accuracy(nb_classifier, testing_set)) * 100)
  return nb_classifier

#### Multinomial Naive Bayes Classifier

In [11]:
MNB_classifier = run_classifier('MNB_classifier', MultinomialNB())

MNB_classifier accuracy percent: 67.92168674698796


#### Bernoulli Naive Bayes Classfier

In [12]:
BNB_classifier = run_classifier('BNB_classifier', BernoulliNB())

BNB_classifier accuracy percent: 66.86746987951807


#### SGDClassifier - Naive Bayes Classifier

In [13]:
SGDNB_classifier = run_classifier('SGDNB_classifier', SGDClassifier())
LRNB_classifier = run_classifier('LRNB_classifier', LogisticRegression(max_iter=2000))

SGDNB_classifier accuracy percent: 66.71686746987952
LRNB_classifier accuracy percent: 69.87951807228916


#### LinearSVC, SVC, NuSVC - Naive Bayes Classifier

In [14]:
LSVCNB_classfier = run_classifier('LSVCNB_classfier', LinearSVC())
SVCNB_classfier = run_classifier('SVCNB_classfier', SVC())
NuSVCNB_classfier = run_classifier('NuSVCNB_classfier', NuSVC())

LSVCNB_classfier accuracy percent: 66.56626506024097
SVCNB_classfier accuracy percent: 70.33132530120481
NuSVCNB_classfier accuracy percent: 72.43975903614458


## Combining classifiers with voting system

In [0]:
from nltk.classify import ClassifierI
from statistics import mode

In [0]:
class VoteClassifier(ClassifierI):
  def __init__(self, *classifiers):
      self._classifiers = classifiers

  def classify(self, features):
    votes = []
    for c in self._classifiers:
      v = c.classify(features)
      votes.append(v)
    return mode(votes)

  def confidence(self, features):
    votes = []
    for c in self._classifiers:
      v = c.classify(features)
      votes.append(v)
    choice_votes = votes.count(mode(votes))
    conf = choice_votes / len(votes)
    return conf
    

#### Voted classifier

In [19]:
voted_classifier = VoteClassifier(classifier, MNB_classifier, BNB_classifier, SGDNB_classifier, LRNB_classifier, SVCNB_classfier, NuSVCNB_classfier)
print("Voted classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)

Voted classifier accuracy percent: 70.33132530120481


In [20]:
print("Classification:", voted_classifier.classify(testing_set[0][0]), "- Confidence Percentage:", round(voted_classifier.confidence(testing_set[0][0]) * 100))
print("Classification:", voted_classifier.classify(testing_set[2][0]), "- Confidence Percentage:", round(voted_classifier.confidence(testing_set[2][0]) * 100))
print("Classification:", voted_classifier.classify(testing_set[3][0]), "- Confidence Percentage:", round(voted_classifier.confidence(testing_set[3][0]) * 100))
print("Classification:", voted_classifier.classify(testing_set[4][0]), "- Confidence Percentage:", round(voted_classifier.confidence(testing_set[4][0]) * 100))
print("Classification:", voted_classifier.classify(testing_set[5][0]), "- Confidence Percentage:", round(voted_classifier.confidence(testing_set[5][0]) * 100))

Classification: pos - Confidence Percentage: 57
Classification: neg - Confidence Percentage: 71
Classification: neg - Confidence Percentage: 86
Classification: neg - Confidence Percentage: 100
Classification: neg - Confidence Percentage: 100
