## Text preprocessing

In [0]:
import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
import random
import pandas as pd

In [17]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
print(documents[1])

(['unfortunately', 'it', 'doesn', "'", 't', 'get', 'much', 'more', 'formulaic', 'than', 'one', 'tough', 'cop', '.', 'there', "'", 's', 'the', 'renegade', 'cop', 'with', 'the', 'loser', 'partner', 'who', 'has', 'to', 'many', 'problems', 'to', 'explain', '.', 'the', 'renegade', 'has', 'to', 'prove', 'his', 'good', 'name', 'and', 'is', 'trapped', 'between', 'the', 'good', 'guys', ',', 'the', 'bad', 'guys', 'and', 'some', 'woman', 'who', 'really', 'has', 'nothing', 'to', 'do', 'with', 'the', 'story', 'other', 'than', 'being', 'there', 'for', 'the', 'purpose', 'of', 'providing', 'sex', 'for', 'the', 'hero', 'in', 'the', 'middle', 'of', 'the', 'film', '.', 'bo', 'dietl', '(', 'pronounced', 'deedle', ',', 'baldwin', ')', 'is', 'one', 'tough', 'cop', ',', 'a', 'guy', 'who', 'is', 'being', 'investigated', 'by', 'hardass', 'fbi', 'agents', 'due', 'to', 'his', 'association', 'with', 'the', 'ny', 'mafia', '.', 'on', 'top', 'of', 'that', 'problem', ',', 'he', 'has', 'a', 'drunk', ',', 'gambling', '

In [23]:
all_words = []
for w in movie_reviews.words():
  all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))
print(all_words['stupid'])

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]
253


## Word as feature

In [25]:
word_features = list(all_words.keys())[:3000]
print(word_features)  



In [0]:
def find_features(document):
  words = set(document)
  features = {}
  for w in word_features:
    features[w] = (w in words)
  return features

In [37]:
feature_sets = [(find_features(rev), category) for (rev, category) in documents]
print(feature_sets[1])
print(len(feature_sets))

2000


## Naive bayes classification

In [42]:
training_set = feature_sets[:1900]
testing_set = feature_sets[1900:]
print("Training set:", len(training_set), "\nTesting set:", len(testing_set))

Training set: 1900 
Testing set: 100


In [43]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent: ", (nltk.classify.accuracy(classifier, testing_set)) * 100)

Classifier accuracy percent:  75.0


In [44]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =     16.9 : 1.0
              schumacher = True              neg : pos    =     11.6 : 1.0
                  annual = True              pos : neg    =      9.1 : 1.0
                  welles = True              neg : pos    =      7.6 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
           unimaginative = True              neg : pos    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
                  regard = True              pos : neg    =      7.1 : 1.0
                  shoddy = True              neg : pos    =      6.9 : 1.0
               atrocious = True              neg : pos    =      6.5 : 1.0
                  turkey = True              neg : pos    =      6.5 : 1.0
                 frances = True              pos : neg    =      6.4 : 1.0
                  suvari = True              neg : pos    =      6.3 : 1.0