In [1]:
!pip install nltk




In [2]:
import nltk
from nltk.corpus import movie_reviews
import random


In [3]:
nltk.download('movie_reviews')

# Create a list of documents, each document is a tuple (list_of_words, category)
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle the documents to mix positive and negative samples
random.shuffle(documents)

# View the first document (optional)
print(documents[0])


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


(['plot', ':', 'a', 'young', 'recruit', 'gets', 'plucked', 'out', 'of', 'nowhere', 'by', 'the', 'number', 'one', 'gangster', 'in', 'london', 'and', 'quickly', 'becomes', 'a', 'staple', 'by', 'his', 'side', '.', 'but', 'this', 'new', 'guy', 'is', 'not', 'like', 'all', 'the', 'others', ',', 'he', "'", 's', 'got', 'ambition', ',', 'he', "'", 's', 'got', 'goals', 'and', 'he', 'loves', 'everything', 'he', 'sees', 'about', 'being', 'the', 'number', 'one', 'gangster', '.', 'violence', ',', 'lots', 'of', 'swearing', 'and', 'betrayals', 'ensue', '.', 'critique', ':', 'a', 'very', 'good', ',', 'serious', '-', 'minded', ',', 'unique', 'british', 'gangster', 'flick', 'which', 'misses', 'the', '"', 'great', '"', 'mark', 'by', 'way', 'of', 'an', 'annoying', 'voice', '-', 'over', 'popping', 'up', 'throughout', 'the', 'film', 'and', 'a', 'so', '-', 'so', 'ending', '.', 'here', "'", 's', 'yet', 'another', 'cool', 'crime', 'movie', 'that', 'actually', 'manages', 'to', 'bring', 'something', 'new', 'into'

In [4]:
# Get all words from all reviews
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Use the 2000 most common words as features
word_features = list(all_words)[:2000]

# Define a function to extract features from a document
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in document_words)
    return features


In [5]:
# Create feature sets for all documents
featuresets = [(document_features(d), c) for (d, c) in documents]

# Split into training and testing sets
train_set, test_set = featuresets[:1900], featuresets[1900:]


In [6]:
import nltk
classifier = nltk.NaiveBayesClassifier.train(train_set)


In [7]:
print("Classifier accuracy:", nltk.classify.accuracy(classifier, test_set))


Classifier accuracy: 0.88


In [8]:
classifier.show_most_informative_features(10)


Most Informative Features
   contains(outstanding) = True              pos : neg    =     10.6 : 1.0
         contains(mulan) = True              pos : neg    =      9.2 : 1.0
        contains(seagal) = True              neg : pos    =      8.1 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.5 : 1.0
         contains(damon) = True              pos : neg    =      6.4 : 1.0
         contains(flynt) = True              pos : neg    =      5.8 : 1.0
    contains(ridiculous) = True              neg : pos    =      5.6 : 1.0
        contains(poorly) = True              neg : pos    =      5.5 : 1.0
          contains(lame) = True              neg : pos    =      5.3 : 1.0
        contains(wasted) = True              neg : pos    =      5.1 : 1.0
