In [1]:
import nltk
from nltk.corpus import reuters
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('reuters')

documents = [(list(reuters.words(fileid)), category)
             for fileid in reuters.fileids()
             for category in reuters.categories(fileid)]

import random
random.shuffle(documents)

[nltk_data] Downloading package reuters to /Users/andrey/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


In [2]:
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features[f'contains({word})'] = (word in document_words)
    return features

all_words = nltk.FreqDist(w.lower() for w in reuters.words())

num_features = 2000

word_features = list(all_words)[:num_features]

featuresets = [(document_features(doc, word_features), category) for (doc, category) in documents]

In [3]:
split_ratio = int(len(featuresets) * 0.8)
train_set, test_set = featuresets[:split_ratio], featuresets[split_ratio:]

In [4]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [5]:
accuracy = nltk.classify.accuracy(classifier, test_set)
print(f"Classifier Accuracy: {accuracy:.2%}")

Classifier Accuracy: 54.84%


In [6]:
print(classifier.show_most_informative_features(10))

Most Informative Features
          contains(palm) = True           palm-o : earn   =   2100.0 : 1.0
    contains(economists) = True             rand : earn   =   1866.7 : 1.0
        contains(rubber) = True           rubber : earn   =   1832.5 : 1.0
        contains(coffee) = True           coffee : earn   =   1790.1 : 1.0
       contains(farmers) = True           copra- : earn   =   1777.8 : 1.0
       contains(follows) = True           lin-oi : earn   =   1777.8 : 1.0
         contains(ounce) = True           pallad : earn   =   1777.8 : 1.0
   contains(commodities) = True           ground : earn   =   1600.0 : 1.0
        contains(dealer) = True              nkr : earn   =   1600.0 : 1.0
          contains(ease) = True              nkr : earn   =   1600.0 : 1.0
None
