# Classifying Philosophical Texts

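Here, we put the classifier developed and evaluated in Stéfan Sinclair's notebook to work: we train it on a corpus of labelled texts and then use it to classify a small set of separate test texts.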

In [7]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Import the training corpus (while filtering out philosophical outliers)
data_dir = "texts"
# Raw string: "\." must reach the regex engine as an escaped dot. In a plain
# string literal it is an invalid escape sequence, which newer Python versions
# warn about.
corpus = nltk.corpus.reader.plaintext.PlaintextCorpusReader(data_dir, r".*\.txt")
# File-name fragments identifying the texts that are kept out of training.
excluded_fragments = ("GameofLogic", "ThusSpakeZarathustr")
filtered_fileids = [
    fileid for fileid in corpus.fileids()
    if not any(fragment in fileid for fragment in excluded_fragments)
]
print('Number of (filtered) training files:', len(filtered_fileids))

Number of (filtered) training files: 38


In [None]:
# Build a term-frequency vectorizer (IDF weighting switched off) and use it to
# turn the filtered training texts into a document-term matrix.
english_stopwords = nltk.corpus.stopwords.words("english")
vectorizer = TfidfVectorizer(
    use_idf=False,
    stop_words=english_stopwords,
    max_features=10000,
)
training_texts = [corpus.raw(name) for name in filtered_fileids]
X_train = vectorizer.fit_transform(training_texts)

# The class label of each training file is derived from its file name.
categories = [
    "Philosophy" if "Philosophy" in name else "Other"
    for name in filtered_fileids
]

In [31]:
# Inspect the dimensions of the vectorized training corpus
train_shape = X_train.shape
print('Shape of the training matrix:', train_shape)

Shape of the training matrix: (38, 10000)


In [15]:
# Build a multinomial naive Bayes classifier and train it on the labelled
# training vectors. The fit() call stays last so the cell displays the model.
clf = MultinomialNB(alpha=0.01)
clf.fit(X=X_train, y=categories)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [10]:
# Read in the test corpus and vectorize it with the vectorizer that was fitted
# on the training corpus, so both matrices share the same feature columns.
test_dir = "test"
# Raw string: "\." must reach the regex engine as an escaped dot. In a plain
# string literal it is an invalid escape sequence, which newer Python versions
# warn about.
test_corpus = nltk.corpus.reader.plaintext.PlaintextCorpusReader(test_dir, r".*\.txt")
test_fileids = list(test_corpus.fileids())
# The expected label of each test file is derived from its file name.
test_categories = ["Philosophy" if "Philosophy" in fileid else "Other" for fileid in test_fileids]

X_test = vectorizer.transform([test_corpus.raw(fileid) for fileid in test_fileids])
print("Shape of the test corpus:", X_test.shape)

Shape of the test corpus: (7, 10000)


In [11]:
# Classify every vectorized test document with the trained model
pred = clf.predict(X=X_test)

In [12]:
# Report, file by file, whether the predicted label matches the label
# derived from the file name; mismatches are flagged as errors.
for fileid, actual, predicted in zip(test_fileids, test_categories, pred):
    print(fileid)
    if actual != predicted:
        print('  ### ERROR ###')
        print('  Actual:   ', actual)
        print('  Predicted:', predicted)
    else:
        print('\t', actual)

# Overall share of correctly classified test files
print('\n*** Accuracy ***:', round(clf.score(X_test, test_categories),3))

Other-MobyDick-Melville.txt
	 Other
Other-RobinsonCrusoe-Defoe.txt
	 Other
Philosophy-CommunistManifesto-Marx.txt
	 Philosophy
Philosophy-GameofLogic-Carroll.txt
  ### ERROR ###
  Actual:    Philosophy
  Predicted: Other
Philosophy-ImprovementUnderstanding-Spinoza.txt
	 Philosophy
Philosophy-TreatiseHumanKnowledge-Berkeley.txt
	 Philosophy
Philosophy-Zarathustra-Nietzsche.txt
  ### ERROR ###
  Actual:    Philosophy
  Predicted: Other

*** Accuracy ***: 0.714


## Getting probabilities

The code above returns just the binary classification. It would be nice to see how confident the classifier is about its decisions. For that, we use the `predict_proba()` method in place of `predict()`.

In [13]:
# Per-class probabilities for every test document.
prob = clf.predict_proba(X_test)

# The columns of predict_proba() follow the order of clf.classes_, so look the
# column of each label up instead of assuming "Other" is 0 and "Philosophy" is 1.
class_labels = list(clf.classes_)
other_col = class_labels.index("Other")
phil_col = class_labels.index("Philosophy")

for fileid, actual, class_probs in zip(test_fileids, test_categories, prob):
    print(fileid)
    print('   Actual:   ', actual)
    print('   Predicted %:\n\tOther', round(class_probs[other_col]*100, 2), '\tPhil', round(class_probs[phil_col]*100, 2))

Other-MobyDick-Melville.txt
   Actual:    Other
   Predicted %:
	Other 100.0 	Phil 0.0
Other-RobinsonCrusoe-Defoe.txt
   Actual:    Other
   Predicted %:
	Other 100.0 	Phil 0.0
Philosophy-CommunistManifesto-Marx.txt
   Actual:    Philosophy
   Predicted %:
	Other 0.04 	Phil 99.96
Philosophy-GameofLogic-Carroll.txt
   Actual:    Philosophy
   Predicted %:
	Other 66.27 	Phil 33.73
Philosophy-ImprovementUnderstanding-Spinoza.txt
   Actual:    Philosophy
   Predicted %:
	Other 0.0 	Phil 100.0
Philosophy-TreatiseHumanKnowledge-Berkeley.txt
   Actual:    Philosophy
   Predicted %:
	Other 0.0 	Phil 100.0
Philosophy-Zarathustra-Nietzsche.txt
   Actual:    Philosophy
   Predicted %:
	Other 99.46 	Phil 0.54
