In [5]:
from IPython.display import clear_output

import warnings
# Silence every warning category for the rest of the session to keep cell output tidy.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [6]:
!pip install wikipedia
clear_output()

In [7]:
import spacy
import wikipedia
import copy
import numpy as np

# Load the small English pipeline by its package name: the bare 'en' shortcut
# link was removed in spaCy 3 and only resolves on 2.x installs that created it.
nlp = spacy.load('en_core_web_sm')

In [8]:
def pages_to_sentences(*pages):
  """Collect the sentences of one or more Wikipedia articles.

  Each title in `pages` is fetched with `wikipedia.page`, its `content` is run
  through the module-level spaCy pipeline `nlp`, and the text of every detected
  sentence is appended to one flat list (articles in the order given).
  Calling it without titles returns an empty list.
  """
  collected = []
  for title in pages:
    article = wikipedia.page(title)
    parsed = nlp(article.content)
    collected.extend(span.text for span in parsed.sents)
  return collected
    
def get_corpus_labels(pages_titles):
  """Build a labelled corpus from a {label: [article titles]} dictionary.

  Returns a tuple `(corpus, labels)` of two parallel lists: `corpus` holds the
  sentences of every article and `labels` repeats, for each sentence, the
  dictionary key under which its article was listed.
  """
  corpus, labels = [], []
  for label, titles in pages_titles.items():
    label_sentences = pages_to_sentences(*titles)
    corpus.extend(label_sentences)
    labels.extend([label] * len(label_sentences))
  return corpus, labels

In [9]:
from spacy.lang.en import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def lemmatizer(text):
  """Run `text` through the spaCy pipeline `nlp` and return the lemma of every token as a list."""
  lemmas = []
  for token in nlp(text):
    lemmas.append(token.lemma_)
  return lemmas

def get_stop_words_lemma(words=None):
  """Return the lemmatized set of English stop words, optionally extended.

  Args:
    words: optional list of extra words to treat as stop words.

  Returns:
    A set containing the lemma of every token produced by `lemmatizer` for the
    combined stop-word list.

  Raises:
    AssertionError: if `words` is given but is not a list.
  """
  # Work on a copy so the shared STOP_WORDS collection is never mutated.
  stop_words = set(STOP_WORDS)
  if words is not None:
    assert isinstance(words, list), "The passed parameter is not a list!"
    stop_words.update(words)
  # Join in sorted order: set iteration order for strings varies between
  # interpreter runs, and the words are lemmatized together as one text, so a
  # stable order removes one source of run-to-run variation.
  stop_words_str = " ".join(sorted(stop_words))
  return set(lemmatizer(stop_words_str))

def get_classifier(corpus, labels, exclude_words=None):
  """Fit a TF-IDF + multinomial naive Bayes pipeline and report its training accuracy.

  Args:
    corpus: list of training sentences.
    labels: list with the class label of each sentence in `corpus`.
    exclude_words: optional list of extra words added to the stop words.

  Returns:
    The fitted Pipeline with steps 'vectorizer' and 'classifier'.
  """
  # TfidfVectorizer documents `stop_words` as 'english', a list or None, and
  # recent scikit-learn releases reject other containers such as a set.
  # Sorting also gives the list a stable order.
  stop_words_lemma = sorted(get_stop_words_lemma(exclude_words))
  tfidf = TfidfVectorizer(stop_words=stop_words_lemma, tokenizer=lemmatizer, ngram_range=(1, 2))
  pipe = Pipeline([('vectorizer', tfidf), ('classifier', MultinomialNB())])
  pipe.fit(corpus, labels)
  # Accuracy on the data the model was fitted on, not a generalisation estimate.
  print("Training accuracy: {:.2f}%".format(pipe.score(corpus, labels)*100))
  return pipe

In [10]:
def get_labels_indexes(classifier, labels):
  """Return a dictionary {index : label} mapping predict_proba() columns to class labels.

  The mapping is read from the fitted classifier's `classes_` attribute, whose
  order is the column order of `predict_proba()`. It therefore does not depend
  on how the model happens to classify the label names themselves.

  Args:
    classifier: fitted estimator (or Pipeline) exposing `classes_`.
    labels: class names; accepted for backward compatibility, not needed to
      build the mapping.

  Returns:
    A dict with one entry per class known to the classifier.
  """
  return dict(enumerate(classifier.classes_))

def predict(classifier, labels, test_corpus):
  """Return the most probable class of each document together with its probability.

  Args:
    classifier: fitted estimator (or Pipeline) exposing `predict_proba` and `classes_`.
    labels: class names; accepted for backward compatibility. The label of each
      probability column is taken from `classifier.classes_`.
    test_corpus: sequence of documents to classify.

  Returns:
    A list of `(label, probability)` tuples, one per document, in input order.
    An empty corpus yields an empty list.
  """
  if len(test_corpus) == 0:
    return []
  y_proba = classifier.predict_proba(test_corpus)
  # Column of the highest probability in every row.
  best_columns = y_proba.argmax(axis=1)
  class_names = classifier.classes_
  return [(class_names[col], y_proba[row, col]) for row, col in enumerate(best_columns)]

def print_predictions(test_corpus, predictions):
  """Print one line per prediction: the sentence, its predicted label and the probability as a percentage."""
  template = "--> {} at {:g}%"
  for position, prediction in enumerate(predictions):
    label, probability = prediction[0], prediction[1]
    print(test_corpus[position], template.format(label, 100*probability))

# **Test Case: Amazon**

In [11]:
# One Wikipedia article per sense of "Amazon"; each dictionary key is the class label.
amazon_pages_titles = {
    'greek': ['Amazons'],
    'company': ['Amazon_(company)'],
    'rainforest': ['Amazon_rainforest'],
}

# Fetch the articles and split them into labelled sentences.
corpus, labels = get_corpus_labels(amazon_pages_titles)

# 'amazon' is added to the stop words (presumably because it does not help tell
# the classes apart).
classifier = get_classifier(corpus, labels, exclude_words=['amazon'])

Training accuracy: 85.17%


In [12]:
class_labels = ['greek', 'company', 'rainforest']

# Test sentences, three per class.
company_sentences = [
    "Amazon.com needed more than private investors to underwrite the expansion.",
    "Bezos dismissed naysayers as not understanding the massive growth potential of the Internet.",
    "As the decade ends, Amazon has set its sights on online advertising.",
]
greek_sentences = [
    "Finally, here was evidence of the women warriors that could have inspired the Amazon myths.",
    "The triumph of patriarchy brings with it the liberation of the spirit from the manifestations of nature.",
    "The creators of Wonder Woman had no interest in proving an actual link to the past.",
]
rainforest_sentences = [
    "The Amazon helps stabilize local and global climate.",
    "The Amazon is a vast region that spans across eight rapidly developing countries.",
    "Amazonia is the largest river basin in the world!",
]

# Sentences and their expected labels are kept in the same order.
test_corpus = company_sentences + greek_sentences + rainforest_sentences
test_labels = (['company'] * len(company_sentences)
               + ['greek'] * len(greek_sentences)
               + ['rainforest'] * len(rainforest_sentences))

predictions = predict(classifier, class_labels, test_corpus)

print_predictions(test_corpus, predictions)

Amazon.com needed more than private investors to underwrite the expansion. --> company at 63.5299%
Bezos dismissed naysayers as not understanding the massive growth potential of the Internet. --> company at 75.4074%
As the decade ends, Amazon has set its sights on online advertising. --> company at 58.5215%
Finally, here was evidence of the women warriors that could have inspired the Amazon myths. --> greek at 69.0468%
The triumph of patriarchy brings with it the liberation of the spirit from the manifestations of nature. --> company at 43.6488%
The creators of Wonder Woman had no interest in proving an actual link to the past. --> greek at 53.8367%
The Amazon helps stabilize local and global climate. --> company at 55.1641%
The Amazon is a vast region that spans across eight rapidly developing countries. --> company at 48.2254%
Amazonia is the largest river basin in the world! --> rainforest at 43.9917%


In [13]:
# Share of the test sentences whose predicted class matches `test_labels`.
test_accuracy = classifier.score(test_corpus, test_labels)
print("Testing accuracy : {:.2f}%".format(test_accuracy*100))

Testing accuracy : 66.67%


In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils import shuffle

# train_test_split shuffles the data itself, seeded by random_state, so the
# split is reproducible. An additional unseeded shuffle beforehand would make
# it differ on every run.
X_train, X_test, y_train, y_test = train_test_split(corpus, labels, test_size=0.2, random_state=12)

In [15]:
exclude_words = ['amazon']
# TfidfVectorizer documents `stop_words` as 'english', a list or None, and
# recent scikit-learn releases reject other containers such as a set.
stop_words_lemma = sorted(get_stop_words_lemma(exclude_words))
tfidf = TfidfVectorizer(stop_words=stop_words_lemma, tokenizer=lemmatizer, ngram_range=(1, 2))
pipe = Pipeline([('vectorizer', tfidf), ('classifier', MultinomialNB())])

# Candidates: unigrams vs. unigrams+bigrams, default tokenizer vs. spaCy lemmas.
param_grid = {'vectorizer__ngram_range' : [(1,1), (1,2)],
              'vectorizer__tokenizer' : [None, lemmatizer]}

grid_search = GridSearchCV(pipe, param_grid, cv=5, verbose=1)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  1.9min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vectorizer',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 2),
                                                    

In [16]:
# Best parameter combination and its mean cross-validated score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

{'vectorizer__ngram_range': (1, 1), 'vectorizer__tokenizer': None}
0.7476614403401541


In [17]:
# Full cross-validation record: timings, scores and parameters of every candidate.
grid_search.cv_results_

{'mean_fit_time': array([0.02230062, 9.53248277, 0.04151025, 9.57963772]),
 'mean_score_time': array([0.00487766, 1.92911968, 0.00805893, 1.94374962]),
 'mean_test_score': array([0.74766144, 0.73349721, 0.71375233, 0.68456462]),
 'param_vectorizer__ngram_range': masked_array(data=[(1, 1), (1, 1), (1, 2), (1, 2)],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_vectorizer__tokenizer': masked_array(data=[None, <function lemmatizer at 0x7f4af3b32710>, None,
                    <function lemmatizer at 0x7f4af3b32710>],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'vectorizer__ngram_range': (1, 1), 'vectorizer__tokenizer': None},
  {'vectorizer__ngram_range': (1, 1),
   'vectorizer__tokenizer': <function __main__.lemmatizer>},
  {'vectorizer__ngram_range': (1, 2), 'vectorizer__tokenizer': None},
  {'vectorizer__ngram_range': (1, 2),
   'vectorizer__tokenizer': <f

In [18]:
# Score the best pipeline found by the grid search on the 20% of sentences
# set aside by train_test_split.
estimator = grid_search.best_estimator_
estimator.score(X_test, y_test)

0.7781954887218046

In [19]:
# Same pipeline evaluated on the nine test sentences in `test_corpus`.
estimator.score(test_corpus, test_labels)

0.6666666666666666