# Classification 2.1 Reuters data set.

# Loading the needed libraries.

In [None]:
import pandas as pd
import numpy as np
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from tqdm import tqdm_notebook

import os
import itertools

import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.multiclass import OneVsRestClassifier

from sklearn.decomposition import IncrementalPCA as iPCA

from sklearn.metrics import f1_score, precision_score, recall_score, brier_score_loss

from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import train_test_split

In [None]:
import nltk
# Resources used further down: the English stop-word list, the Reuters
# corpus, and the Punkt models that word_tokenize relies on.
nltk.download("stopwords")
nltk.download("reuters")
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load "punkt_tab" instead of the
# pickled "punkt" models; without it tokenisation fails with a LookupError.
# Requesting both keeps the notebook runnable across NLTK versions
# (nltk.download reports failures via its return value rather than raising).
nltk.download("punkt_tab")

In [None]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords
 
# English stop words, loaded once; read by tokenize() and passed as
# stop_words to the TfidfVectorizer built later in the notebook.
cachedStopWords = stopwords.words("english")

# Loading the data and taking the first look.

The Reuters Corpus contains 10,788 news documents totaling 1.3 million words. The documents have been classified into 90 topics, and grouped into two sets, called "training" and "test".
This split is for training and testing algorithms that automatically detect the topic of a document.

In [None]:
from nltk.corpus import reuters 

 
def collection_stats():
    """Print corpus-level counts and show two sample documents from "acq".

    Reports the number of documents, the sizes of the train and test splits
    and the number of categories, then prints the word view and the raw text
    of the first two documents filed under the "acq" category.
    """
    all_ids = reuters.fileids()
    print(f"{len(all_ids)} documents")

    # The split a document belongs to is encoded in its file-id prefix.
    train_ids = [file_id for file_id in all_ids if file_id.startswith("train")]
    print(f"{len(train_ids)} total train documents")

    test_ids = [file_id for file_id in all_ids if file_id.startswith("test")]
    print(f"{len(test_ids)} total test documents")

    print(f"{len(reuters.categories())} categories\n")

    acq_ids = reuters.fileids("acq")
    for position in (0, 1):
        sample_id = acq_ids[position]
        # Word view of the document first, then its raw text.
        print(reuters.words(sample_id), "\n")
        print(reuters.raw(sample_id))

See the NLTK corpus reader API documentation:
https://www.nltk.org/api/nltk.corpus.reader.html#module-nltk.corpus.reader.api

https://www.nltk.org/api/nltk.corpus.reader.html?highlight=categorizedplaintextcorpusreader#nltk.corpus.reader.CategorizedPlaintextCorpusReader

In [None]:
# Print the corpus counts and the two sample "acq" documents.
collection_stats()

In [None]:
# Peek at the first 20 category names.
reuters.categories()[:20]

#### print ids of documents in category 'barley'

#### print categories of 'training/9865', 'training/9880' documents

#### calculate number of documents in each category

# Preprocessing

In [None]:
def tokenize(text):
    """Turn raw text into a list of lower-cased, stemmed word tokens.

    Steps: tokenize with NLTK's ``word_tokenize``, lower-case, drop English
    stop words, Porter-stem, then keep only stems that start with an ASCII
    letter and are at least ``min_length`` characters long.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The filtered stems, in document order.
    """
    min_length = 3
    # Build these once per call instead of once per token: set membership is
    # O(1), and one stemmer instance is reused for every word.
    stop_words = frozenset(cachedStopWords)
    stemmer = PorterStemmer()
    # match() anchors at the start only, so a token merely has to *begin*
    # with a letter (e.g. "u.s." passes).
    starts_with_letter = re.compile('[a-zA-Z]+').match

    stems = (stemmer.stem(word)
             for word in map(str.lower, word_tokenize(text))
             if word not in stop_words)
    return [stem for stem in stems
            if starts_with_letter(stem) and len(stem) >= min_length]

More about stemmers: https://pythonspot.com/nltk-stemming/

In [None]:
def tf_idf(docs):
    """Fit a TF-IDF vectorizer on ``docs`` and return it.

    Only fitting happens here; callers transform documents themselves with
    the returned vectorizer.
    """
    vectorizer_options = {
        "tokenizer": tokenize,
        "min_df": 3,
        "max_df": 0.90,
        "max_features": 3000,
        "use_idf": True,
        "sublinear_tf": True,
        "norm": 'l2',
    }
    representer = TfidfVectorizer(**vectorizer_options)
    # fit() returns the fitted vectorizer itself.
    return representer.fit(docs)

TfIdf Sklearn API

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
def feature_values(doc, representer):
    """List the non-zero features of one document as (term, weight) pairs.

    Parameters
    ----------
    doc : str
        Raw text of a single document.
    representer : fitted vectorizer
        Object exposing ``transform`` and ``get_feature_names_out`` (or the
        legacy ``get_feature_names``), e.g. the value returned by ``tf_idf``.

    Returns
    -------
    list of (str, float)
        One pair per non-zero column of the transformed document.
    """
    doc_representation = representer.transform([doc])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older releases.
    if hasattr(representer, "get_feature_names_out"):
        features = representer.get_feature_names_out()
    else:
        features = representer.get_feature_names()
    # str() keeps the terms plain Python strings even when the names come
    # back as a NumPy array.
    return [(str(features[index]), doc_representation[0, index])
            for index in doc_representation.nonzero()[1]]

In [None]:
def main():
    """Fit TF-IDF on the training documents and print test-document features.

    Every file id that does not start with "train" is treated as a test
    document. For each test document the list of non-zero (term, weight)
    pairs is printed.
    """
    train_docs, test_docs = [], []
    for doc_id in reuters.fileids():
        target = train_docs if doc_id.startswith("train") else test_docs
        target.append(reuters.raw(doc_id))

    representer = tf_idf(train_docs)

    for doc in test_docs:
        print(feature_values(doc, representer))

In [None]:
%%time
# Demo run: fits TF-IDF on the training split and prints the features of every test document.
main()

In [None]:
# All document ids in the corpus; the split is encoded in the id prefix.
documents = reuters.fileids()

train_docs_id = [doc_id for doc_id in documents if doc_id.startswith("train")]
test_docs_id = [doc_id for doc_id in documents if doc_id.startswith("test")]

# Raw text of every document, in the same order as the id lists.
train_docs = list(map(reuters.raw, train_docs_id))
test_docs = list(map(reuters.raw, test_docs_id))

In [None]:
%%time
# Tokenisation
vectorizer = TfidfVectorizer(stop_words=cachedStopWords,
                             tokenizer=tokenize)
 
# Learn and transform train documents
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)
 
# Transform multilabel labels
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id)
                                  for doc_id in train_docs_id])
test_labels = mlb.transform([reuters.categories(doc_id)
                             for doc_id in test_docs_id])

MultiLabelBinarizer API
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html

# Incremental PCA

In [None]:
%%time
n = vectorised_train_documents.shape[0] #how many rows we have in the dataset

n_components = 2400
chunk_size = n_components + 100

ipca = iPCA(n_components=n_components) 

vectorised_train_documents_arr = vectorised_train_documents.toarray()
vectorised_test_documents_arr = vectorised_test_documents.toarray()
print(vectorised_train_documents_arr.shape)

for i in tqdm_notebook(range(0, n // chunk_size)):
    ipca.partial_fit(vectorised_train_documents_arr[i * chunk_size : (i + 1) * chunk_size])

ipca.partial_fit(vectorised_train_documents_arr[(i + 1) * chunk_size:])
    
print(np.sum(ipca.explained_variance_ratio_))

In [None]:
%%time
vectorised_train_ipca = ipca.transform(vectorised_train_documents_arr)
vectorised_test_ipca = ipca.transform(vectorised_test_documents_arr)

print(np.shape(vectorised_train_ipca), np.shape(vectorised_train_documents))

# Modeling and scoring

## Helper functions

In [None]:
def classifier_f(clf, X_train, y_train, X_test):
    """Fit ``clf`` in a one-vs-rest wrapper and predict labels for ``X_test``.

    Parameters
    ----------
    clf : estimator
        Base classifier handed to ``OneVsRestClassifier``.
    X_train, y_train
        Training features and label matrix.
    X_test
        Features to predict on.

    Returns
    -------
    The output of ``OneVsRestClassifier.predict`` on ``X_test``.
    """
    one_vs_rest = OneVsRestClassifier(clf).fit(X_train, y_train)
    return one_vs_rest.predict(X_test)



def eval_f(test_labels, predictions):
    """Print micro- and macro-averaged precision, recall and F1.

    Parameters
    ----------
    test_labels
        Ground-truth label matrix.
    predictions
        Predicted label matrix of the same shape.
    """
    report_line = "Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}"
    for averaging, heading in (("micro", "Micro-average quality numbers"),
                               ("macro", "Macro-average quality numbers")):
        scores = [metric(test_labels, predictions, average=averaging)
                  for metric in (precision_score, recall_score, f1_score)]
        print(heading)
        print(report_line.format(*scores))

## Logistic regression

In [None]:
C_values = [50, 100, 200, 500]

# Sweep C for logistic regression on the iPCA-reduced features.
for c in tqdm_notebook(C_values):
    predictions_LR = classifier_f(
        LogisticRegression(C=c, random_state=42),
        vectorised_train_ipca, train_labels,
        vectorised_test_ipca,
    )
    print(f"C = {c:2.2f}")
    eval_f(test_labels, predictions_LR)
    print("\n")

## SVM classifiers

SVC scikit-learn API
https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [None]:
%%time

predictions_LinearSVC = classifier_f( LinearSVC(random_state=42), vectorised_train_documents, train_labels, vectorised_test_documents)
eval_f(test_labels, predictions_LinearSVC)

In [None]:
%%time

predictions_LinearSVC_ipca = classifier_f( LinearSVC(random_state=42), vectorised_train_ipca, train_labels, vectorised_test_ipca)
eval_f(test_labels, predictions_LinearSVC_ipca)

In [None]:
C_values = [0.05, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 20.0, 50, 100, 200]

# Sweep C for the linear SVM on the full TF-IDF features.
for c in tqdm_notebook(C_values):
    predictions_LinearSVC = classifier_f(
        LinearSVC(C=c, random_state=42),
        vectorised_train_documents, train_labels,
        vectorised_test_documents,
    )
    print(f"C = {c:2.2f}")
    eval_f(test_labels, predictions_LinearSVC)
    print("\n")

In [None]:
%%time

kernels = [ "rbf", "sigmoid"]  
C_values = [5e03, 1e04, 2e04, 5e04, 1e05, 2e05, 5e05]

for kern in kernels:
    for c in tqdm_notebook(C_values):
    
        predictions_SVC = classifier_f( SVC( C=c, kernel=kern, random_state=42), vectorised_train_documents, train_labels, vectorised_test_documents)
        print("Kernel: {}, C = {:.2f}".format(kern, c))
        eval_f(test_labels, predictions_SVC)
        print("\n")

In [None]:
%%time

predictions_SVC_poly = classifier_f( SVC( C=1e04, gamma=0.01, kernel="poly", random_state=42), vectorised_train_documents, train_labels, vectorised_test_documents)
eval_f(test_labels, predictions_SVC_poly)

Understanding the RBF SVM parameters (`C` and `gamma`):

https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html

## Naive Bayes classifiers.

In [None]:
%%time

predictions_GaussianNB_ipca = classifier_f( GaussianNB(), vectorised_train_ipca, train_labels, vectorised_test_ipca)

In [None]:
# Micro/macro precision, recall and F1 for Gaussian NB on the iPCA features.
eval_f(test_labels, predictions_GaussianNB_ipca)

In [None]:
%%time

predictions_GaussianNB = classifier_f(  GaussianNB(), vectorised_train_documents_arr, train_labels, vectorised_test_documents_arr)

In [None]:
# Micro/macro precision, recall and F1 for Gaussian NB on the dense TF-IDF arrays.
eval_f(test_labels, predictions_GaussianNB)

In [None]:
%%time

predictions_MultinomialNB = classifier_f( MultinomialNB(), vectorised_train_documents_arr, train_labels, vectorised_test_documents_arr)

In [None]:
# Micro/macro precision, recall and F1 for multinomial NB.
eval_f(test_labels, predictions_MultinomialNB)

In [None]:
%%time

predictions_BernoulliNB = classifier_f( BernoulliNB(), vectorised_train_documents_arr, train_labels, vectorised_test_documents_arr)

In [None]:
# Micro/macro precision, recall and F1 for Bernoulli NB.
eval_f(test_labels, predictions_BernoulliNB)

#### run BernoulliNB on the iPCA-reduced features