## Import pyserini

In [1]:
from pyserini.search import pysearch
from pyserini.index import pyutils
from pyserini.analysis.pyanalysis import get_lucene_analyzer
import re

## Load index

In [2]:
# Index reader for the index used with the training collection (queried in get_tfidf).
# The index directory name is resolved relative to the current working directory.
index_utils_train = pyutils.IndexReaderUtils('lucene-index.20newsgroup.pos+docvectors+rawdocs')
# Index reader for the index used with the test collection.
index_utils_test = pyutils.IndexReaderUtils('lucene-index.20newstest.pos+docvectors+rawdocs')

## Get 20Newsgroup Collection and Generator

Once the 20 Newsgroups collection has been added to `JCollections` in pyserini's `pyclass.py`, the collection can be obtained with just the two lines below:
from pyserini.collection import pycollection      
collection = pycollection.Collection('TwentyNewsgroupsCollection', 'collection/')

In [3]:
from jnius import autoclass,cast
from enum import Enum
# Enum with a single member whose value is the Java collection class loaded through pyjnius.
class JCollections(Enum):
    TwentyNewsgroupsCollection = autoclass('io.anserini.collection.TwentyNewsgroupsCollection')
# Module-level handle to the same Java class; Collection.__init__ uses it to look the
# enum member up by value before instantiating the Java collection.
TwentyNewsgroupsCollection = autoclass('io.anserini.collection.TwentyNewsgroupsCollection')

In [4]:
import logging
logger = logging.getLogger(__name__)
from pyserini.collection import pycollection
# java.nio.file.Paths, used to build the Java Path objects handed to the collection wrapper.
JPaths = autoclass('java.nio.file.Paths')
# Root directory of the training split, relative to the current working directory.
path = JPaths.get('../20news-bydate-train')
class Collection:
    """
    Iterable wrapper around the Java TwentyNewsgroupsCollection.

    Iterating yields one FileSegment wrapper per segment reported by the
    underlying Java iterator.

    Parameters
    ----------
    path : java.nio.file.Path
        Root directory of the collection (also used by FileSegment to
        compute segment names relative to it).
    """

    def __init__(self, path):
        self.collection_path = path
        # Resolve the Java class through the enum, then instantiate it on the path.
        java_class = JCollections(TwentyNewsgroupsCollection).value
        self.object = java_class(path)
        self.collection_iterator = self.object.iterator()

    def __iter__(self):
        return self

    def __next__(self):
        # Guard clause: nothing left in the Java iterator means we are done.
        if not self.collection_iterator.hasNext():
            raise StopIteration
        java_segment = self.collection_iterator.next()
        return FileSegment(self, java_segment, java_segment.getSegmentPath())
class FileSegment:
    """
    Iterable wrapper class for Anserini's FileSegment.

    Iterating yields one SourceDocument wrapper per document in the segment.
    When the segment is exhausted, any error status and skipped-document
    count are logged, the Java segment is closed, and StopIteration is raised.

    Parameters
    ----------
    collection : Collection
        Parent collection of the file segment
    segment : io.anserini.collection.FileSegment
        FileSegment object to create wrapper from
    segment_path : str
        Path to file backing the file segment
    """

    def __init__(self, collection, segment, segment_path):
        self.collection = collection
        try:
            # Prefer the collection-specific inner '$Segment' class.
            self.object = cast(collection.object.getClass().getName() +
                               '$Segment', segment)
        except Exception:
            # Fall back to the generic FileSegment type; KeyboardInterrupt and
            # SystemExit are deliberately not swallowed here.
            logger.exception('Exception from casting FileSegment type...')
            self.object = cast('io.anserini.collection.FileSegment', segment)

        self.segment_iterator = self.object.iterator()
        self.segment_path = segment_path
        # Relative path with both kinds of path separator replaced by '-'.
        self.segment_name = re.sub(r'\\|\/', '-', collection.collection_path.relativize(segment_path).toString())

    def __iter__(self):
        return self

    def __next__(self):
        # Reuse the iterator obtained in __init__ instead of requesting a new
        # one from the Java object on every call.
        if self.segment_iterator.hasNext():
            return SourceDocument(self, self.segment_iterator.next())

        # The parent collection is not guaranteed to carry a `counters`
        # attribute (the Collection wrapper in this notebook does not), so
        # only update the counters when they exist; always log.
        counters = getattr(self.collection, 'counters', None)

        # log if iteration stopped by error
        if self.object.getErrorStatus():
            logger.error(self.segment_name + ': Error from segment iteration, stopping...')
            if counters is not None:
                counters.errors.increment()

        # stop iteration and log skipped documents
        skipped = self.object.getSkippedCount()
        if skipped > 0:
            if counters is not None:
                counters.skips.increment(skipped)
            logger.warning(self.segment_name + ': ' + str(skipped) + ' documents skipped')
        self.object.close()
        raise StopIteration


class SourceDocument:
    """
    Wrapper class for Anserini's SourceDocument.

    The wrapped document's id, indexable flag, contents and raw form are
    read once at construction time and stored as plain attributes.

    Parameters
    ----------
    segment : FileSegment
        Parent segment of the source document
    document : io.anserini.collection.SourceDocument
        SourceDocument object to create wrapper from
    """

    def __init__(self, segment, document):
        self.segment, self.object = segment, document
        # Call each accessor exactly once, in this order, and keep the result
        # under an attribute of the same name.
        for accessor in ('id', 'indexable', 'contents', 'raw'):
            setattr(self, accessor, getattr(self.object, accessor)())
# Training collection; iterating it yields FileSegment wrappers, which in turn yield SourceDocument wrappers.
collection = Collection(path)

In [5]:
from pyserini.index import pygenerator
# Document generator; later cells call generator.create_document(doc) and read the 'id' field of the result.
generator = pygenerator.Generator('DefaultLuceneDocumentGenerator')

### Get a matching relationship between the category number and docids

In [7]:
import os
# Names of every directory found while walking the training root (bottom-up).
# The order is whatever os.walk reports, so it need not match category_lst.
category_list = [
    name
    for _root, dirs, _files in os.walk("../20news-bydate-train", topdown=False)
    for name in dirs
]

In [8]:
# Canonical category order: a category's position in this list is the integer
# label used for it in the target lists.
category_lst = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [9]:
import os
# Map each category's integer label (its position in category_lst) to the
# file names found under that category's training directory.
category_dic = {}
for name in category_list:
    label = category_lst.index(name)
    category_dic[label] = [
        file
        for _root, _dirs, files in os.walk("../20news-bydate-train/" + name, topdown=False)
        for file in files
    ]

## Get tfidf dataframe/target list for the training collection

In [10]:
def get_tfidf(docid, index_utils, num_docs=None):
    '''Return a tf-idf dictionary for one indexed document.

    Each key is a term of the document vector and each value is
    ``tf * log10(num_docs / df + 1)``, where ``tf`` is the term's value in
    the document vector and ``df`` is the first element returned by
    ``index_utils.get_term_counts`` for that term.

    Parameters
    ----------
    docid : str
        Identifier of the document in the index.
    index_utils : pyutils.IndexReaderUtils
        Reader used to fetch the document vector and the term counts.
    num_docs : int, optional
        Collection size used in the idf formula. When omitted, the
        module-level ``N`` is used, as in the original implementation.

    Returns
    -------
    dict
        Mapping of term to tf-idf weight.
    '''
    # Imported locally so the function does not depend on a later cell
    # having imported math before it is called.
    import math

    if num_docs is None:
        num_docs = N  # module-level collection size set by the calling cell

    tf = index_utils.get_document_vector(docid)
    analyzer = get_lucene_analyzer(stemming=False, stopwords=False)

    tfidf = {}
    for term, freq in tf.items():
        doc_freq = float(index_utils.get_term_counts(term, analyzer=analyzer)[0])
        # The count can come back as 0; the original author only saw this for
        # terms ending in "." or "'". Treat it as 1 to avoid dividing by zero.
        # NOTE(review): presumably the analyzer re-tokenizes such terms - confirm.
        if doc_freq == 0:
            doc_freq = 1.0
        tfidf[term] = freq * math.log10(num_docs / doc_freq + 1)
    return tfidf

In [11]:
import pandas as pd
import math
tfidflist = []   # one {term: tf-idf} dictionary per labelled training document
targetlist = []  # integer category label for the dictionary at the same position
N = 11314  # collection size read by get_tfidf as a module-level global
for fs in collection:
    for doc in fs:
        docid = generator.create_document(doc).get('id')
        # The label is the first category whose file list contains this docid.
        # NOTE(review): if the same file name occurs in several categories, only
        # the first match is used - confirm that ids are unique across categories.
        category = next(
            (label for label, docnames in category_dic.items() if docid in docnames),
            None,
        )
        if category is None:
            # Document not listed under any category: leave it out.
            continue
        targetlist.append(category)
        tfidflist.append(get_tfidf(docid, index_utils_train))

# Convert the list of dictionaries to a DataFrame; terms missing from a document become 0.
tfidflist_df = pd.DataFrame(tfidflist).fillna(0)

In [12]:
# (rows, columns): one row per labelled training document, one column per distinct term.
tfidflist_df.shape

(11314, 116285)

## Train model

In [18]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the training tf-idf matrix;
# targetlist supplies one integer category label per row.
clf = MultinomialNB().fit(tfidflist_df, targetlist)

## Get test Collection and Index

In [19]:
# Root directory of the test split, relative to the current working directory.
path_test = JPaths.get('../20news-bydate-test')
# Test collection, wrapped the same way as the training collection.
collection_test = Collection(path_test)

In [20]:
# Rebuild category_dic for the test folder: integer label -> file names found
# under that category's test directory. This replaces the training mapping.
category_dic = {}
for name in category_list:
    label = category_lst.index(name)
    docnames = category_dic[label] = []
    for _root, _dirs, files in os.walk("../20news-bydate-test/" + name, topdown=False):
        docnames.extend(files)

In [21]:
import numpy as np
tfidflist_test = []   # one {term: tf-idf} dictionary per labelled test document
targetlist_test = []  # integer category label for the dictionary at the same position
N = 11314  # collection size read by get_tfidf as a module-level global
for fs in collection_test:
    for doc in fs:
        docid = generator.create_document(doc).get('id')
        # The label is the first category whose file list contains this docid.
        # NOTE(review): if the same file name occurs in several categories, only
        # the first match is used - confirm that ids are unique across categories.
        category = next(
            (label for label, docnames in category_dic.items() if docid in docnames),
            None,
        )
        if category is None:
            # Document not listed under any category: leave it out.
            continue
        targetlist_test.append(category)
        tfidflist_test.append(get_tfidf(docid, index_utils_test))

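Build the test tf-idf DataFrame with exactly the same columns as the training DataFrame: terms that never occurred in training are dropped, and training terms missing from a test document are filled with 0.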

In [22]:
# Restrict the test features to the training vocabulary (same columns, same
# order) and use 0 for every term a test document does not contain.
tfidf_df_test = pd.DataFrame(
    tfidflist_test,
    columns=tfidflist_df.columns,
).fillna(0)

In [23]:
# (rows, columns): one row per labelled test document; columns are the training DataFrame's columns.
tfidf_df_test.shape

(7532, 116285)

## Predict the test target list 

In [24]:
# Predicted integer category label for every row of the test matrix.
predicted = clf.predict(tfidf_df_test)
# Accuracy: fraction of test documents whose predicted label equals the label in targetlist_test.
np.mean(predicted == targetlist_test)

0.7919543281996814

There are 20 categories in total.
In the classification report below, each row label is the index of that category in the following category list:

category_lst = ['alt.atheism',
'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
'talk.religion.misc']

In [25]:
from sklearn import metrics
# Per-category precision, recall, F1 and support; rows are keyed by the integer category label.
print(metrics.classification_report(targetlist_test, predicted))

             precision    recall  f1-score   support

          0       0.84      0.82      0.83       450
          1       0.65      0.80      0.72       389
          2       0.56      0.02      0.04       394
          3       0.53      0.83      0.64       392
          4       0.66      0.86      0.75       385
          5       0.77      0.84      0.81       395
          6       0.96      0.25      0.40       325
          7       0.76      0.96      0.85       396
          8       0.97      0.69      0.81       154
          9       0.95      0.96      0.96       641
         10       0.95      0.97      0.96       574
         11       0.83      0.94      0.88       396
         12       0.79      0.23      0.35       133
         13       0.86      0.85      0.86       396
         14       0.85      0.92      0.89       394
         15       0.79      0.95      0.87       398
         16       0.75      0.89      0.82       318
         17       0.95      0.96      0.95   