# Lab 3: IR

## Install, import modules and download dataset

In [4]:
!pip install whoosh
!pip install pytrec_eval
!pip install wget

Collecting whoosh
[?25l  Downloading https://files.pythonhosted.org/packages/ba/19/24d0f1f454a2c1eb689ca28d2f178db81e5024f42d82729a4ff6771155cf/Whoosh-2.7.4-py2.py3-none-any.whl (468kB)
[K     |████████████████████████████████| 471kB 2.9MB/s 
[?25hInstalling collected packages: whoosh
Successfully installed whoosh-2.7.4
Collecting pytrec_eval
  Downloading https://files.pythonhosted.org/packages/2e/03/e6e84df6a7c1265579ab26bbe30ff7f8c22745aa77e0799bba471c0a3a19/pytrec_eval-0.5.tar.gz
Building wheels for collected packages: pytrec-eval
  Building wheel for pytrec-eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec-eval: filename=pytrec_eval-0.5-cp36-cp36m-linux_x86_64.whl size=263967 sha256=f6c4265a582c989cee141dcf39232f409d72c063cf16cc1c1e4137d4f33aedfa
  Stored in directory: /root/.cache/pip/wheels/55/66/40/1779aa0a8eb66e088669befe286f695cdfe420ba91ce662127
Successfully built pytrec-eval
Installing collected packages: pytrec-eval
Successfully installed pytrec-eval-0.5
C

In [5]:
from whoosh import index, writing
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.analysis import *
from whoosh.qparser import QueryParser
import os.path
from pathlib import Path
import tempfile
import subprocess
import pytrec_eval
import wget

In [6]:
# Download the lab archive only when it is not already present, so that
# re-running this cell (or the whole notebook) does not fetch the data again
# and leave a second copy of the archive next to the first one.
filename = "lab-data.zip"
if not os.path.exists(filename):
    filename = wget.download("https://github.com/MIE451-1513-2019/course-datasets/raw/master/lab-data.zip", filename)

In [7]:
!unzip lab-data.zip

Archive:  lab-data.zip
   creating: lab-data/
  inflating: lab-data/air.topics     
   creating: lab-data/documents/
  inflating: lab-data/documents/email05  
  inflating: lab-data/documents/email02  
  inflating: lab-data/documents/email03  
  inflating: lab-data/documents/email04  
  inflating: lab-data/documents/email10  
  inflating: lab-data/documents/email08  
  inflating: lab-data/documents/email01  
  inflating: lab-data/documents/email06  
  inflating: lab-data/documents/email07  
  inflating: lab-data/documents/email09  
  inflating: lab-data/documents/email14  
  inflating: lab-data/air.qrels      


In [8]:
# Locations of the unpacked lab data, relative to the notebook's working directory.
DATA_DIR = "lab-data"
DOCUMENTS_DIR = os.path.join(DATA_DIR, "documents")  # the files that get indexed
TOPIC_FILE = os.path.join(DATA_DIR, "air.topics")  # evaluation topics, one "<id> <phrase>" per line
QRELS_FILE = os.path.join(DATA_DIR, "air.qrels")  # relevance judgements used by the evaluation

## Part 1: Basic Indexing

### Creating the index

To begin using Whoosh, you need an index object. The first time you create an index, you must define the index’s schema. The schema lists the fields in the index. A field is a piece of information for each document in the index, such as its title or text content. A field can be indexed (meaning it can be searched) and/or stored (meaning the value that gets indexed is returned with the results; this is useful for fields such as the title).

More information:
https://whoosh.readthedocs.io/en/latest/schema.html?highlight=schema

In [9]:
def createIndex(schema):
    """Create a new Whoosh index for `schema` inside a fresh temporary directory.

    Each call asks `tempfile.mkdtemp()` for its own directory, so indexes
    created by separate calls do not share storage.

    Returns whatever `index.create_in` returns for that directory and schema.
    """
    return index.create_in(tempfile.mkdtemp(), schema)

In [10]:
# first, define a Schema for the index:
#   file_path    - stored ID, so every search hit can report which file it came from
#   file_content - indexed text, split into tokens by a plain RegexTokenizer
mySchema = Schema(file_path = ID(stored=True),
                  file_content = TEXT(analyzer = RegexTokenizer()))

# now, create the index (in a fresh temporary directory, see createIndex) based on the new schema
myIndex = createIndex(mySchema)

### Indexing the documents

In [11]:
def addFilesToIndex(indexObj, fileList):
    """Read every file in `fileList` as UTF-8 text and add it to `indexObj`.

    Each file becomes one document with the fields `file_path` (the path as
    given) and `file_content` (the whole file text). A progress line is
    printed after every 1000 documents and a final line once all are added.

    The buffered writer is closed in all cases, including when reading or
    adding a file raises.
    """
    bufferedWriter = writing.BufferedWriter(indexObj, period=None, limit=1000)

    try:
        # count from 1 so the counter is directly the number of files handled so far
        for indexedCount, path in enumerate(fileList, start=1):
            with open(path, "r", encoding="utf-8") as handle:
                bufferedWriter.add_document(file_path=path,
                                            file_content=handle.read())

                if indexedCount % 1000 == 0:
                    print("already indexed:", indexedCount)
        print("done indexing.")
    finally:
        # closing the writer is what finishes the write to the index
        bufferedWriter.close()

In [12]:
# Build a list of files to index: every regular file found anywhere under DOCUMENTS_DIR.
# NOTE: the paths are used in the order glob() yields them, which is not sorted
# (see the listing printed by the next cell).
filesToIndex = [str(filePath) for filePath in Path(DOCUMENTS_DIR).glob("**/*") if filePath.is_file()]

In [13]:
# Check the list
filesToIndex[:5]

['lab-data/documents/email03',
 'lab-data/documents/email01',
 'lab-data/documents/email08',
 'lab-data/documents/email02',
 'lab-data/documents/email09']

In [14]:
# count files to index
print("number of files:", len(filesToIndex))

number of files: 11


In [15]:
addFilesToIndex(myIndex, filesToIndex)

done indexing.


### Querying

More information: https://whoosh.readthedocs.io/en/latest/api/qparser.html?highlight=queryparser

In [16]:
# define a query parser for the field "file_content" in the index
myQueryParser = QueryParser("file_content", schema=myIndex.schema)
mySearcher = myIndex.searcher()

In [22]:
# run a sample query for the term "duck" and keep all matching documents (limit=None)
sampleQuery = myQueryParser.parse("duck")
sampleQueryResults = mySearcher.search(sampleQuery, limit=None)

In [23]:
sampleQuery

Term('file_content', 'duck')

In [24]:
sampleQueryResults

<Top 2 Results for Term('file_content', 'duck') runtime=0.0006261990001803497>

In [25]:
# inspect the result:
# for each hit print the file name, its position in the result list and its score
for rank, hit in enumerate(sampleQueryResults):
    print(os.path.basename(hit["file_path"]), rank, sampleQueryResults.score(rank))

email06 0 3.8267972898082365
email04 1 2.2736898161091434


### Evaluation using TREC_EVAL
In order to evaluate our results we will use a topic file - a list of topics we use to evaluate our IR system

In [26]:
# print the topic file
with open(TOPIC_FILE, "r") as f:
    print(f.read())

01 ducks
02 ig nobel prizes
03 mathematics
04 flowing hair
05 music
06 AIR TV



We will evaluate our results by comparing them with a set of judged results (the qrels file) using TREC_EVAL.

In [27]:
# print the first 10 lines in the qrels file
with open(QRELS_FILE, "r") as f:
    qrels10 = f.readlines()[:10]
    print("".join(qrels10))

01 0 email01 0
01 0 email02 0
01 0 email03 0
01 0 email04 1
01 0 email05 1
01 0 email06 1
01 0 email07 0
01 0 email08 0
01 0 email09 0
01 0 email10 0



The following function takes a topic file, a qrels file, a query parser and a searcher, and uses pytrec_eval to compare our results with the provided qrels file (see assignment PDF for more details).

In [None]:
def pyTrecEval(topicFile, qrelsFile, queryParser, searcher):
    """Evaluate a searcher against judged results and print pytrec_eval measures.

    Every non-blank line of `topicFile` has the form "<topic_id> <topic_phrase>".
    Each phrase is parsed with `queryParser` and run through `searcher`; the
    hits are written in TREC run format to a temporary file, which is then
    scored against `qrelsFile` with pytrec_eval. The temporary file is deleted
    before the function returns.

    Args:
        topicFile: path of the topic file.
        qrelsFile: path of the qrels (relevance judgements) file.
        queryParser: object whose `parse(phrase)` builds a query.
        searcher: object whose `search(query, limit=None)` returns the hits;
            the returned object must be iterable and offer `score(n)`.

    Prints one line per (measure, topic), followed by one "all" line per
    measure aggregated over the evaluated topics. Returns None.

    Raises:
        ValueError: if a non-blank topic line has no phrase after the id.
    """
    # Load topic file - a list of topics (search phrases) used for evaluation.
    # Blank lines (e.g. trailing newlines at the end of the file) are skipped.
    with open(topicFile, "r") as tf:
        topics = [line for line in tf.read().splitlines() if line.strip()]

    # mkstemp() hands back an already-open OS-level descriptor together with the
    # path. Wrap that descriptor instead of opening the path a second time so it
    # gets closed, and remove the file once pytrec_eval has parsed it.
    fd, tempOutputFile = tempfile.mkstemp()
    try:
        with os.fdopen(fd, "w") as outputTRECFile:
            # for each evaluated topic:
            # build a query and record the results in the file in TREC_EVAL format
            for topic in topics:
                topic_id, topic_phrase = topic.split(" ", 1)
                topicQuery = queryParser.parse(topic_phrase)
                topicResults = searcher.search(topicQuery, limit=None)
                for (docnum, result) in enumerate(topicResults):
                    score = topicResults.score(docnum)
                    outputTRECFile.write("%s Q0 %s %d %lf test\n" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        with open(qrelsFile, 'r') as f_qrel:
            qrel = pytrec_eval.parse_qrel(f_qrel)

        with open(tempOutputFile, 'r') as f_run:
            run = pytrec_eval.parse_run(f_run)
    finally:
        os.remove(tempOutputFile)

    evaluator = pytrec_eval.RelevanceEvaluator(
        qrel, pytrec_eval.supported_measures)

    results = evaluator.evaluate(run)

    def print_line(measure, scope, value):
        print('{:25s}{:8s}{:.4f}'.format(measure, scope, value))

    # per-topic values
    for query_id, query_measures in results.items():
        for measure, value in query_measures.items():
            if measure == "runid":
                continue
            print_line(measure, query_id, value)

    # Nothing was evaluated (e.g. no topic returned a judged document):
    # there is nothing to aggregate.
    if not results:
        return

    # aggregated values over all evaluated topics; the measure names are taken
    # from the last evaluated topic
    perQueryMeasures = list(results.values())
    for measure in perQueryMeasures[-1].keys():
        if measure == "runid":
            continue
        print_line(
            measure,
            'all',
            pytrec_eval.compute_aggregated_measure(
                measure,
                [measures[measure] for measures in perQueryMeasures]))

In [None]:
pyTrecEval(TOPIC_FILE, QRELS_FILE, myQueryParser, mySearcher) 

num_q                    01      1.0000
num_ret                  01      1.0000
num_rel                  01      3.0000
num_rel_ret              01      1.0000
map                      01      0.3333
gm_map                   01      -1.0986
Rprec                    01      0.3333
bpref                    01      0.3333
recip_rank               01      1.0000
iprec_at_recall_0.00     01      1.0000
iprec_at_recall_0.10     01      1.0000
iprec_at_recall_0.20     01      1.0000
iprec_at_recall_0.30     01      1.0000
iprec_at_recall_0.40     01      0.0000
iprec_at_recall_0.50     01      0.0000
iprec_at_recall_0.60     01      0.0000
iprec_at_recall_0.70     01      0.0000
iprec_at_recall_0.80     01      0.0000
iprec_at_recall_0.90     01      0.0000
iprec_at_recall_1.00     01      0.0000
P_5                      01      0.2000
P_10                     01      0.1000
P_15                     01      0.0667
P_20                     01      0.0500
P_30                     01      0.0333

In [None]:
def printRelName(topicFile, qrelsFile, queryParser, searcher, id):
    """Print, for one topic, the documents the search returned and those judged relevant.

    Looks up the topic whose id equals `id` in `topicFile`, runs its phrase
    through `queryParser` and `searcher`, and prints three sections: the
    topic, the returned documents (one TREC-run-style line per hit) and the
    lines of `qrelsFile` that mark a document as relevant (judgement "1") for
    that topic.

    Args:
        topicFile: path of the topic file ("<topic_id> <topic_phrase>" per line).
        qrelsFile: path of the qrels file (four whitespace-separated fields per line).
        queryParser: object whose `parse(phrase)` builds a query.
        searcher: object whose `search(query, limit=None)` returns the hits;
            the returned object must be iterable and offer `score(n)`.
        id: topic id to inspect, as a string (e.g. "01").

    Blank topic lines and qrels lines that do not have exactly four fields
    are skipped. Nothing is printed if no topic has the requested id.
    """
    with open(topicFile, "r") as tf:
        topics = tf.read().splitlines()

    for topic in topics:
        # tolerate empty lines, e.g. trailing newlines at the end of the file
        if not topic.strip():
            continue
        topic_id, topic_phrase = topic.split(" ", 1)
        if topic_id != id:
            continue

        print("---------------------------Topic_id and Topic_phrase----------------------------------")
        print(topic_id, topic_phrase)
        topicQuery = queryParser.parse(topic_phrase)
        topicResults = searcher.search(topicQuery, limit=None)

        print("---------------------------Return documents----------------------------------")
        for (docnum, result) in enumerate(topicResults):
            score = topicResults.score(docnum)
            print("%s Q0 %s %d %lf test" % (topic_id, os.path.basename(result["file_path"]), docnum, score))

        print("---------------------------Relevant documents----------------------------------")
        with open(qrelsFile, 'r') as f_qrel:
            for line in f_qrel:
                line = line.rstrip()
                # split on any whitespace so tab-separated qrels work as well
                fields = line.split()
                if len(fields) != 4:
                    continue
                qid, _, doc, rel = fields
                if qid == id and rel == "1":
                    print(line)

In [None]:
printRelName(TOPIC_FILE, QRELS_FILE, myQueryParser, mySearcher, "01")

---------------------------Topic_id and Topic_phrase----------------------------------
01 ducks
---------------------------Return documents----------------------------------
01 Q0 email06 0 2.601436 test
---------------------------Relevant documents----------------------------------
01 0 email04 1
01 0 email05 1
01 0 email06 1


## Part 2: Evaluating different configurations

### Inspecting our index

In [None]:
# Is it empty?
print("Index is empty?", myIndex.is_empty())

# How many files indexed?
print("Number of indexed files:", myIndex.doc_count())

Index is empty? False
Number of indexed files: 11


In [None]:
# define a reader object on the index
myReader = myIndex.reader()

In [None]:
# show the first 5 indexed documents as (document number, stored fields) pairs
list(myReader.iter_docs())[:5]

[(0, {'file_path': 'lab-data/documents/email06'}),
 (1, {'file_path': 'lab-data/documents/email02'}),
 (2, {'file_path': 'lab-data/documents/email07'}),
 (3, {'file_path': 'lab-data/documents/email05'}),
 (4, {'file_path': 'lab-data/documents/email04'})]

In [None]:
# show a window of 25 indexed terms (positions 1000-1024) of the field "file_content"
list(myReader.field_terms("file_content"))[1000:1025]

['Care',
 'Carlos',
 'Carmen',
 'Carnivalesque',
 'Carolina',
 'Case',
 'Cat',
 'Catalysis',
 'Catalyst',
 'Catchers',
 'Cater',
 'Caused',
 'Caveat',
 'CbZF1d0021swQuc57kfqHt',
 'Cechetto',
 'Ceder',
 'Celebratory',
 'Center',
 'Cereal',
 'Ceremony',
 'Cerrahi',
 'Certolizumab',
 'Cervical',
 'Chair',
 'Chalfie']

In [None]:
# how many terms does the field "file_content" hold in total?
# NOTE(review): presumably this counts term occurrences summed over all documents,
# not the number of distinct terms - confirm against the Whoosh reader documentation
print(myReader.field_length("file_content"))

29729


In [None]:
# how many documents contain the terms "bit", "are" and "get"
#   in the field "file_content"?
print("# docs with 'bit'", myReader.doc_frequency("file_content", "bit"))
print("# docs with 'are'", myReader.doc_frequency("file_content", "are"))
print("# docs with 'get'", myReader.doc_frequency("file_content", "get"))

# docs with 'bit' 1
# docs with 'are' 11
# docs with 'get' 6


### Text Analyzers

In [None]:
# we start with basic tokenizer
tokenizer = RegexTokenizer()
[token.text for token in tokenizer("We are going to do Text Analysis with whoosh.analysis")]

['We',
 'are',
 'going',
 'to',
 'do',
 'Text',
 'Analysis',
 'with',
 'whoosh.analysis']

In [None]:
# we might want use stemming:
stmAnalyzer = RegexTokenizer() | StemFilter()
[token.text for token in stmAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['We', 'ar', 'go', 'to', 'do', 'Text', 'Analysi', 'with', 'whoosh.analysi']

In [None]:
# We probably want to lower-case it
# so we add LowercaseFilter
stmLwrAnalyzer = RegexTokenizer() | LowercaseFilter() | StemFilter()
[token.text for token in stmLwrAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['we', 'ar', 'go', 'to', 'do', 'text', 'analysi', 'with', 'whoosh.analysi']

In [None]:
# we probably want to ignore words like "we", "are", "with" when we index files
# so we add StopFilter to filter stop words
stmLwrStpAnalyzer = RegexTokenizer() | LowercaseFilter() | StopFilter() | StemFilter()
[token.text for token in stmLwrStpAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['go', 'do', 'text', 'analysi', 'whoosh.analysi']

In [None]:
# we also probably want to break phrases like "whoosh.analysis" into "whoosh" and "analysis"
# so we add IntraWordFilter
stmLwrStpIntraAnalyzer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter() | StemFilter()
[token.text for token in stmLwrStpIntraAnalyzer("We are going to do Text Analysis with whoosh.analysis")]

['go', 'do', 'text', 'analysi', 'whoosh', 'analysi']

### Evaluating the new analyzers

In [None]:
# define a Schema with the new analyzer
mySchema2 = Schema(file_path = ID(stored=True),
                   file_content = TEXT(analyzer = stmLwrStpIntraAnalyzer))

# create the index based on the new schema
myIndex2 = createIndex(mySchema2)

In [None]:
addFilesToIndex(myIndex2, filesToIndex)

done indexing.


In [None]:
# define a query parser for the field "file_content" in the index
myQueryParser2 = QueryParser("file_content", schema=myIndex2.schema)
mySearcher2 = myIndex2.searcher()

In [None]:
pyTrecEval(TOPIC_FILE, QRELS_FILE, myQueryParser2, mySearcher2) 

num_q                    01      1.0000
num_ret                  01      3.0000
num_rel                  01      3.0000
num_rel_ret              01      3.0000
map                      01      1.0000
gm_map                   01      0.0000
Rprec                    01      1.0000
bpref                    01      1.0000
recip_rank               01      1.0000
iprec_at_recall_0.00     01      1.0000
iprec_at_recall_0.10     01      1.0000
iprec_at_recall_0.20     01      1.0000
iprec_at_recall_0.30     01      1.0000
iprec_at_recall_0.40     01      1.0000
iprec_at_recall_0.50     01      1.0000
iprec_at_recall_0.60     01      1.0000
iprec_at_recall_0.70     01      1.0000
iprec_at_recall_0.80     01      1.0000
iprec_at_recall_0.90     01      1.0000
iprec_at_recall_1.00     01      1.0000
P_5                      01      0.6000
P_10                     01      0.3000
P_15                     01      0.2000
P_20                     01      0.1500
P_30                     01      0.1000


In [None]:
# let's count the same words again, this time in the index built with the new analyzer
myReader2 = myIndex2.reader()
print("# docs with 'bit'", myReader2.doc_frequency("file_content", "bit"))
print("# docs with 'are'", myReader2.doc_frequency("file_content", "are"))
print("# docs with 'get'", myReader2.doc_frequency("file_content", "get"))

# docs with 'bit' 11
# docs with 'are' 0
# docs with 'get' 7


**Can you explain the differences?**

### Using NLTK's stemmers and lemmatizers

In [None]:
import nltk
from nltk.stem import *

In [None]:
# download required resources
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# we'll compare two stemmers and a lemmatizer
lrStem = LancasterStemmer()
sbStem = SnowballStemmer("english")
wnLemm = WordNetLemmatizer()

In [None]:
# define a list of words to compare the stemmers on
listWords = ["going", "saying", "minimize", "maximum", 
             "meeting", "files", "tries", "is", "are", "beautiful",
             "summarize", "better", "dogs", "phenomena"]

In [None]:
for word in listWords:
    print("%15s %15s %15s %15s" % (lrStem.stem(word),
                                   sbStem.stem(word),
                                   wnLemm.lemmatize(word),
                                   wnLemm.lemmatize(word, 'v')))

          going              go           going              go
            say             say          saying             say
          minim           minim        minimize        minimize
          maxim         maximum         maximum         maximum
           meet            meet         meeting            meet
            fil            file            file            file
            tri             tri             try             try
             is              is              is              be
             ar             are             are              be
         beauty          beauti       beautiful       beautiful
           summ          summar       summarize       summarize
            bet          better          better          better
            dog             dog             dog             dog
       phenomen       phenomena      phenomenon       phenomena


### How to use NLTK stemmers / lemmatizers in Whoosh

In [None]:
# Dont change this! Use it as-is in your code
# This filter will run for both the index and the query
from whoosh.analysis import Filter
class CustomFilter(Filter):
    """Whoosh filter that rewrites each token's text with an arbitrary function.

    Wraps a callable such as an NLTK stemmer's `stem` or a lemmatizer's
    `lemmatize` so it can be chained into an analyzer with `|`. The same
    transformation is applied whether the analyzer runs for indexing or for
    query parsing, so index terms and query terms stay comparable.

    Args:
        filterFunc: callable invoked as `filterFunc(text, *args, **kwargs)`;
            its return value becomes the new token text.
        *args, **kwargs: extra arguments forwarded to `filterFunc` on every
            call (e.g. 'v' to lemmatize as verbs).
    """

    # NOTE(review): presumably tells Whoosh that this filter changes the form of
    # the words (stemming / lemmatizing) - confirm against the Whoosh analysis docs
    is_morph = True

    def __init__(self, filterFunc, *args, **kwargs):
        self.customFilter = filterFunc
        self.args = args
        self.kwargs = kwargs

    def __eq__(self, other):
        # Two filters compare equal when they are of exactly the same class.
        # (The previous signature lacked `other`, so any `==` raised TypeError.)
        return self.__class__ is other.__class__

    def __call__(self, tokens):
        for t in tokens:
            # t.mode is 'query' when called by the query parser and 'index' when
            # called by the indexer; both get the same transformation here.
            t.text = self.customFilter(t.text, *self.args, **self.kwargs)
            yield t

In [None]:
# Example1: Whoosh filter for NLTK's LancasterStemmer
myFilter1 = RegexTokenizer() | CustomFilter(LancasterStemmer().stem)
[token.text for token in myFilter1("We are going to do Text Analysis with whoosh.analysis")]

['we', 'ar', 'going', 'to', 'do', 'text', 'analys', 'with', 'whoosh.analysis']

In [None]:
# Example2: Whoosh filter for NLTK's WordNetLemmatizer
myFilter2 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize)
[token.text for token in myFilter2("We are going to do Text Analysis with whoosh.analysis")]

['We',
 'are',
 'going',
 'to',
 'do',
 'Text',
 'Analysis',
 'with',
 'whoosh.analysis']

In [None]:
# Example3: Whoosh filter for NLTK's WordNetLemmatizer for verbs
myFilter3 = RegexTokenizer() | CustomFilter(WordNetLemmatizer().lemmatize, 'v')
[token.text for token in myFilter3("We are going to do Text Analysis with whoosh.analysis")]

['We', 'be', 'go', 'to', 'do', 'Text', 'Analysis', 'with', 'whoosh.analysis']

You can now use myFilter1/2/3 as part of your Schema

------------
You can find details of other NLTK Stemmers and Lemmatizers here:

http://www.nltk.org/api/nltk.stem.html