# Indexing

In [18]:
import pandas as pd
import os
import json
import pandas as pd

In [2]:
import pyterrier as pt

# Initialise PyTerrier only if this kernel has not already done so.
# The terrier-prf boot package is loaded because it is needed for RM3.
if not pt.started():
    pt.init(
        boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"],
    )

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


#### Load data
It is not necessary to load all documents into a DataFrame before indexing, since doing so occupies a lot of memory. Here I do it anyway to see what the documents look like.

In [4]:
# Read the JSON-lines corpus (one document per line), keeping every value as str.
df_docs = pd.read_json(
    'NIR2022 dataset/corpus.jsonl',
    lines=True,
    dtype=str,
)

In [None]:
# The indexers read the document identifier from a 'docno' field, so rename
# the corpus '_id' column accordingly (the shape is unaffected by the rename).
df_docs = df_docs.rename(columns={'_id': 'docno'})
print(df_docs.shape)
df_docs.head()

In [19]:
# Training topics: every column is read as str.
topics_df = pd.read_csv("NIR2022 dataset/train_query.csv", dtype=str)
print(topics_df.shape)
# A bare expression in the middle of a cell is never rendered, so show the
# frame explicitly (display() is provided by the IPython kernel).
display(topics_df)

# Training relevance judgements.
# NOTE(review): read without dtype=str, so 'qid' here may be numeric while
# topics_df['qid'] is str - confirm this is what downstream code expects.
qrels_df = pd.read_csv("NIR2022 dataset/train_qrel.csv")
print(qrels_df.shape)
qrels_df.head()

(200, 2)


Unnamed: 0,qid,query
0,301,international organized crime
1,302,poliomyelitis and post polio
2,303,hubble telescope achievements
3,306,african civilian deaths
4,307,new hydroelectric projects
...,...,...
195,693,newspapers electronic media
196,695,white collar crime sentence
197,697,air traffic controller
198,698,literacy rates africa


Default setting, with stop word removal and Porter stemming.

In [None]:
# Baseline index: no term-pipeline properties are overridden here, so the
# indexer's default configuration applies.
# The build is skipped when the index directory already exists.
if not os.path.exists('./indexes_p/iterindex'):
    iter_indexer = pt.IterDictIndexer(
        "./indexes_p/iterindex",
        meta=["docno", "title", "text"],
        meta_lengths=[20, 100, 4096],  # max stored characters per meta key
        blocks=True,
        overwrite=True,
    )

    # Index the 'title' and 'text' fields of every corpus record.
    indexref = iter_indexer.index(
        df_docs.to_dict(orient='records'),
        fields=["title", "text"],
    )

With stop word removal but no stemming.

In [None]:
# Variant without stemming: the term pipeline is reduced to stop-word removal.
# The build is skipped when the index directory already exists.
if not os.path.exists('./indexes_p/iterindex_noprocess'):
    iter_indexer_np = pt.IterDictIndexer(
        "./indexes_p/iterindex_noprocess",
        meta=["docno", "title", "text"],
        meta_lengths=[20, 100, 4096],  # max stored characters per meta key
        blocks=True,
        overwrite=True,
    )
    # Only 'Stopwords' in the pipeline, i.e. no stemmer stage.
    iter_indexer_np.setProperty('termpipelines','Stopwords')

    # Index the 'title' and 'text' fields of every corpus record.
    indexref_np = iter_indexer_np.index(
        df_docs.to_dict(orient='records'),
        fields=["title", "text"],
    )

With stop word removal and snowball stemming.

In [None]:
# optimize with snowball stemmer
# Build only when the index directory is missing, like the other two index
# cells: with overwrite=True an unguarded cell rebuilds the whole index on
# every run.
if not os.path.exists('./indexes_p/iterindex_opt'):
    iter_indexer_opt1 = pt.IterDictIndexer(
        "./indexes_p/iterindex_opt",
        overwrite=True,
        meta=["docno", "title", "text"],
        meta_lengths=[20, 100, 4096],  # max stored characters per meta key
        blocks=True
    )
    # English tokeniser, then stop-word removal followed by Snowball stemming.
    iter_indexer_opt1.setProperty('tokeniser','EnglishTokeniser')
    iter_indexer_opt1.setProperty('termpipelines','Stopwords,EnglishSnowballStemmer')

    # Index the 'title' and 'text' fields of every corpus record.
    indexref_opt1 = iter_indexer_opt1.index(df_docs.to_dict(orient='records'), fields=["title", "text"])

In [3]:
# Open the three on-disk indexes built by the indexing cells.
index_np = pt.IndexFactory.of('./indexes_p/iterindex_noprocess')
index = pt.IndexFactory.of('./indexes_p/iterindex')
index_opt1 = pt.IndexFactory.of('./indexes_p/iterindex_opt')

# Print the collection statistics in the order: no stemming, default, snowball.
for loaded_index in (index_np, index, index_opt1):
    print(loaded_index.getCollectionStatistics())

Number of documents: 528155
Number of terms: 621458
Number of postings: 90480384
Number of fields: 2
Number of tokens: 145322007
Field names: [title, text]
Positions:   true

Number of documents: 528155
Number of terms: 520520
Number of postings: 83716130
Number of fields: 2
Number of tokens: 145322007
Field names: [title, text]
Positions:   true

Number of documents: 528155
Number of terms: 521571
Number of postings: 83641265
Number of fields: 2
Number of tokens: 145322007
Field names: [title, text]
Positions:   true



### Statistics

In [12]:
# Fetch the collection statistics of the snowball index once and reuse the
# same object for both printing and the later cells that read `stats`.
stats = index_opt1.getCollectionStatistics()
print(stats)

Number of documents: 528155
Number of terms: 521571
Number of postings: 83641265
Number of fields: 2
Number of tokens: 145322007
Field names: [title, text]
Positions:   true



In [15]:
# Average document length as reported by the collection statistics.
avg_doc_len = stats.getAverageDocumentLength()
print(f"Average document length: {avg_doc_len}")

Average document length: 275.1503005746419


In [42]:
import nltk

# Number of NLTK word tokens in each training query.
docs_len = [
    len(nltk.tokenize.word_tokenize(query))
    for query in topics_df['query']
]

print(f"The average length of queries: {sum(docs_len) / len(docs_len)}")

The average length of queries: 2.745


In [45]:
# Test topics, with every column read as str.
topics_df_test = pd.read_csv(
    "NIR2022 dataset/test_query.csv",
    dtype=str,
)
print(topics_df_test.shape)
topics_df_test.head()

(50, 2)


Unnamed: 0,qid,query
0,304,endangered species mammals
1,305,most dangerous vehicles
2,310,radio waves and brain cancer
3,311,industrial espionage
4,314,marine vegetation


In [52]:
# Per-query mean of the numeric qrels columns, then summary statistics of
# those per-query means across all queries.
# numeric_only=True keeps non-numeric columns out of the mean; newer pandas
# raises a TypeError instead of silently dropping them.
qrels_df.groupby('qid').mean(numeric_only=True).describe()

Unnamed: 0,label,iteration
count,199.0,199.0
mean,0.064724,0.0
std,0.072932,0.0
min,0.002838,0.0
25%,0.019741,0.0
50%,0.043478,0.0
75%,0.084021,0.0
max,0.623188,0.0


In [46]:
# Number of NLTK word tokens in each test query.
docs_len = [
    len(nltk.tokenize.word_tokenize(query))
    for query in topics_df_test['query']
]

print(f"The average length of queries: {sum(docs_len) / len(docs_len)}")

The average length of queries: 2.84
