# Assignment 2B: Feature computation

This notebook computes the features that are used in the ranking step.

Note that some features might be expensive, so you don't want to keep re-computing them. Instead, aim for writing a set of relatively simple feature extractors, each computing one or multiple features, and save their output to separate files. Then, load the pre-computed features from multiple files in the learning step (in the [ranking notebook](2_Ranking.ipynb)).

In [1]:
import json
import pandas as pd
from pprint import pprint
from tqdm import tqdm
import requests
from urllib.parse import urlencode
import math
from statistics import mean

# Base URLs of the remote ClueWeb12-B index service. Each one is a prefix that
# the caller completes with a query string or a document id.
# NOTE(review): the endpoint names (_search, _termvectors, _analyze) look like an
# Elasticsearch-style REST API -- confirm against the service documentation.
SEARCH_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b/_search?q=' # completed with e.g. united+states&df=title&size=20
SEARCH_ANCHORS_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b_anchors/_search?q=' # completed with e.g. united+states&df=anchors&size=20
TERMVECTORS_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b/' # completed with <doc_id>/_termvectors?term_statistics=true
EXISTS_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b/' # completed with <doc_id>/_exists
ANALYZE_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b/_analyze?' # completed with a urlencoded 'text' parameter, e.g. text=World%27s+biggest+dog

## Feature extractors

In [2]:
def query_length(query):
    """Return how many terms the (already tokenised) query contains."""
    term_count = len(query)
    return term_count

In [3]:
def field_length(doc_id, documents_stats, field=None):
    """Look up the stored length of one field of a document.

    The raw ``Field_length`` value recorded in ``documents_stats`` for
    ``(doc_id, field)`` is returned as-is; no normalisation is applied.
    When the lookup raises ``KeyError`` (no such document/field entry),
    the length is reported as 0.
    """
    try:
        entry = documents_stats.loc[doc_id, field]
        return entry['Field_length']
    except KeyError:
        # No statistics recorded for this (document, field) pair.
        return 0

In [4]:
def feature_pagerank(doc_id, pagerank):
    """Fetch the PageRank ``Score`` stored for ``doc_id``.

    ``pagerank`` is looked up by document id; an unknown ``doc_id``
    propagates the ``KeyError`` raised by the lookup.
    """
    doc_entry = pagerank.loc[doc_id]
    return doc_entry['Score']

In [17]:
def feature_query_coverage(query, doc_id, field, docterms, normalized=False):
    """Count how many query terms occur in one field of a document.

    Args:
        query: Sequence of query terms. A term that appears several times in
            the query is counted once per occurrence.
        doc_id: Identifier of the document.
        field: Name of the document field to inspect.
        docterms: DataFrame indexed by ``(DocumentId, Field, Term)``.
        normalized: When true, return the fraction of query terms that are
            covered instead of the raw count.

    Returns:
        The number of query terms that have an entry for ``(doc_id, field)``,
        or that number divided by the query length when ``normalized`` is
        set. An empty query yields 0 (0.0 when normalised).
    """
    # Presence is decided by index membership alone; the term frequency value
    # itself is irrelevant here, and the membership test works whether or not
    # the index contains duplicated keys.
    matched = sum(1 for term in query if (doc_id, field, term) in docterms.index)

    if normalized:
        total = len(query)
        # Guard against dividing by zero for an empty query.
        return matched / total if total else 0.0

    return matched

In [5]:
def feature_tf(query, doc_id, field, docterms, docs, strategy='sum', normalized=False):
    """Aggregate the term frequencies of the query terms in one document field.

    Args:
        query: Sequence of query terms.
        doc_id: Identifier of the document.
        field: Name of the document field.
        docterms: DataFrame indexed by ``(DocumentId, Field, Term)`` with a
            ``TermFreq`` column.
        docs: DataFrame indexed by ``(DocumentId, Field)`` with a
            ``Field_length`` column (only read when ``normalized`` is set).
        strategy: How the per-term frequencies are combined: ``'sum'``,
            ``'mean'``, ``'min'`` or ``'max'``.
        normalized: When true, a non-zero score is divided by the length of
            the document field.

    Returns:
        The aggregated term frequency; terms without an entry count as 0 and
        an empty query yields 0.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
        KeyError: If ``normalized`` is set, the score is non-zero and ``docs``
            has no entry for ``(doc_id, field)``.
    """
    aggregators = {
        'sum': sum,
        # Plain division instead of statistics.mean: the latter converts the
        # result back to the input type, which truncates the mean when the
        # frequencies are numpy integers.
        'mean': lambda values: sum(values) / len(values),
        'min': min,
        'max': max,
    }
    if strategy not in aggregators:
        raise ValueError(
            f'Unknown strategy {strategy!r}; expected one of {sorted(aggregators)}'
        )

    scores = []
    for term in query:
        try:
            freq = docterms.loc[(doc_id, field, term), 'TermFreq']
        except KeyError:
            # Term does not occur in this field of the document.
            scores.append(0)
            continue
        # The lookup gives a scalar for a unique key and a Series when the
        # key is duplicated in the index; use the first match in that case.
        scores.append(freq.iloc[0] if isinstance(freq, pd.Series) else freq)

    if not scores:
        # Empty query: nothing to aggregate (min/max/mean would fail).
        return 0

    score = aggregators[strategy](scores)

    if normalized and score != 0:
        score /= docs.loc[doc_id, field]['Field_length']

    return score

In [6]:
def feature_tfidf(query, doc_id, field, docterms, docs, strategy='sum', normalized=False):
    """Aggregate the TF-IDF weights of the query terms in one document field.

    Args:
        query: Sequence of query terms.
        doc_id: Identifier of the document.
        field: Name of the document field.
        docterms: DataFrame indexed by ``(DocumentId, Field, Term)`` with
            ``TermFreq`` and ``IDF`` columns.
        docs: DataFrame indexed by ``(DocumentId, Field)`` with a
            ``Field_length`` column (only read when ``normalized`` is set).
        strategy: How the per-term weights are combined: ``'sum'``,
            ``'mean'``, ``'min'`` or ``'max'``.
        normalized: When true, a non-zero score is divided by the length of
            the document field.

    Returns:
        The aggregated ``TermFreq * IDF`` value; terms without an entry count
        as 0 and an empty query yields 0.

    Raises:
        ValueError: If ``strategy`` is not one of the supported names.
        KeyError: If ``normalized`` is set, the score is non-zero and ``docs``
            has no entry for ``(doc_id, field)``.
    """
    aggregators = {'sum': sum, 'mean': mean, 'min': min, 'max': max}
    if strategy not in aggregators:
        raise ValueError(
            f'Unknown strategy {strategy!r}; expected one of {sorted(aggregators)}'
        )

    scores = []
    for term in query:
        try:
            # One lookup serves both columns.
            match = docterms.loc[(doc_id, field, term)]
            # A unique key gives a single row; a duplicated key gives a
            # frame, of which the first row is used.
            row = match.iloc[0] if isinstance(match, pd.DataFrame) else match
            # Plain floats keep statistics.mean from coercing to a numpy type.
            scores.append(float(row['TermFreq'] * row['IDF']))
        except KeyError:
            # Term does not occur in this field of the document.
            scores.append(0)

    if not scores:
        # Empty query: nothing to aggregate (min/max/mean would fail).
        return 0

    score = aggregators[strategy](scores)

    if normalized and score != 0:
        score /= docs.loc[doc_id, field]['Field_length']

    return score

In [7]:
def feature_bm25(query, doc_id, field, docterms, docs, collection, k1=1.2, b=0.75):
    """BM25 score of a query against one field of a document.

    Args:
        query: Sequence of query terms.
        doc_id: Identifier of the document.
        field: Name of the document field.
        docterms: DataFrame indexed by ``(DocumentId, Field, Term)`` with
            ``TermFreq`` and ``IDF`` columns.
        docs: DataFrame indexed by ``(DocumentId, Field)`` with a
            ``Field_length`` column.
        collection: DataFrame indexed by field with ``Field_length`` and
            ``Num_docs`` columns.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation parameter.

    Returns:
        The summed per-term BM25 contributions. Terms absent from the field
        contribute nothing, and 0 is returned when the document/field (or the
        field's collection statistics) cannot be found.
    """
    try:
        average_field_length = collection.loc[field]['Field_length'] / collection.loc[field]['Num_docs']
        document_field_length = docs.loc[doc_id, field]['Field_length']
    except KeyError:
        # No statistics for this document field.
        return 0

    length_ratio = document_field_length / average_field_length
    normalization = k1 * (1 - b + b * length_ratio)

    score = 0
    for term in query:
        try:
            # One lookup serves both columns.
            match = docterms.loc[(doc_id, field, term)]
            # A unique key gives a single row; a duplicated key gives a
            # frame, of which the first row is used.
            row = match.iloc[0] if isinstance(match, pd.DataFrame) else match
            tf, idf = row['TermFreq'], row['IDF']
        except KeyError:
            # Term does not occur in this field: no contribution.
            continue
        score += idf * (tf * (1 + k1)) / (tf + normalization)

    return score

In [8]:
def feature_lm(query, doc_id, field, docterms, docs, terms, collection, lambda_param=0.75):
    """Smoothed language-model score of a query against one document field.

    For every query term the document probability ``tf / |field|`` and the
    collection probability ``sum_tf / |collection field|`` are mixed as
    ``(1 - lambda_param) * p_doc + lambda_param * p_coll`` and the logarithm
    of the mixture is added to the score.

    Args:
        query: Sequence of query terms.
        doc_id: Identifier of the document.
        field: Name of the document field.
        docterms: DataFrame indexed by ``(DocumentId, Field, Term)`` with a
            ``TermFreq`` column.
        docs: DataFrame indexed by ``(DocumentId, Field)`` with a
            ``Field_length`` column.
        terms: DataFrame indexed by ``(Term, Field)`` with a ``SumTermFreq``
            column.
        collection: DataFrame indexed by field with a ``Field_length`` column.
        lambda_param: Weight of the collection model in the mixture.

    Returns:
        The summed log-probabilities. A term whose mixed probability is not
        positive contributes 0, and 0 is returned when the document/field
        statistics cannot be found.
    """
    try:
        document_field_length = docs.loc[doc_id, field]['Field_length']
        collection_field_length = collection.loc[field]['Field_length']
    except KeyError:
        # No statistics for this document field.
        return 0

    score = 0
    for term in query:
        try:
            freq = docterms.loc[(doc_id, field, term), 'TermFreq']
            # Scalar for a unique key, Series when the key is duplicated.
            tf = freq.iloc[0] if isinstance(freq, pd.Series) else freq
        except KeyError:
            tf = 0

        try:
            sum_tf = terms.loc[term, field]['SumTermFreq']
        except KeyError:
            # Term has no collection statistics for this field; treat it as
            # never occurring, like the other missing-data cases here.
            sum_tf = 0

        # An empty document field has no term probabilities of its own.
        ptd = tf / document_field_length if document_field_length else 0
        ptc = sum_tf / collection_field_length

        raw_score = (1 - lambda_param) * ptd + lambda_param * ptc
        score += math.log(raw_score) if raw_score > 0 else 0

    return score

---
## File loading utilities

In [9]:
def process_query(q):
    """Tokenise a raw query string with the index's analyzer.

    The query is sent to the analyze endpoint (``ANALYZE_URL``) as the
    ``text`` parameter and the ``token`` value of every returned token is
    collected.

    Args:
        q: Raw query text.

    Returns:
        List of analysed query terms, in the order returned by the service.

    Raises:
        ValueError: If the service reply is empty or carries no ``tokens``
            entry (for example an error payload).
    """
    params = urlencode({'text': q})
    query = f'{ANALYZE_URL}{params}'
    response = json.loads(requests.get(query).text)

    # An error reply is a non-empty object without 'tokens'; report it the
    # same way as an empty reply instead of failing with a bare KeyError.
    if not isinstance(response, dict) or 'tokens' not in response:
        raise ValueError(f'Query {q} could not be processed')

    return [item['token'] for item in response['tokens']]

In [10]:
def load_queries(path):
    """Read a query file and analyse every query.

    Each non-blank line is expected to hold a query id followed by the query
    text, separated by whitespace. The text is passed through
    ``process_query``.

    Args:
        path: Path of the query file.

    Returns:
        Dict mapping query id (string) to the list of analysed query terms.
        When an id occurs more than once, the last occurrence wins.
    """
    queries = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. a trailing newline at the end of the file).
                continue
            q_id, *words = parts
            queries[q_id] = process_query(' '.join(words))
    return queries

In [11]:
def doc_in_index(doc_id):
    """Ask the index service whether ``doc_id`` is present.

    Returns the ``'exists'`` entry of the JSON reply from the exists endpoint.
    """
    url = f'{EXISTS_URL}{doc_id}/_exists'
    payload = requests.get(url).text
    return json.loads(payload)['exists']

In [12]:
def load_documents(query, q_id, ranking):
    """Retrieve all documents in the first-pass ranking for a given query.

    Args:
        query: Unused; kept so existing callers keep working.
        q_id: Identifier of the query.
        ranking: DataFrame indexed by query id with a ``DocumentId`` column.

    Returns:
        List of document ids ranked for ``q_id``, in the order they appear in
        ``ranking``.

    Raises:
        KeyError: If ``q_id`` is not present in ``ranking``.
    """
    # Selecting with a list of labels always yields a Series, even when the
    # query has a single ranked document (a bare label would give a scalar,
    # on which .to_list() fails).
    return ranking.loc[[q_id], 'DocumentId'].to_list()

In [13]:
# Queries (analysed through the index service), relevance judgements and
# first-pass rankings.
train_queries = load_queries('data/queries.txt')
test_queries = load_queries('data/queries2.txt')

train_qrels = pd.read_csv('data/qrels.csv').set_index('QueryId')
train_first_pass = pd.read_csv('data/ranking_bm25.csv').set_index('QueryId')
test_first_pass = pd.read_csv('data/ranking2_bm25.csv').set_index('QueryId')

# Pre-computed statistics: per term, per field of the collection, per
# document field and per term within a document field.
terms_stats = pd.read_csv('data/stats_terms.tsv', sep='\t').set_index(['Term', 'Field'])
collection_stats = pd.read_csv('data/stats_coll.tsv', sep='\t').set_index('Field')
docs_stats = pd.read_csv('data/stats_docs.tsv', sep='\t').set_index(['DocumentId', 'Field'])
docterms_stats = (
    pd.read_csv('data/stats_docs_terms.tsv', sep='\t')
    .set_index(['DocumentId', 'Field', 'Term'])
)

# PageRank scores: space-separated file without a header row.
pagerank = pd.read_csv(
    'data/pagerank.csv',
    sep=' ',
    header=None,
    names=['DocumentId', 'Score'],
    index_col='DocumentId',
)

---
## Feature computation

For each feature extractor above, we compute its features for the documents returned by the first-pass retrieval. Features are computed for both the train and test sets and saved to files so that they can be loaded in the ranking notebook.

In [22]:
def compute_features(mode='train', feature_set='qcoverage'):
    """Compute one group of features for every first-pass (query, document) pair.

    The result is written to ``data/{mode}_features_{feature_set}.csv`` with
    one row per pair, indexed by ``QueryId`` and ``DocumentId``.

    Args:
        mode: ``'train'`` uses the training queries and first-pass ranking;
            any other value uses the test queries and ranking. The value is
            also the prefix of the output file name.
        feature_set: Name of the feature group to compute, which is also the
            suffix of the output file name. One of ``'length'``,
            ``'qlength'``, ``'tf'``, ``'tfidf'``, ``'qcoverage'`` (default),
            ``'pagerank'``, ``'bm25'`` or ``'lm'``.
            NOTE(review): only the ``qcoverage`` file name was produced by
            the previous version of this function; make sure the other
            suffixes match what the ranking notebook loads.

    Raises:
        ValueError: If ``feature_set`` is not a known group.
    """
    fields = ('title', 'content', 'anchors')
    # Column order used by the length, BM25 and LM groups.
    content_first = ('content', 'title', 'anchors')

    def aggregated(extractor, label, query, d):
        """One column per (normalisation, strategy, field) combination."""
        return {
            f"{'normalized_' if normalized else ''}{label}_{field}_{strategy}":
                extractor(query, d, field, docterms_stats, docs_stats,
                          strategy=strategy, normalized=normalized)
            for normalized in (False, True)
            for strategy in ('sum', 'mean', 'max', 'min')
            for field in fields
        }

    # Each entry maps (query, document id) to a dict of feature columns.
    groups = {
        'length': lambda query, d: {
            f'{field}_length': field_length(d, docs_stats, field)
            for field in content_first
        },
        'qlength': lambda query, d: {'query_length': query_length(query)},
        'tf': lambda query, d: aggregated(feature_tf, 'TF', query, d),
        'tfidf': lambda query, d: aggregated(feature_tfidf, 'TFIDF', query, d),
        'qcoverage': lambda query, d: {
            f"{'normalized_' if normalized else ''}{field}_query_coverage":
                feature_query_coverage(query, d, field, docterms_stats, normalized=normalized)
            for normalized in (False, True)
            for field in fields
        },
        'pagerank': lambda query, d: {'pagerank_score': feature_pagerank(d, pagerank)},
        'bm25': lambda query, d: {
            f'bm25_{field}': feature_bm25(query, d, field, docterms_stats, docs_stats,
                                          collection_stats)
            for field in content_first
        },
        'lm': lambda query, d: {
            f'lm_{field}': feature_lm(query, d, field, docterms_stats, docs_stats,
                                      terms_stats, collection_stats)
            for field in content_first
        },
    }
    if feature_set not in groups:
        raise ValueError(
            f'Unknown feature set {feature_set!r}; expected one of {sorted(groups)}'
        )
    extract = groups[feature_set]

    if mode == 'train':
        queries, first_pass = train_queries, train_first_pass
    else:
        queries, first_pass = test_queries, test_first_pass

    features = []
    for q_id, query in tqdm(queries.items()):
        for d in load_documents(query, q_id, first_pass):
            features.append(dict(QueryId=q_id, DocumentId=d, **extract(query, d)))

    features = pd.DataFrame.from_dict(features).set_index(['QueryId', 'DocumentId'])

    # write computed features to file
    features.to_csv(f'data/{mode}_features_{feature_set}.csv')

In [23]:
# Produce the feature files for both splits, train first.
for split in ('train', 'test'):
    compute_features(mode=split)

  
100%|██████████| 50/50 [00:25<00:00,  1.92it/s]
100%|██████████| 50/50 [00:24<00:00,  2.04it/s]
