# Assignment 2B: Feature computation

The purpose of this notebook is to compute features for query-document pairs.

Note that some features might be expensive to compute, so you don't want to keep re-computing them. Instead, aim to write a set of relatively simple feature extractors, each computing one or more features, and save their output to separate files. Then, load the pre-computed features from these files in the learning step (in the [ranking notebook](2_Ranking.ipynb)).

## Feature extractors

In [1]:
import json
import pandas as pd
from pprint import pprint
from tqdm import tqdm
import requests
from urllib.parse import urlencode

SEARCH_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b/_search?q=' #q=united+states&df=title&size=20
SEARCH_ANCHORS_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b_anchors/_search?q=' #united+states&df=anchors&size=20
TERMVECTORS_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b/' #doc_id/_termvectors?term_statistics=true
EXISTS_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b/' # doc_id/_exists
ANALYZE_URL = 'http://gustav1.ux.uis.no:5002/clueweb12b/_analyze?' #World%27s+biggest+dog

In [2]:
def query_length(query):
    """Number of terms in a '+'-separated query.

    Empty segments are not counted, so an empty query has length 0 and
    repeated, leading or trailing '+' separators do not inflate the count.

    Args:
        query: Query string whose terms are joined with '+'.

    Returns:
        The number of non-empty terms in the query.
    """
    return sum(1 for term in query.split('+') if term)

In [3]:
def field_length(doc_id, documents_stats, field=None):
    """Look up the stored length of one field of a document.

    Args:
        doc_id: Index label of the document in `documents_stats`.
        documents_stats: DataFrame with 'Field' and 'Field_length' columns.
        field: Field name to match against the 'Field' column.

    Returns:
        The 'Field_length' value of the row(s) matching `doc_id` and `field`
        (returned as stored, without any normalisation), or 0 when the
        lookup raises a KeyError (e.g. no row for this document and field).
    """
    try:
        field_mask = documents_stats['Field'] == field
        rows_for_field = documents_stats[field_mask]
        return rows_for_field.loc[doc_id]['Field_length']
    except KeyError:
        return 0

In [4]:
def old_feature_lm(query, doc, field):
    """Feature: LM retrieval score on a given field.

    Fetches the term vectors of `doc` from the index and returns the summed
    term frequency of the query terms in `field`, divided by the field length
    (the sum of all term frequencies in that field).

    Args:
        query: Query terms separated by whitespace or '+'.
        doc: ID of the document to score.
        field: One of 'title', 'content' or 'anchors'.

    Returns:
        The relative frequency of the query terms in the field, or 0 when the
        response holds no term vector for the field or the field is empty.

    Raises:
        AssertionError: If `field` is not one of the supported fields.
        ValueError: If the index does not answer with HTTP status 200.
    """
    assert field in ['title', 'content', 'anchors']

    r = requests.get(f'{TERMVECTORS_URL}{doc}/_termvectors?term_statistics=true')
    if r.status_code != 200:
        raise ValueError('Index did not respond')

    # The response may lack an entry for this field (presumably when the
    # document has no terms in it); score such documents as 0.
    field_vector = json.loads(r.text).get('term_vectors', {}).get(field)
    if field_vector is None:
        return 0
    terms = field_vector['terms']

    # Accept both whitespace- and '+'-separated queries.
    query_terms = query.replace('+', ' ').split()
    tf = sum(terms[term]['term_freq'] for term in query_terms if term in terms)

    # Document (field) length is the total number of term occurrences.
    len_d = sum(t['term_freq'] for t in terms.values())
    if len_d == 0:
        return 0

    return tf / len_d

In [5]:
def old_feature_es_score(query, doc, field, size=100):
    """Feature: BM25 retrieval score on a given field.

    Runs `query` against `field` and returns the `_score` the index reports
    for `doc` if it appears among the top `size` hits. BM25 is presumably the
    similarity configured on the index; this function only reads `_score`.

    Args:
        query: Query string, appended as-is to the search URL.
        doc: ID of the document whose score is wanted.
        field: Field to search in; 'anchors' is served by the anchors index,
            any other value by the main index.
        size: Number of top hits to request.

    Returns:
        The hit's `_score`, or 0 if `doc` is not among the returned hits.

    Raises:
        ValueError: If the index does not answer with HTTP status 200.
    """
    # Only the requested field is searched; anchors are queried through their
    # own search URL.
    base_url = SEARCH_ANCHORS_URL if field == 'anchors' else SEARCH_URL
    response = requests.get(f'{base_url}{query}&df={field}&size={size}')
    if response.status_code != 200:
        raise ValueError('Index did not respond')

    hits = json.loads(response.text)['hits']['hits']
    for hit in hits:
        if hit['_id'] == doc:
            return hit['_score']

    return 0

In [6]:
def feature_pagerank(doc_id, pagerank):
    """Feature: the 'Score' value stored for a document in the PageRank table.

    Args:
        doc_id: Index label of the document in `pagerank`.
        pagerank: DataFrame indexed by document ID with a 'Score' column.

    Returns:
        The 'Score' entry for `doc_id`.

    Raises:
        KeyError: If `doc_id` is not in the index of `pagerank`.
    """
    doc_row = pagerank.loc[doc_id]
    return doc_row['Score']

In [7]:
def feature_tf(query, doc_id, field=None):
    """Placeholder for a term-frequency feature; not implemented yet.

    Accepts a query, a document ID and an optional field name, performs no
    computation and returns None.
    """
    return None

## Feature computation

Computes features for document-query pairs and saves them to a file.

Specifically, we will save features to a JSON file, using a nested map structure, with queries on the first level, documents on the second level, and individual features on the third level. 

```python
  features = {
      'query_i': {
          'doc_j': {
              'feature_1': 0,  # value of feature_1 for (query_i, doc_j) pair
              'feature_2': 0,  # value of feature_2 for (query_i, doc_j) pair
              ...
          }
          ...
      }
      ...
  }
```

**Note**: The set of documents for a query (for which you want to compute features) should be a combination of the documents for which you have relevance labels and the top-100 documents retrieved in first-pass retrieval.
You can then decide in the learning part if/how you want to deal with class imbalance.

---
## File loading utilities

In [8]:
def process_query(q):
    """Normalise a raw query string into the '+'-separated form used in search URLs.

    The query is lowercased and every space is replaced with '+'. The index's
    analyzer endpoint (ANALYZE_URL) is not used for this step.

    Args:
        q: Raw query text.

    Returns:
        The lowercased query with spaces replaced by '+'.
    """
    return q.lower().replace(' ', '+')

In [9]:
def load_queries(path):
    """Load queries from a text file into a {query_id: processed_query} dict.

    Each non-blank line is split on whitespace: the first token is the query
    ID and the remaining tokens, joined with single spaces, are the query
    text, which is normalised with `process_query`. Blank lines are skipped
    instead of raising an IndexError.

    Args:
        path: Path of the query file.

    Returns:
        Dict mapping query ID to processed query string.
    """
    queries = {}
    with open(path) as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank or whitespace-only line: there is no query ID to read.
                continue
            queries[tokens[0]] = process_query(' '.join(tokens[1:]))
    return queries

In [10]:
def doc_in_index(doc_id):
    """Ask the index whether it holds a document with the given ID.

    Sends a GET request to the `_exists` endpoint for `doc_id` and returns
    the value stored under the 'exists' key of the JSON reply.
    """
    exists_url = f'{EXISTS_URL}{doc_id}/_exists'
    reply = requests.get(exists_url)
    payload = json.loads(reply.text)
    return payload['exists']

In [11]:
def load_documents(query, q_id, ranking):
    """Retrieve all documents in the first-pass ranking for a given query.

    Args:
        query: Query text (unused; kept for interface compatibility).
        q_id: Query ID to look up in the index of `ranking`.
        ranking: DataFrame indexed by query ID with a 'DocumentId' column.

    Returns:
        List of document IDs ranked for `q_id`, in the order they appear in
        `ranking`.

    Raises:
        KeyError: If `q_id` is not present in `ranking`.
    """
    # Selecting with a list keeps the result a Series even when the query has
    # exactly one ranked document; a scalar label would yield a plain string
    # there and `.to_list()` would fail.
    documents = ranking.loc[[q_id], 'DocumentId'].to_list()
    return documents

In [12]:
# Queries, keyed by query ID
train_queries = load_queries('data/train_queries.txt')
test_queries = load_queries('data/test_queries.txt')

# Relevance judgements and first-pass rankings, indexed by query ID
train_qrels = pd.read_csv('data/qrels.csv').set_index('QueryId')
train_first_pass = pd.read_csv('data/ranking_bm25_train.csv').set_index('QueryId')
test_first_pass = pd.read_csv('data/ranking_bm25_test.csv').set_index('QueryId')

# Collection statistics (tab-separated files)
terms_stats = pd.read_csv('data/stats_terms.tsv', sep='\t').set_index(['Term', 'Field'])
collection_stats = pd.read_csv('data/stats_coll.tsv', sep='\t').set_index('Field')
docs_stats = pd.read_csv('data/stats_docs.tsv', sep='\t').set_index(['DocumentId'])
docterms_stats = (
    pd.read_csv('data/stats_docs_terms.tsv', sep='\t')
    .set_index(['DocumentId', 'Field', 'Term'])
)

# PageRank scores: space-separated, no header row, indexed by document ID
pagerank = pd.read_csv(
    'data/pagerank.csv',
    sep=' ',
    header=None,
    names=['DocumentId', 'Score'],
    index_col='DocumentId',
)

In [17]:
# Preview the first rows (up to 50) of the per-document term statistics.
docterms_stats.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TermFreq,IDF
DocumentId,Field,Term,Unnamed: 3_level_1,Unnamed: 4_level_1
clueweb12-0000tw-05-12114,anchors,raspberri,5023,9.619303
clueweb12-0000tw-05-12114,anchors,pi,5071,9.058517
clueweb12-0000wb-30-01951,anchors,raspberri,1089,9.588754
clueweb12-0000wb-30-01951,anchors,pi,1089,9.079421
clueweb12-0000wb-60-01497,content,raspberri,16,6.231949
clueweb12-0000wb-60-01497,title,raspberri,1,9.511528
clueweb12-0000wb-60-01497,anchors,raspberri,387,9.621843
clueweb12-0000wb-60-01497,content,pi,17,5.360829
clueweb12-0000wb-60-01497,title,pi,1,8.660969
clueweb12-0000wb-60-01497,anchors,pi,387,9.069365


---
## Feature computation

In [14]:
def compute_features(mode='train', feature_set='field_length'):
    """Compute one feature set for every (query, document) pair and save it as CSV.

    For each query of the chosen split, the documents of its first-pass
    ranking are loaded and the selected extractor is applied to each of them.
    The result is written to 'data/{mode}_features_{feature_set}.csv',
    indexed by (QueryId, DocumentId).

    Args:
        mode: 'train' uses the training queries and ranking; any other value
            uses the test queries and ranking.
        feature_set: Which features to compute: 'field_length' (default) or
            'pagerank'.

    Raises:
        ValueError: If `feature_set` is not a known feature set.
    """
    if mode == 'train':
        queries = train_queries
        first_pass = train_first_pass
    else:
        queries = test_queries
        first_pass = test_first_pass

    # Each extractor maps (query, doc_id) to a dict of feature name -> value.
    # Term-frequency and BM25 sets are not offered: feature_tf is still a stub
    # and feature_bm25 is not available here.
    extractors = {
        'field_length': lambda query, d: dict(
            content_length=field_length(d, docs_stats, 'content'),
            title_length=field_length(d, docs_stats, 'title'),
            anchors_length=field_length(d, docs_stats, 'anchors'),
        ),
        'pagerank': lambda query, d: dict(
            pagerank_score=feature_pagerank(d, pagerank),
        ),
    }
    if feature_set not in extractors:
        raise ValueError(
            f'Unknown feature set {feature_set!r}; expected one of {sorted(extractors)}'
        )
    extract = extractors[feature_set]

    features = []

    for q_id, query in tqdm(queries.items()):

        docs = load_documents(query, q_id, first_pass)

        for d in docs:
            f = dict(QueryId=q_id, DocumentId=d)
            f.update(extract(query, d))
            features.append(f)

    features = pd.DataFrame(features).set_index(['QueryId', 'DocumentId'])

    # Write computed features to a file named after the feature set, so
    # different sets do not overwrite each other.
    features.to_csv(f'data/{mode}_features_{feature_set}.csv')

In [15]:
#compute_features(mode='train')

In [16]:
#compute_features(mode='test')