# Assignment 2B: Ranking

This notebook contains the skeleton for training a model and then applying it to produce a document ranking.

## Loading the precomputed features

The code below loads the precomputed features and combines them into feature vectors for query-document pairs.

For this part to work, you'll need to run the `1_Feature_computation` notebook first to generate the feature CSV files that are loaded below.

In [1]:
import json
from pprint import pprint

import pandas as pd
# `display` and `HTML` are part of the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import HTML, display

# Disable pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None

# Let the notebook container use the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
def load_features(features_to_load):
    """Load feature columns from several CSV files into one DataFrame.

    Args:
        features_to_load: iterable of dicts, each with a ``'file'`` entry
            (path of a CSV file) and a ``'select'`` entry (list of column
            names to keep from that file).

    Returns:
        A DataFrame indexed by ``(QueryId, DocumentId)`` whose columns are
        the selected columns of every file, concatenated side by side in the
        order the files were given.
    """
    index_columns = ['QueryId', 'DocumentId']
    frames = []

    for spec in features_to_load:
        print(f'Loading features from {spec["file"]}')
        with open(spec['file']) as handle:
            frame = pd.read_csv(handle, index_col=index_columns)
        # Keep only the columns requested for this file.
        frames.append(frame[spec['select']])

    return pd.concat(frames, axis='columns')

In [3]:
def load_qrels(path):
    """Read the relevance judgements (qrels) CSV file at ``path``.

    Returns:
        A DataFrame indexed by ``(QueryId, DocumentId)`` holding the
        remaining columns of the file.
    """
    with open(path) as handle:
        return pd.read_csv(handle, index_col=['QueryId', 'DocumentId'])

In [4]:
def normalize_dataframe(df):
    """Min-max scale every column of ``df`` into the range [0, 1].

    Each column is shifted by its minimum and divided by its value range
    (max - min). A constant column has a range of zero; it is mapped to 0.0
    instead of the NaN that a 0/0 division would produce.

    Args:
        df: DataFrame with numeric columns.

    Returns:
        A new DataFrame with the same index and columns; ``df`` itself is
        left unmodified.
    """
    col_min = df.min()
    col_range = df.max() - col_min
    # Avoid 0/0 for constant columns: the numerator is already 0 there, so
    # dividing by 1 yields 0.0 for every row.
    col_range = col_range.replace(0, 1)
    return (df - col_min) / col_range

In [5]:
# Each entry names a precomputed feature file and the columns to take from it.
# To use more features, append an entry to both lists with matching columns
# (e.g. 'bm25_title' and 'bm25_content' from 'data/features_bm25.csv';
# NOTE(review): that file is not loaded here, so verify it exists first).
train_features = [
    dict(file='data/train_features_pagerank.csv',
         select=['pagerank_score']),
    dict(file='data/train_features_field_length.csv',
         select=['content_length', 'title_length', 'anchors_length']),
]

test_features = [
    dict(file='data/test_features_pagerank.csv',
         select=['pagerank_score']),
    dict(file='data/test_features_field_length.csv',
         select=['content_length', 'title_length', 'anchors_length']),
]

# Replace the specifications with the loaded feature frames.
train_features = load_features(train_features)
feature_list = train_features.columns.tolist()

test_features = load_features(test_features)
# Training and test data must expose the same features in the same order.
assert feature_list == test_features.columns.tolist()

Loading features from data/train_features_pagerank.csv
Loading features from data/train_features_field_length.csv
Loading features from data/test_features_pagerank.csv
Loading features from data/test_features_field_length.csv


In [6]:
# Relevance judgements, indexed by (QueryId, DocumentId).
qrels = load_qrels('data/qrels.csv')

# Scale the training features, attach the labels with a left join, and fill
# every remaining missing value with 0.0: documents in the index but not in
# the qrels have a relevance of zero.
train_data = (
    normalize_dataframe(train_features)
    .join(qrels, how='left')
    .fillna(0.0)
)

# NOTE(review): the test features are scaled with their own min/max, not with
# the statistics of the training set — confirm that this is intended.
test_data = normalize_dataframe(test_features)

In [7]:
# Preview the normalized feature vectors of the test queries.
test_data

Unnamed: 0_level_0,Unnamed: 1_level_0,pagerank_score,content_length,title_length,anchors_length
QueryId,DocumentId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
240,clueweb12-1714wb-62-05205,0.002519,0.034397,0.115385,0.000000
240,clueweb12-1910wb-56-32265,0.000000,0.023207,0.134615,0.001178
240,clueweb12-0012wb-03-18567,0.003036,0.039390,0.057692,0.017577
240,clueweb12-0003wb-42-21171,0.000000,0.038426,0.096154,0.000491
240,clueweb12-0501wb-41-12471,0.000082,0.011397,0.115385,0.006186
...,...,...,...,...,...
250,clueweb12-1208wb-74-36611,0.002356,0.081534,0.096154,0.185202
250,clueweb12-0612wb-80-21169,0.002519,0.054471,0.115385,0.000736
250,clueweb12-0412wb-17-24226,0.002163,0.027373,0.076923,0.009967
250,clueweb12-1809wb-76-08314,0.002592,0.082498,0.134615,0.000000


In [8]:
# Preview the normalized training feature vectors with their relevance labels.
train_data

Unnamed: 0_level_0,Unnamed: 1_level_0,pagerank_score,content_length,title_length,anchors_length,Relevance
QueryId,DocumentId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
201,clueweb12-1700tw-22-12689,0.000000,0.016158,0.050847,0.000000,0.0
201,clueweb12-0915wb-34-01721,0.000000,0.011953,0.076271,0.000512,0.0
201,clueweb12-0713wb-35-13648,0.018176,0.028308,0.025424,0.022801,1.0
201,clueweb12-0900wb-98-24673,0.000000,0.007390,0.042373,0.000000,0.0
201,clueweb12-0909wb-96-26247,0.001158,0.014655,0.050847,0.000470,0.0
...,...,...,...,...,...,...
239,clueweb12-1111wb-58-09080,0.000173,0.014637,0.059322,0.002220,0.0
239,clueweb12-1507wb-91-37258,0.000000,0.007104,0.025424,0.000128,0.0
239,clueweb12-1504wb-88-20525,0.017532,0.007480,0.008475,0.032365,0.0
239,clueweb12-1506wb-78-03624,0.000000,0.007140,0.033898,0.000171,0.0


## Training a model

Training needs to be done differently based on the scenario:

  * **Scenario 1**: The model is trained using cross-validation, that is, on 4/5 of the queries, and then applied to the remaining 1/5 of the queries (repeated 5 times).
  * **Scenario 2**: The model is trained on all available training data.
  
The feature vectors at this point are already created. These should contain both (a) the training queries and (b) the queries on which you want to apply your model.

Train your model on queries (a). For that you'll also need to load the corresponding relevance labels.

In [25]:
from sklearn.model_selection import GroupKFold
from sklearn.base import clone
from sklearn.ensemble import AdaBoostRegressor

In [26]:
class PointWiseLTRModel():
    """Point-wise learning-to-rank model built on a scikit-learn style regressor.

    The regressor is fitted to predict the ``Relevance`` label of a single
    query-document feature vector; documents are then ranked per query by
    their predicted relevance.
    """

    def __init__(self, data, regressor=None, feature_names=None):
        """Store the training features/labels and the regressor to use.

        Args:
            data: DataFrame indexed by ``(QueryId, DocumentId)`` containing
                the feature columns and a ``'Relevance'`` column.
            regressor: estimator exposing ``fit`` and ``predict``. Defaults to
                a fresh ``AdaBoostRegressor(random_state=42)`` per instance.
            feature_names: names of the feature columns in ``data``. Defaults
                to the notebook-level ``feature_list``.
        """
        # Build the default estimator here rather than in the signature: a
        # default argument is evaluated only once, so every model would
        # otherwise share (and refit) one and the same estimator object.
        if regressor is None:
            regressor = AdaBoostRegressor(random_state=42)
        if feature_names is None:
            feature_names = feature_list

        self.regressor = regressor
        self.feature_names = list(feature_names)
        self.features = data[self.feature_names]
        self.targets = data['Relevance']

    @staticmethod
    def _sort_by_relevance(ranking):
        """Return ``ranking`` ordered by query, then by descending relevance."""
        return ranking.sort_values(by=['QueryId', 'Relevance'], ascending=[True, False])

    def cross_validate(self, output=None, n_splits=5):
        """Rank the training queries using grouped k-fold cross-validation.

        The data is split by ``QueryId`` so that all documents of a query end
        up in the same fold. For every fold a clone of the regressor is fitted
        on the remaining folds and used to score the held-out queries.

        Args:
            output: optional CSV path. If given, the sorted ranking (index and
                ``Relevance`` column) is written there and nothing is returned.
            n_splits: number of cross-validation folds.

        Returns:
            The sorted ranking as a DataFrame if ``output`` is None, else None.
        """
        gkf = GroupKFold(n_splits=n_splits)
        groups = self.features.index.get_level_values('QueryId')

        # Collect the per-fold rankings and concatenate once at the end instead
        # of growing a DataFrame (starting from an empty one) inside the loop.
        fold_rankings = []
        for train_index, test_index in gkf.split(self.features, self.targets, groups=groups):
            x_train, x_test = self.features.iloc[train_index], self.features.iloc[test_index]
            y_train, y_test = self.targets.iloc[train_index], self.targets.iloc[test_index]
            # Show which queries are used for training and testing in this fold.
            print(y_train.index.get_level_values('QueryId').unique(),
                  y_test.index.get_level_values('QueryId').unique())

            # Fit a fresh clone so that folds do not influence each other and
            # self.regressor stays untouched.
            regressor = clone(self.regressor)
            regressor.fit(x_train, y_train)
            fold_rankings.append(self.rank(x_test, estimator=regressor))

        ranking = self._sort_by_relevance(pd.concat(fold_rankings))
        if output is not None:
            ranking.to_csv(output, columns=['Relevance'])
        else:
            return ranking

    def train(self):
        """Fit ``self.regressor`` in place on all training features and labels."""
        self.regressor.fit(self.features, self.targets)

    def rank(self, x, output=None, estimator=None):
        """Predict a relevance score for every row of ``x``.

        Args:
            x: DataFrame indexed by ``(QueryId, DocumentId)`` that contains at
                least the model's feature columns.
            output: optional CSV path. If given, the scores are sorted by query
                and descending relevance, written there (index and
                ``Relevance`` column), and nothing is returned.
            estimator: fitted estimator to use instead of ``self.regressor``.

        Returns:
            If ``output`` is None, a copy of ``x`` in its original row order
            with the predictions in a ``'Relevance'`` column; otherwise None.
        """
        estimator = self.regressor if estimator is None else estimator
        scored = x.copy()
        # Predict only from the feature columns, in training order, so extra
        # columns in ``x`` (e.g. an existing 'Relevance') are not fed to the model.
        scored['Relevance'] = estimator.predict(x[self.feature_names])

        if output is not None:
            self._sort_by_relevance(scored).to_csv(output, columns=['Relevance'])
        else:
            return scored

In [27]:
# Scenario 1: cross-validate over the training queries (grouped by QueryId)
# and write the resulting ranking to a CSV file.
model = PointWiseLTRModel(train_data)
model.cross_validate('data/cross_validation.csv')

Int64Index([201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 212, 213, 215,
            216, 217, 218, 220, 222, 223, 225, 226, 227, 228, 229, 230, 231,
            232, 233, 235, 236, 237],
           dtype='int64', name='QueryId') Int64Index([204, 214, 219, 221, 224, 234, 238, 239], dtype='int64', name='QueryId')
Int64Index([201, 202, 204, 205, 206, 207, 209, 211, 212, 214, 215, 216, 217,
            218, 219, 220, 221, 222, 224, 225, 226, 227, 229, 231, 232, 234,
            235, 236, 237, 238, 239],
           dtype='int64', name='QueryId') Int64Index([203, 208, 210, 213, 223, 228, 230, 233], dtype='int64', name='QueryId')
Int64Index([201, 203, 204, 205, 206, 208, 209, 210, 211, 213, 214, 215, 216,
            218, 219, 220, 221, 223, 224, 225, 226, 228, 229, 230, 231, 233,
            234, 235, 236, 238, 239],
           dtype='int64', name='QueryId') Int64Index([202, 207, 212, 217, 222, 227, 232, 237], dtype='int64', name='QueryId')
Int64Index([202, 203, 204, 205, 207, 208, 209,

## Applying the model to produce a ranking

Apply the trained model on queries (b) and sort documents according to the predicted relevance score.

In [28]:
# Scenario 2: fit the model's regressor on all available training data.
model.train()

In [29]:
# Score the test queries with the trained model and write the ranking,
# sorted per query by descending predicted relevance, to a CSV file.
model.rank(test_data, output='data/ranking_kaggle.csv')

## Most important features

In [21]:
# Feature importances of the fitted regressor; the values presumably follow
# the column order of the training features (feature_list) — verify.
model.regressor.feature_importances_

array([0.16089395, 0.540063  , 0.18254956, 0.11649349])