# Learning to Rank on MQ2008 Dataset

In [4]:
# Importing libraries

import pandas as pd
import numpy as np
import itertools
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import ndcg_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## Building train, validation, test data

In [5]:
# Extracting (q, d, c) from dataset

def load_data(file_path):
    '''returns a dictionary with keys equal to queries and values equal to features of each doc
    dict of q: (d1, c1), (d2, c2), ..., (dn, cn) where c is relevance and d is doc feature vector'''
    query_feedback = {}
    with open(file_path) as fp:
        line = fp.readline()
        while line:
            parts = line.strip().split(" ")
            c = int(parts[0]) # relevance
            q = parts[1].split(":")[1] # query id
            s = parts[2:48] # feature vector
            s = np.array([float(f.split(":")[1]) for f in s])
            if q in query_feedback:
                query_feedback[q].append((s, c))
            else:
                query_feedback[q] = [(s, c)]
            line = fp.readline()
    return query_feedback       

In [21]:
# Building pairwise features and targets (for train data)

def build_pairwise_data(query_dict):
    y = []
    X = []
    for query in query_dict:
        docs = query_dict[query]
        for (d1, c1), (d2, c2) in itertools.combinations(docs, 2):
            # do not consider pairs with equal relevance
            if c1 == c2:
                continue
            X.append(d1 - d2)
            y.append(np.sign(c1 - c2))
    y = np.array(y)
    X = np.array(X)
    return X, y    

In [13]:
# Building features and targets (for test and validation data)

def build_data(query_dict):
    y = []
    X = []
    for query in query_dict:
        docs = query_dict[query]
        for (d, c) in docs:
            X.append(d)
            y.append(c)
    y = np.array(y)
    X = np.array(X)
    return X, y     

In [9]:
data_train = load_data('train.txt')
data_test = load_data('test.txt')
data_val = load_data('vali.txt')

In [11]:
print("Num training queries: ", len(data_train))
print("Num validation queries: ", len(data_val))
print("Num test queries: ", len(data_test))

Num training queries:  471
Num validation queries:  157
Num test queries:  156


In [22]:
X_train, y_train = build_pairwise_data(data_train)
X_test, y_test = build_data(data_test)
X_val, y_val = build_data(data_val)

In [23]:
X_train.shape, X_test.shape, X_val.shape

((52325, 46), (2874, 46), (2707, 46))

In [30]:
y_train.shape, y_test.shape, y_val.shape

((52325,), (2874,), (2707,))

In [None]:
np.unique(y_train)

array([-1,  0,  1])

## Training SVM Model and Hyper-parameter Tuning

### Average NDCG values

In [88]:
# compute ndcg for each query and mean of all of them

def evaluate_average_ndcg(query_dict, coef, k=None):
    ndcg_vals = []
    for query in query_dict:
        y = []
        X = []
        docs = query_dict[query]
        for (d, c) in docs:
            X.append(d)
            y.append(c)
        y = np.array(y)
        X = np.array(X)
        scores = np.dot(X, coef.T) 
        ndcg = ndcg_score(np.asarray([y]), np.asarray([scores.flatten()]), k=k)
        # remove queries which have the same relevance for all docs
        if ndcg != 0:
          ndcg_vals.append(ndcg)
    return np.mean(ndcg_vals)       

### Training

In [89]:
# SVM model with default parameters

svm_model = svm.LinearSVC(random_state=0)
svm_model.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)

In [90]:
# average ndcg on validation data with default params
evaluate_average_ndcg(data_val, svm_model.coef_)

0.7662074077880603

### Hyperparameter Tuning

In [95]:
# Hyper-parameter tuning with validation data
params = [1, 0.1, 0.01, 0.001, 0.0001]
best_param = 1
best_ndcg = 0

for c in params:
    svm_model = svm.LinearSVC(random_state=0, C = c)
    svm_model.fit(X_train, y_train)
    coef = svm_model.coef_
    ndcg = evaluate_average_ndcg(data_val, coef)
    print(ndcg, c)
    if ndcg > best_ndcg:
        best_param = c
        best_ndcg = ndcg

print("Best param = {'C' : " + str(best_param) + "}")        

0.7662074077880603 1
0.7662315992476308 0.1
0.7659158363756174 0.01
0.7641601941246547 0.001
0.7604602982935231 0.0001
Best param = {'C' : 0.1}


## Model Performance with NDCG@5

In [96]:
svm_model = svm.LinearSVC(random_state=0, C = best_param)
svm_model.fit(X_train, y_train)

LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
          verbose=0)

In [97]:
# average ndcg per query for test data
evaluate_average_ndcg(data_test, svm_model.coef_, k=5)

0.7378472102305029