In [None]:
import requests
import sys
# Add the parent directory to the import path (presumably where the `ltr`
# package imported below lives — verify against the repo layout)
sys.path.append('..')
from ltr.client.solr_client import SolrClient

# Solr client handed to the FeatureLogger further down in this notebook
client = SolrClient(host='http://aips-solr:8983/solr')



## Recreate last section

Rebuild everything from the last section that we'll need in order to work with the full training set.

Start by making sure the feature set is installed.

In [None]:

import requests

# Issue a DELETE for the `movies` feature store before (re-)uploading it below
requests.delete('http://aips-solr:8983/solr/tmdb/schema/feature-store/movies')

# Every feature is a SolrFeature in the `movies` store and differs only by its
# name and its `q` parameter, so build the payload from (name, query) pairs.
# `${keywords}` is left as a literal placeholder inside the query strings.
feature_set = [
    {
        "name": feature_name,
        "store": "movies",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {"q": feature_query},
    }
    for feature_name, feature_query in (
        ("title_bm25", "title:(${keywords})"),
        ("overview_bm25", "overview:(${keywords})"),
        ("release_year", "{!func}release_year"),
    )
]

# Upload the feature definitions and show the raw response body
resp = requests.put('http://aips-solr:8983/solr/tmdb/schema/feature-store',
                    json=feature_set)
resp.text

## Log features

Log the full training set of ~100 movie queries, each with ~40 graded documents. Save the judgment list, with its logged features, as `logged_judgments`.

In [None]:
from ltr.log import FeatureLogger
from ltr.judgments import judgments_open
from itertools import groupby
from ltr import download

# Fetch the judgment list into data/ (presumably saved under its remote file
# name, since data/ai_pow_search_judgments.txt is opened below — verify)
judgments='http://es-learn-to-rank.labs.o19s.com/ai_pow_search_judgments.txt'
download([judgments], dest='data/')

# Feature logger bound to the `tmdb` index and the `movies` feature set
ftr_logger=FeatureLogger(client, index='tmdb', feature_set='movies')

# Log features one query at a time. groupby() only merges consecutive items,
# so this assumes the judgment file keeps each qid's rows together — TODO confirm
with judgments_open('data/ai_pow_search_judgments.txt') as judgment_list:
    for qid, query_judgments in groupby(judgment_list, key=lambda j: j.qid):
        ftr_logger.log_for_qid(judgments=query_judgments, 
                               qid=qid,
                               keywords=judgment_list.keywords(qid))

# Keep a notebook-level handle on everything the logger accumulated
logged_judgments = ftr_logger.logged

# We should now have lots of judgments with title_bm25, overview_bm25, and
# release_year logged out
logged_judgments

# Figure 10.7 - Plot logged features

This data set also has the queries `star wars` and `social network`; let's see what those look like.

In [None]:
from ltr.plots import plot_judgments

# Plot the raw logged features for qids 11 and 40 (presumably the `star wars`
# and `social network` queries named in the text above — verify the mapping)
plot_judgments(qids=[11,40], focus=[11,40], 
               xlabel="Title BM25",
               ylabel="Overview BM25",
               title_prepend="Logged features for queries:",
               judg_list=ftr_logger.logged)

## Feature Normalization Function (omitted from book)

As we describe in the book, SVMs are sensitive to the range of the underlying data. They work best with normalized features, which we do here.

This function computes the mean and standard deviation of all 3 of our features, then scales each feature value so that values at the mean are mapped to 0, +1 corresponds to 1 standard deviation above the mean, -1 to 1 standard deviation below the mean, and so on.

We capture the mean and standard deviation of each feature for later work

In [None]:
from ltr.judgments import Judgment

def normalize_features(logged_judgments):
    """Standardize every logged feature to zero mean and unit standard deviation.

    Args:
        logged_judgments: list of judgments exposing `qid`, `keywords`,
            `doc_id`, `grade` and a `features` list. All judgments are
            expected to carry the same number of features; that number is
            taken from the first judgment rather than being fixed at 3.

    Returns:
        A `(means, std_devs, normed_judgments)` tuple. `means` and `std_devs`
        are lists with one entry per feature (population standard deviation).
        `normed_judgments` is a list of new `Judgment` objects whose
        `features` are the scaled values; each one also keeps the original
        feature list in an `old_features` attribute. Empty input yields
        `([], [], [])`.
    """
    from math import sqrt

    num_judgments = len(logged_judgments)
    if num_judgments == 0:
        # Nothing to average over; avoid dividing by a zero count below
        return [], [], []

    num_features = len(logged_judgments[0].features)

    # Per-feature mean
    means = [0] * num_features
    for judgment in logged_judgments:
        for idx, f in enumerate(judgment.features):
            means[idx] += f
    for i in range(num_features):
        means[i] /= num_judgments

    # Per-feature population standard deviation
    std_devs = [0.0] * num_features
    for judgment in logged_judgments:
        for idx, f in enumerate(judgment.features):
            std_devs[idx] += (f - means[idx]) ** 2
    for i in range(num_features):
        std_devs[i] = sqrt(std_devs[i] / num_judgments)

    # A feature that never varies has a standard deviation of 0, and dividing
    # by it would raise ZeroDivisionError. Scale such features by 1.0 instead,
    # so they normalize to 0.0; the returned std_devs still report the true 0.0.
    divisors = [std_dev if std_dev > 0 else 1.0 for std_dev in std_devs]

    # Normalize!
    normed_judgments = []
    for judgment in logged_judgments:
        normed_features = [
            (f - means[idx]) / divisors[idx]
            for idx, f in enumerate(judgment.features)
        ]
        normed_judgment = Judgment(qid=judgment.qid,
                                   keywords=judgment.keywords,
                                   doc_id=judgment.doc_id,
                                   grade=judgment.grade,
                                   features=normed_features)
        # Keep the raw values around for later inspection
        normed_judgment.old_features = judgment.features
        normed_judgments.append(normed_judgment)

    return means, std_devs, normed_judgments

# Normalize everything the feature logger collected, keeping the per-feature
# means and standard deviations for later work
means, std_devs, normed_judgments = normalize_features(ftr_logger.logged)

# Display the normalized judgment list
normed_judgments

## Listing 10.11 - Normalize

Perform the normalization, and inspect the difference between the raw/logged and normalized feature values for "The Social Network".

In [None]:
# `logged_judgments` was assigned from `ftr_logger.logged` earlier, so this
# normalizes the same logged judgment list
means, std_devs, normed_judgments = normalize_features(logged_judgments)

# Show one judgment before and after normalization. Index 360 is presumably a
# "The Social Network" judgment — TODO confirm against the judgment file
logged_judgments[360], normed_judgments[360]

# Figures 10.8-10.11

Examine the normalized judgments now, in preparation for transforming them with SVMRank's pair-wise transform.

In [None]:
from ltr.plots import plot_judgments

# Same two qids as the logged-feature plot, now drawn from the normalized
# judgments (axes are in standard deviations from the mean)
plot_judgments(qids=[11,40], 
               xlabel="Title BM25 Std Devs",
               ylabel="Overview BM25 Std Devs",
               title_prepend="Normalized features for queries:",
               judg_list=normed_judgments)

# Listing 10.11 (Python Equivalent)

We put pseudocode in the book, but here we show the equivalent Python code. This code also transforms the data to a numpy array of predictors and features for later model training with an SVM.

In [None]:
import numpy as np
from ltr.judgments import judgments_from_file, judgments_to_nparray

def pairwise_transform(normed_judgments):
    """Turn graded judgments into pairwise feature differences with +1/-1 labels.

    Within each query, every ordered pair of judgments with different grades
    contributes one training example: the feature vector of the first minus
    the feature vector of the second, labelled +1 when the first has the
    higher grade and -1 when the second does. Pairs with equal grades (which
    includes a judgment paired with itself) are skipped.

    Args:
        normed_judgments: iterable of judgments exposing `qid`, `grade` and
            `features`. Judgments sharing a qid do not have to be adjacent.

    Returns:
        A `(features, predictors)` tuple of numpy arrays: one row of feature
        differences per pair, and the matching +1/-1 labels.
    """
    # Bucket by qid up front. itertools.groupby only merges *consecutive*
    # items, so input that is not contiguous by qid would be split into
    # fragments and lose the pairs that span them. A dict keeps first-seen
    # qid order, so contiguous input produces the same output as before.
    judgments_by_qid = {}
    for judgment in normed_judgments:
        judgments_by_qid.setdefault(judgment.qid, []).append(judgment)

    pairwise_features = []
    pairwise_predictors = []

    for query_judgments in judgments_by_qid.values():
        # Convert each feature list once, not once per pair
        feature_vectors = [np.array(j.features) for j in query_judgments]

        # Examine every judgment combo for this query; if the grades differ,
        # store the pairwise difference:
        # +1 if judgment1 more relevant
        # -1 if judgment2 more relevant
        for judgment1, j1_features in zip(query_judgments, feature_vectors):
            for judgment2, j2_features in zip(query_judgments, feature_vectors):
                if judgment1.grade > judgment2.grade:
                    pairwise_predictors.append(+1)
                    pairwise_features.append(j1_features - j2_features)
                elif judgment1.grade < judgment2.grade:
                    pairwise_predictors.append(-1)
                    pairwise_features.append(j1_features - j2_features)

    return np.array(pairwise_features), np.array(pairwise_predictors)

# Pairwise-transform the full normalized training set; these two arrays are
# plotted, saved and used for model training in the cells below
pointwise_features, pointwise_predictors = pairwise_transform(normed_judgments)

## Figure 10.9

Finally, we have the full plot training set showing pair-wise differences

In [None]:
from ltr.plots import plot_pairwise_data

# Keep only the normalized judgments that belong to our two favorite
# queries (qids 11 and 40)
just_star_wars_social_network = [
    judgment
    for judgment in normed_judgments
    if judgment.qid == 11 or judgment.qid == 40
]

# Run the pairwise transform on that subset alone, then plot the result
features, predictors = pairwise_transform(just_star_wars_social_network)
plot_pairwise_data(
    features,
    predictors,
    xlabel="Title BM25 (Delta Std Devs)",
    ylabel="Overview BM25 (Delta Std Devs)",
    title="Pairwise Differences, just Star Wars, Social Network",
)

# Figure 10.14 

Full dataset pairwise differences

In [None]:
from ltr.plots import plot_pairwise_data

# Plot the pairwise differences computed from the full normalized training set
plot_pairwise_data(pointwise_features, pointwise_predictors,
                   xlabel="Title BM25 (Delta Std Devs)",
                   ylabel="Overview BM25 (Delta Std Devs)",
                   title="All Relevance Pairwise Differences")

In [None]:
# Build the array to persist *before* opening the output file, so a failure
# here cannot leave a truncated data/feature_data.npy behind.
# np.append flattens its inputs, so the result is reshaped back into one
# column per feature; the last two rows are then the means and the std devs.
cols = len(means)  # one column per feature rather than a hard-coded 3
feature_data = np.append(pointwise_features, [means, std_devs])
rows = feature_data.shape[0] // cols
feature_data = feature_data.reshape((rows, cols))

with open('data/feature_data.npy', 'wb') as f:
    np.save(f, feature_data)
print(feature_data.shape)

# The +1/-1 labels that line up with the pairwise rows of feature_data
with open('data/pointwise_predictors.npy', 'wb') as f:
    np.save(f, pointwise_predictors)

## Listing 10.16

Train the model with the fully transformed dataset

In [None]:
from sklearn import svm
# Fit a linear SVM on the pairwise feature differences and their +1/-1 labels.
# NOTE(review): no random_state is passed, so the fitted coefficients may vary
# slightly between runs — confirm whether reproducibility matters here
model = svm.LinearSVC(max_iter=10000, verbose=1)
model.fit(pointwise_features, pointwise_predictors)
# Display the learned coefficients (one per feature column)
model.coef_