In [43]:
import functools
import operator
import shlex
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.preprocessing import StandardScaler

from dataproc.data import to_svmrank, scale_features, to_trec
from dataproc.selectivesearch import (load_shard_selection,
                                      load_bucket_selection,
                                      select,
                                      select_buckets)

def calc_taily_features(taily_scores):
    """Derive rank-based features from per-query Taily scores.

    Parameters
    ----------
    taily_scores : pandas.DataFrame
        Must contain a ``query`` column and a numeric ``taily`` column.
        Any other columns are carried through untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of the input (the argument is not modified) with three
        extra columns:

        * ``rank``    -- position of the row within its query when ordered by
          descending ``taily`` (1 = highest score; ties are broken by order
          of appearance),
        * ``invrank`` -- ``1 / rank``,
        * ``binrank`` -- ``ceil(rank / 10)``, i.e. ranks 1-10 map to 1,
          11-20 map to 2, and so on.
    """
    tf = taily_scores.copy()
    # Rank only the `taily` column. Ranking the whole frame and then picking
    # one column does needless work and fails when the frame carries
    # non-numeric columns.
    tf['rank'] = tf.groupby('query')['taily'].rank(method='first', ascending=False)
    tf['invrank'] = 1 / tf['rank']
    tf['binrank'] = np.ceil(tf['rank'] / 10)
    return tf

# Root directory for every feature file this notebook reads or writes.
# NOTE(review): hard-coded absolute path -- adjust before running elsewhere.
feature_dir = '/data/michal/experiments/oss/gov2-dai/features/mq'

In [2]:
# Load only the columns of the exhaustive result table that the
# ground-truth and popularity computations below join on or count.
exhaustive_columns = ['query', 'rank', 'gdocid', 'score']
exhaustive_results = (
    pq.read_table(f'{feature_dir}/mq.08.clean.or.results', columns=exhaustive_columns)
    .to_pandas()
)

# Ground Truth

## FAST

In [3]:
# Ground truth for shard ranking: for every (query, shard) pair, count how
# many of the shard's results also appear in the exhaustive result list.
ground_truths = []
for shard in range(123):  # shard ids 0..122
    shard_results = pq.read_table(f'{feature_dir}/mq.08.clean.or#{shard}.results-1',
                                  columns=['query', 'gdocid']).to_pandas()
    # Left join keeps every shard result; `rank` is NaN for documents that are
    # missing from the exhaustive results, and count() skips NaN, so the count
    # is exactly the number of shard documents found in the exhaustive list.
    ground_truths.append(pd.merge(shard_results, exhaustive_results, on=['query', 'gdocid'], how='left')
                         .groupby('query')['rank']
                         .count()
                         .reset_index()
                         .assign(shard=shard))
    del shard_results
gt = pd.concat(ground_truths).rename(columns={'rank': 'shard_score'}).sort_values(['query', 'shard'])
# Rank shards within each query by descending score (1 = best; ties keep row
# order). Only `shard_score` is ranked, and the builtin `int` replaces
# `np.int`, which was removed in NumPy 1.24.
gt['rank'] = gt.groupby('query')['shard_score'].rank(method='first', ascending=False).astype(int)
gt.to_csv(f'{feature_dir}/ground_truth.csv', index=False)
del ground_truths

## B=10

In [4]:
# Bucketed ground truth: for every (query, shard, bucket) triple, count how
# many of the bucket's results also appear in the exhaustive result list.
ground_truths = []
for shard in range(123):  # shard ids 0..122
    shard_results = pq.read_table(f'{feature_dir}/mq.08.clean.or#{shard}.results-10',
                                  columns=['query', 'bucket', 'gdocid']).to_pandas()
    # Left join keeps every shard result; `rank` is NaN for documents that are
    # missing from the exhaustive results, and count() skips NaN.
    ground_truths.append(pd.merge(shard_results, exhaustive_results, on=['query', 'gdocid'], how='left')
                         .groupby(['query', 'bucket'])['rank']
                         .count()
                         .reset_index()
                         .assign(shard=shard))
    del shard_results
gt_b10 = pd.concat(ground_truths).rename(columns={'rank': 'shard_score'}).sort_values(['query', 'shard', 'bucket'])
# Rank all (shard, bucket) rows of a query together by descending score
# (1 = best; ties keep row order). Only `shard_score` is ranked, and the
# builtin `int` replaces `np.int`, which was removed in NumPy 1.24.
gt_b10['rank'] = gt_b10.groupby('query')['shard_score'].rank(method='first', ascending=False).astype(int)
gt_b10.to_csv(f'{feature_dir}/ground_truth_b10.csv', index=False)
del ground_truths

# Shard Popularity

In [5]:
# Shard popularity: the number of exhaustive-result rows (over all queries)
# whose document is assigned to the shard by the clustering strategy.
clustering_strategy = pd.read_csv(f'{feature_dir}/strategy.csv')
shard_popularity = (
    pd.merge(exhaustive_results, clustering_strategy, on='gdocid')
    .groupby('shard')['gdocid']
    .count()
    .rename('popularity')
    .reset_index()
)
shard_popularity.to_csv(f'{feature_dir}/shard_popularity.csv', index=False)

# Term-Based Statistics

In [6]:
# Read the raw Taily scores, derive the rank-based features and persist them.
taily_path = f'{feature_dir}/mq.08.clean.or.taily'
taily = pq.read_table(taily_path).to_pandas()
taily_features = taily.pipe(calc_taily_features)
taily_features.to_csv(f'{feature_dir}/taily_features.csv', index=False)

In [7]:
def _load_field_features(stem):
    """Load the text/title/anchor CSVs for `stem` and join them per (query, shard).

    Columns shared by the three files end up suffixed `-body`, `-title` and
    `-anchor`: the first merge leaves the body columns bare and tags the title
    ones, the second merge then tags the still-bare body columns and the
    anchor columns.
    """
    body, title, anchor = (pd.read_csv(f'{feature_dir}/{stem}.{field}.csv')
                           for field in ('text', 'title', 'anchor'))
    body_title = pd.merge(body, title, on=['query', 'shard'], suffixes=['', '-title'])
    return pd.merge(body_title, anchor, on=['query', 'shard'], suffixes=['-body', '-anchor'])


champion_lists = pd.read_csv(f'{feature_dir}/champion-lists.csv')

# Query-likelihood and query-term statistics, each over the three fields.
ql = _load_field_features('query-likelihood')
qts = _load_field_features('query-term-stats')

bilogfq = pd.read_csv(f'{feature_dir}/bigram-log-frequency.text.csv')

# Train

## B=1

In [10]:
# Assemble the training frame: start from the ground-truth labels and fold in
# every feature table with pandas' default merge (join on shared columns).
df = functools.reduce(pd.merge, [
    gt[['query', 'shard', 'shard_score']],
    shard_popularity[['shard', 'popularity']],
    taily_features[['query', 'shard', 'taily', 'invrank', 'binrank']],
    champion_lists,
    ql,
    qts,
    bilogfq,
])
# NOTE(review): shifts query ids by 701 -- presumably to line them up with the
# 701-based ids used in the evaluation cells; confirm.
df['query'] = df['query'] + 701
# Identifier and label columns are excluded from scaling.
data, scaler = scale_features(df, exclude=['query', 'shard_score', 'shard', 'bucket'])

In [12]:
import pickle
# Persist the fitted scaler so the test features can be scaled the same way,
# then write the training features in SVMrank format.
scaler_path = f'{feature_dir}/fast.features.scaler'
with open(scaler_path, 'bw') as scaler_file:
    pickle.dump(scaler, scaler_file)
to_svmrank(
    data.sort_values(['query', 'shard']),
    f'{feature_dir}/fast.features.svmrank',
)

In [13]:
def run_subprocess(cmd):
    """Run an external command without a shell and return its CompletedProcess.

    Parameters
    ----------
    cmd : str or sequence of str
        A command line, tokenised with shell-like rules (so an argument that
        contains whitespace can be quoted), or an already split argv list.

    Returns
    -------
    subprocess.CompletedProcess
        The exit status is NOT checked here; inspect ``returncode``.
    """
    # shlex honours quoting, unlike str.split(), which would cut a quoted
    # path with spaces into several arguments.
    args = shlex.split(cmd) if isinstance(cmd, str) else list(cmd)
    return subprocess.run(args)

# Train the SVMrank model on the exported training features.
model_path = f'{feature_dir}/fast.features.model'
run_subprocess('svm_rank_learn -c 1 -t 0 '
               f'{feature_dir}/fast.features.svmrank '
               f'{model_path}')

CompletedProcess(args=['svm_rank_learn', '-c', '1', '-t', '0', '/data/michal/experiments/oss/gov2-dai/features/mq/fast.features.svmrank', '/data/michal/experiments/oss/gov2-dai/features/mq/fast.features.model'], returncode=0)

# Evaluate

In [75]:
import tempfile
import re

def trec_eval(selected, qrels_path):
    """Score `selected` with the external ``trec_eval`` tool.

    `selected` is handed to `to_trec` together with a temporary file path
    (presumably it writes a TREC run file there -- see `dataproc.data`), then
    ``trec_eval -q <qrels_path> <run file>`` is executed and its standard
    output is scanned for the ``P_10 ... all`` and ``map ... all`` lines.

    Parameters
    ----------
    selected
        Results to evaluate; passed to `to_trec` unchanged.
    qrels_path : str or path-like
        Relevance judgements file given to ``trec_eval``.

    Returns
    -------
    tuple
        ``(p10, map)`` as floats. An entry is ``None`` when the matching
        summary line is absent from the output (the exit status of
        ``trec_eval`` is not checked).
    """
    p10_re = re.compile(r'P_10\s+all\s+(?P<score>[0-9]+\.[0-9]+).*')
    map_re = re.compile(r'map\s+all\s+(?P<score>[0-9]+\.[0-9]+).*')
    # A unique temporary file replaces the fixed path '/tmp/x': concurrent
    # runs no longer overwrite each other and the file is removed on exit.
    with tempfile.NamedTemporaryFile(suffix='.trec') as trec_file:
        to_trec(selected, str(trec_file.name))
        # Argument list instead of a split string, so paths that contain
        # spaces reach trec_eval intact.
        out = subprocess.run(
            ['trec_eval', '-q', str(qrels_path), trec_file.name],
            stdout=subprocess.PIPE)
    p10, map_measure = None, None
    for line in out.stdout.decode('UTF-8').splitlines():
        p10_match = p10_re.match(line)
        map_match = map_re.match(line)
        if p10_match:
            p10 = float(p10_match.group('score'))
        if map_match:
            map_measure = float(map_match.group('score'))
    return p10, map_measure

In [28]:
# Scale the TREC test features with the scaler fitted on the training data
# and export them in SVMrank format.
trec_fast, _ = scale_features(
    pd.read_csv(f'{feature_dir}/../trec/fast.features.csv'),
    scaler=scaler,
    exclude=['query', 'shard_score', 'shard', 'bucket'],
)
to_svmrank(
    trec_fast.sort_values(['query', 'shard']),
    f'{feature_dir}/fast.features.test.svmrank',
)

In [29]:
# Score the test features with the trained model; predictions are written to
# fast.features.test.pred.
run_subprocess(' '.join([
    'svm_rank_classify',
    f'{feature_dir}/fast.features.test.svmrank',
    f'{feature_dir}/fast.features.model',
    f'{feature_dir}/fast.features.test.pred',
]))

CompletedProcess(args=['svm_rank_classify', '/data/michal/experiments/oss/gov2-dai/features/mq/fast.features.test.svmrank', '/data/michal/experiments/oss/gov2-dai/features/mq/fast.features.model', '/data/michal/experiments/oss/gov2-dai/features/mq/fast.features.test.pred'], returncode=0)

In [79]:
# Per-shard cost tables, all stored next to the TREC feature files.
trec_dir = f'{feature_dir}/../trec'
size_costs, posting_costs, posting_costs_frac = (
    pd.read_csv(f'{trec_dir}/{stem}.csv')
    for stem in ('shard-size-costs', 'shard-posting-costs', 'shard-posting-costs-frac')
)

In [87]:
# Load the predicted shard selection for queries 701..850.
# NOTE(review): the meaning of the second argument (199) is defined by
# `load_shard_selection` in dataproc.selectivesearch -- confirm there.
trec_shard_selection = load_shard_selection(range(701, 851), 199, f'{feature_dir}/fast.features.test.pred')
# Attach each cost table in turn; its generic `cost` column is renamed right
# after the merge so the next table's `cost` column does not clash with it.
for cost_table, cost_name in ((size_costs, 'size_cost'),
                              (posting_costs, 'posting_cost'),
                              (posting_costs_frac, 'posting_cost_frac')):
    trec_shard_selection = pd.merge(trec_shard_selection, cost_table).rename(columns={'cost': cost_name})
# Every selected shard counts as one unit in both `cost` and `unit_cost`.
trec_shard_selection = trec_shard_selection.assign(cost=1, unit_cost=1)
shard_results = pq.read_table(f'{feature_dir}/../trec/shard_results.parquet').to_pandas()

In [93]:
# Sweep the shard budget (1..20, plus 123) and record retrieval quality
# together with the mean per-query cost of the selected shards.
# NOTE(review): `size_cost` is computed but never reported -- confirm whether
# it was left out of the table on purpose.
cost_columns = ('unit_cost', 'size_cost', 'posting_cost', 'posting_cost_frac')
fast = []
for budget in [*range(1, 21), 123]:
    filtered_selection, selected = select(trec_shard_selection, shard_results, budget, nonempty=False)
    per_query = filtered_selection.groupby('query')
    # Sum each cost over the shards of a query, then average over queries.
    mean_cost = {column: per_query[column].sum().mean() for column in cost_columns}
    p10, mAP = trec_eval(selected, '/data/queries/gov2/gov2-qrels.txt')
    fast.append((budget, p10, mAP,
                 mean_cost['unit_cost'],
                 mean_cost['posting_cost'],
                 mean_cost['posting_cost_frac']))
fastpd = pd.DataFrame(
    fast,
    columns=['budget', 'p10', 'map', 'unit_cost', 'posting_cost', 'posting_cost_frac'],
)

In [94]:
# Show the per-budget effectiveness / cost table (one row per budget).
fastpd

Unnamed: 0,budget,p10,map,unit_cost,posting_cost,posting_cost_frac
0,1,0.2263,0.0689,1.0,52.78124,2.280075
1,2,0.2527,0.0838,2.0,97.066953,3.886414
2,3,0.296,0.0966,3.0,142.323433,5.200362
3,4,0.3322,0.1198,4.0,182.580953,6.434855
4,5,0.3611,0.1315,5.0,220.81416,7.714033
5,6,0.3765,0.1346,6.0,254.518793,9.027956
6,7,0.3987,0.1401,7.0,295.885187,10.147817
7,8,0.4195,0.148,8.0,335.133747,11.336161
8,9,0.4255,0.1524,9.0,370.931587,12.448383
9,10,0.4396,0.1585,10.0,413.072727,13.529804
