In [37]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt

from dataproc.data import cartesian

def load_results(basename, buckets=1, num_shards=199):
    """Load per-shard query results and stack them into one DataFrame.

    Shard ``i`` is read from the parquet file
    ``{basename}#{i}.results-{buckets}``.

    Parameters
    ----------
    basename : str
        Path prefix shared by all shard result files.
    buckets : int, default 1
        Bucket count encoded in the file suffix.
    num_shards : int, default 199
        Number of shards to read (shards ``0 .. num_shards - 1``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all shard tables. The index of each
        shard table is kept as-is, so index values may repeat.
    """
    return pd.concat([
        pq.read_table(f'{basename}#{shard}.results-{buckets}').to_pandas()
        for shard in range(num_shards)
    ])

def load_shard_titles(shard_dir, num_shards=199):
    """Load the document-title list of every shard into one DataFrame.

    Shard ``i`` is read from ``{shard_dir}/{i:0>3d}`` (e.g. ``.../007``) as a
    headerless CSV with a single ``title`` column.

    Parameters
    ----------
    shard_dir : str
        Directory holding one title file per shard.
    num_shards : int, default 199
        Number of shards to read (shards ``0 .. num_shards - 1``).

    Returns
    -------
    pandas.DataFrame
        Columns ``ldocid`` (row position of the title inside its shard file),
        ``title``, ``shard`` and ``shard_size`` (number of rows in that shard
        file). The index restarts at 0 for every shard.
    """
    def read_shard(shard):
        # One title file -> frame tagged with its shard id, size and local ids.
        path = '{}/{:0>3d}'.format(shard_dir, shard)
        return (pd.read_csv(path, names=['title'])
                .assign(shard=shard)
                .assign(shard_size=len)      # callable receives the frame
                .rename_axis('ldocid')
                .reset_index())

    return pd.concat([read_shard(shard) for shard in range(num_shards)])

def load_documents(titles_file, title_dir):
    """Build the global document table and attach shard placement to it.

    Parameters
    ----------
    titles_file : str
        Headerless CSV with one document title per row; the row position
        becomes the global ``docid``.
    title_dir : str
        Directory of per-shard title files, passed to ``load_shard_titles``.

    Returns
    -------
    pandas.DataFrame
        Columns ``docid``, ``title``, ``ldocid``, ``shard``, ``shard_size``
        and ``depth``. ``depth`` is ``ldocid / shard_size``, i.e. the
        relative position of the document inside its shard. Titles that
        appear in no shard file keep NaN in the shard columns (left join).
    """
    # The index must be called 'docid': later cells select
    # documents[['docid', ...]] and group on documents['docid'].
    titles = (pd.read_csv(titles_file, names=['title'])
              .rename_axis('docid')
              .reset_index())
    shard_titles = load_shard_titles(title_dir)
    shard_titles['depth'] = shard_titles['ldocid'] / shard_titles['shard_size']
    return pd.merge(titles, shard_titles, on='title', how='left')

def load_posting_costs(basename, buckets=1, num_shards=199):
    """Load per-shard posting-cost tables and stack them into one DataFrame.

    Shard ``i`` is read from the parquet file
    ``{basename}#{i}.postingcost-{buckets}``.

    Parameters
    ----------
    basename : str
        Path prefix shared by all shard posting-cost files.
    buckets : int, default 1
        Bucket count encoded in the file suffix.
    num_shards : int, default 199
        Number of shards to read (shards ``0 .. num_shards - 1``).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all shard tables. The index of each
        shard table is kept as-is, so index values may repeat.
    """
    return pd.concat([
        pq.read_table(f'{basename}#{shard}.postingcost-{buckets}').to_pandas()
        for shard in range(num_shards)
    ])

def calc_taily_features(taily_scores):
    """Derive rank-based features from per-(query, shard) Taily scores.

    Parameters
    ----------
    taily_scores : pandas.DataFrame
        Must contain the columns ``query`` and ``taily``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with three extra columns:

        * ``rank``    - position of the row within its query when sorted by
          ``taily`` descending (1 = highest score; ties are broken by row
          order because of ``method='first'``),
        * ``invrank`` - ``1 / rank``,
        * ``binrank`` - ``ceil(rank / 10)``, i.e. ranks grouped in tens.
    """
    tf = taily_scores.copy()
    # Rank only the 'taily' column. Ranking the whole frame and selecting
    # 'taily' afterwards yields the same values but needlessly ranks (and can
    # choke on) every other column.
    tf['rank'] = tf.groupby('query')['taily'].rank(method='first', ascending=False)
    tf['invrank'] = 1 / tf['rank']
    tf['binrank'] = np.ceil(tf['rank'] / 10)
    return tf

# Directory that every feature / cost / result file in this notebook is read
# from or written to.
# NOTE(review): absolute, user-specific path - adjust before running elsewhere.
feature_dir = '/data/michal/experiments/oss/gov2-dai/features/trec'

# Preprocess Data

In [2]:
# Global document table: one row per title with its docid and shard placement.
documents = load_documents(titles_file='/data/index/dai-gov2/full/gov2.titles',
                           title_dir='/data/index/dai-gov2/shards/titles')
# NOTE(review): the write_table call below is cut off in this copy of the
# notebook - the destination path and the closing parenthesis are missing.
# Restore them before running this cell.
pq.write_table(pa.Table.from_pandas(documents),

In [22]:
# Preview only the first rows instead of rendering the whole document table.
documents.head(10)

Unnamed: 0,docid,title,ldocid,shard,shard_size,depth
0,0,GX252-30-5998799,0,50,95909,0.000000
1,1,GX167-74-1554430,0,29,129558,0.000000
2,2,GX210-51-15250690,0,170,166861,0.000000
3,3,GX224-92-11138801,1,50,95909,0.000010
4,4,GX008-66-15642425,1,29,129558,0.000008
5,5,GX152-09-10037568,2,50,95909,0.000021
6,6,GX222-93-4969172,0,196,149324,0.000000
7,7,GX230-56-14789344,0,151,77849,0.000000
8,8,GX168-10-10917842,2,29,129558,0.000015
9,9,GX254-31-15542689,0,95,167118,0.000000


In [4]:
# Relevance judgements: space-separated rows of (trecid, _, title, relevance).
qrels = (
    pd.read_csv('/data/queries/gov2/gov2-qrels.txt',
                sep=' ',
                names=['trecid', '_', 'title', 'relevance'])
    .assign(
        # Zero-based query number: TREC topic ids start at 701.
        query=lambda df: df['trecid'] - 701,
        # Any positive relevance grade counts as relevant.
        relevant=lambda df: df['relevance'] > 0,
    )
    .loc[:, ['query', 'trecid', 'title', 'relevance', 'relevant']]
)
qrels.to_csv('/data/queries/gov2/gov2-qrels.csv')

# Ground Truth

## Shard-wise

In [5]:
# Number of relevant documents per (query, shard) that has at least one judged document.
relevant_per_shard = (
    qrels.merge(documents, on='title')
    .groupby(['query', 'shard'])
    .agg({'relevant': 'sum'})
    .reset_index()
)

# Right-join onto the full query x shard grid so that pairs without any judged
# document are present with a score of 0.
query_shard_grid = cartesian([range(150), range(199)], names=['query', 'shard'])
gt_fast = (
    relevant_per_shard
    .merge(query_shard_grid, on=['query', 'shard'], how='right')
    .fillna(0)
    .rename(columns={'relevant': 'shard_score'})
)

# Rank shards within each query, best (highest shard_score) first.
gt_fast['rank'] = gt_fast.groupby('query')['shard_score'].rank(method='first', ascending=False)
gt_fast.to_csv(f'{feature_dir}/ground_truth.csv', index=False)

## B = 10

In [6]:
# Number of relevant documents per (query, shard, depth bucket).
gt_b10 = (
    pd.merge(qrels, documents, on='title')
    # depth = ldocid / shard_size, so floor(depth * 10) splits each shard into
    # ten equal-width buckets. The builtin `int` replaces `np.int`, which was
    # only an alias for it and has been removed from recent NumPy releases.
    .assign(bucket=lambda df: np.floor(df['depth'].multiply(10)).astype(int))
    .groupby(['query', 'shard', 'bucket'])
    .agg({'relevant': 'sum'})
    .reset_index()
)

# Right-join onto the full query x shard x bucket grid so that combinations
# without any judged document are present with a score of 0.
gt_b10 = pd.merge(gt_b10, cartesian([range(150), range(199), range(10)],
                                    names=['query', 'shard', 'bucket']),
                  on=['query', 'shard', 'bucket'], how='right').fillna(0)
gt_b10 = gt_b10.rename(columns={'relevant': 'shard_score'})

# Rank all (shard, bucket) pairs within each query, highest shard_score first.
gt_b10['rank'] = gt_b10.groupby('query')['shard_score'].rank(method='first', ascending=False)
gt_b10.to_csv(f'{feature_dir}/ground_truth_b10.csv', index=False)

In [7]:
# Per-bucket mean of every column. In the recorded output only shard_score and
# rank vary between buckets; the query and shard means are the same for all.
gt_b10.groupby('bucket').mean()

Unnamed: 0_level_0,query,shard,shard_score,rank
bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,74.5,99.0,0.240838,906.167236
1,74.5,99.0,0.159129,950.419129
2,74.5,99.0,0.145025,960.219832
3,74.5,99.0,0.115008,975.436482
4,74.5,99.0,0.086901,993.454405
5,74.5,99.0,0.060268,1008.827739
6,74.5,99.0,0.043685,1023.295377
7,74.5,99.0,0.031323,1032.557588
8,74.5,99.0,0.015209,1045.911122
9,74.5,99.0,0.004355,1058.711089


# Shard Popularity

In [8]:
# Popularity of a shard = number of relevant judgements whose document lives in it.
popularity = (pd.merge(qrels[qrels['relevant']], documents, on='title')
              .groupby('shard')['relevant']
              .agg([('popularity', 'count')])
              .reset_index())

# Left-join onto the full shard list so shards without any relevant document
# get popularity 0. `astype` returns a new frame, so its result must be
# assigned - otherwise the CSV is written with float columns.
shard_popularity = (pd.merge(pd.DataFrame({'shard': range(199)}),
                             popularity, on='shard', how='left')
                    .fillna(0)
                    .astype('int'))
shard_popularity.to_csv(f'{feature_dir}/shard_popularity.csv', index=False)

# Term Based Features

## Taily

In [9]:
# Raw Taily scores (parquet) -> rank / invrank / binrank features -> CSV.
taily = pq.read_table(f'{feature_dir}/gov2-trec_eval-queries-OR.txt.taily').to_pandas()
taily_features = calc_taily_features(taily)
taily_features.to_csv(f'{feature_dir}/taily_features.csv', index=False)

## Champion List Features

In [10]:
# Precomputed champion-list features; merged into the FAST and B=10 feature
# tables further down.
champion_lists = pd.read_csv(f'{feature_dir}/champion-lists.csv')

## Query Likelihood

In [11]:
# Query-likelihood features, one file per document field.
ql_body = pd.read_csv(f'{feature_dir}/query-likelihood.text.csv')
ql_title = pd.read_csv(f'{feature_dir}/query-likelihood.title.csv')
ql_anchor = pd.read_csv(f'{feature_dir}/query-likelihood.anchor.csv')

# Two-step join on (query, shard). The first join leaves the body columns
# unsuffixed and tags title columns with '-title'; the second join then tags
# the still-unsuffixed body columns with '-body' and the anchor columns with
# '-anchor'.
ql = (
    ql_body
    .merge(ql_title, on=['query', 'shard'], suffixes=['', '-title'])
    .merge(ql_anchor, on=['query', 'shard'], suffixes=['-body', '-anchor'])
)

## Query Term Statistics

In [12]:
# Query-term statistics, one file per document field.
qts_body = pd.read_csv(f'{feature_dir}/query-term-stats.text.csv')
qts_title = pd.read_csv(f'{feature_dir}/query-term-stats.title.csv')
qts_anchor = pd.read_csv(f'{feature_dir}/query-term-stats.anchor.csv')

# Two-step join on (query, shard). The first join leaves the body columns
# unsuffixed and tags title columns with '-title'; the second join then tags
# the still-unsuffixed body columns with '-body' and the anchor columns with
# '-anchor'.
qts = (
    qts_body
    .merge(qts_title, on=['query', 'shard'], suffixes=['', '-title'])
    .merge(qts_anchor, on=['query', 'shard'], suffixes=['-body', '-anchor'])
)

## Bigram Log Frequency

In [13]:
# Precomputed bigram log-frequency features (text field); merged into the FAST
# and B=10 feature tables further down.
bilogfq = pd.read_csv(f'{feature_dir}/bigram-log-frequency.text.csv')

# Cost Models

## Shard Size Cost

In [14]:
# Cost of a shard = its share of the total collection size, in percent.
shard_size_costs = pd.read_csv('/data/index/dai-gov2/shard-sizes.csv')
shard_sizes = shard_size_costs['shard_size']
shard_size_costs['cost'] = shard_sizes * 100 / shard_sizes.sum()

# Only the cost is exported; the raw size column is dropped from the file.
cost_only = shard_size_costs.drop(columns=['shard_size'])
cost_only.to_csv(f'{feature_dir}/shard-size-costs.csv', index=False)

## Posting Cost

In [15]:
# Absolute posting cost per (query, shard): raw posting count divided by 1000.
shard_posting_costs = load_posting_costs(f'{feature_dir}/gov2-trec_eval-queries-OR.txt')
shard_posting_costs['cost'] = shard_posting_costs['postingcost'] / 1000
# Shift zero-based query numbers back to TREC topic ids (qrels used trecid - 701).
shard_posting_costs['query'] += 701
shard_posting_costs[['query', 'shard', 'cost']].to_csv(f'{feature_dir}/shard-posting-costs.csv', index=False)

# Relative ("%") posting cost: each shard's share of its query's total postings.
# The frame is loaded again because the block above modified it in place.
# NOTE(review): this re-reads every shard file; a helper `f` with the same name
# is also defined again in a later cell.
shard_posting_costs = load_posting_costs(f'{feature_dir}/gov2-trec_eval-queries-OR.txt')
def f(x):
    # x holds the rows of a single query (called via groupby('query').apply).
    total = x['postingcost'].sum()
    df = pd.DataFrame({
        'shard': x['shard'],
        'cost': x['postingcost'] * 100 / total
    })
    # Renumber rows within the group so the inner index level is 0..n-1.
    return df.reset_index(drop=True)
# reset_index(level='query') turns the group key back into a regular column.
shard_posting_costs = shard_posting_costs.groupby('query').apply(f).reset_index(level='query')
shard_posting_costs['query'] += 701
shard_posting_costs[['query', 'shard', 'cost']].to_csv(f'{feature_dir}/shard-posting-costs-frac.csv', index=False)

# Drop the large frame from the kernel namespace.
del shard_posting_costs

In [53]:
# Absolute posting cost per (query, shard, bucket) for the 10-bucket files:
# raw posting count divided by 1000.
shard_posting_costs = load_posting_costs(f'{feature_dir}/gov2-trec_eval-queries-OR.txt', buckets=10)
shard_posting_costs['cost'] = shard_posting_costs['postingcost'] / 1000
# Shift zero-based query numbers back to TREC topic ids (qrels used trecid - 701).
shard_posting_costs['query'] += 701
shard_posting_costs[['query', 'shard', 'bucket', 'cost']].to_csv(f'{feature_dir}/shard-posting-costs-b10.csv', index=False)

# Relative posting cost: each (shard, bucket)'s share of its query's total
# postings, in percent. The frame is loaded again because the block above
# modified it in place.
# NOTE(review): this `f` replaces the two-column `f` defined in the earlier
# posting-cost cell.
shard_posting_costs = load_posting_costs(f'{feature_dir}/gov2-trec_eval-queries-OR.txt', buckets=10)
def f(x):
    # x holds the rows of a single query (called via groupby('query').apply).
    total = x['postingcost'].sum()
    df = pd.DataFrame({
        'shard': x['shard'],
        'bucket': x['bucket'],
        'cost': x['postingcost'] * 100 / total
    })
    # Renumber rows within the group so the inner index level is 0..n-1.
    return df.reset_index(drop=True)
# reset_index(level='query') turns the group key back into a regular column.
shard_posting_costs = shard_posting_costs.groupby('query').apply(f).reset_index(level='query')
shard_posting_costs['query'] += 701
shard_posting_costs[['query', 'shard', 'bucket', 'cost']].to_csv(
    f'{feature_dir}/shard-posting-costs-frac-b10.csv', index=False)

# Drop the large frame from the kernel namespace.
del shard_posting_costs

# Export To RankLib Format

In [54]:
# Per-shard retrieval results, keyed by shard-local document id.
results = load_results(f'{feature_dir}/gov2-trec_eval-queries-OR.txt')

# Resolve (shard, ldocid) to the document title, keep the export columns,
# order by query / shard / rank, and shift queries back to TREC topic ids.
doc_lookup = documents[['docid', 'ldocid', 'shard', 'title']]
trecres = (
    results.merge(doc_lookup, on=['shard', 'ldocid'])
    .loc[:, ['query', 'shard', 'rank', 'score', 'title']]
    .sort_values(['query', 'shard', 'rank'])
    .assign(query=lambda df: df['query'] + 701)
)
pq.write_table(pa.Table.from_pandas(trecres), f'{feature_dir}/shard_results.parquet')

In [None]:
# Per-shard retrieval results for the 10-bucket runs, keyed by shard-local document id.
results = load_results(f'{feature_dir}/gov2-trec_eval-queries-OR.txt', buckets=10)

# Resolve (shard, ldocid) to the document title, keep the export columns
# (including the bucket), order by query / shard / rank, and shift queries
# back to TREC topic ids.
doc_lookup = documents[['docid', 'ldocid', 'shard', 'title']]
trecres = (
    results.merge(doc_lookup, on=['shard', 'ldocid'])
    .loc[:, ['query', 'shard', 'bucket', 'rank', 'score', 'title']]
    .sort_values(['query', 'shard', 'rank'])
    .assign(query=lambda df: df['query'] + 701)
)
pq.write_table(pa.Table.from_pandas(trecres), f'{feature_dir}/shard_results_b10.parquet')

## FAST Features

In [43]:
# FAST feature table: ground-truth shard score joined with every feature
# family. Each merge uses pandas' default join on the columns the two frames
# have in common.
fast = (
    gt_fast[['query', 'shard', 'shard_score']]
    .merge(shard_popularity[['shard', 'popularity']])
    .merge(taily_features[['query', 'shard', 'taily', 'invrank', 'binrank']])
    .merge(champion_lists)
    .merge(ql)
    .merge(qts)
    .merge(bilogfq)
)
# Shift zero-based query numbers back to TREC topic ids.
fast = fast.assign(query=fast['query'] + 701)

fast_sorted = fast.sort_values(['query', 'shard'])
fast_sorted.to_csv(f'{feature_dir}/fast.features.csv', index=False)

## FAST + GO

In [48]:
# Global-ordering feature: mean global docid of the documents in each shard.
gorank = (
    documents.groupby('shard')['docid']
    .mean()
    .rename('gorank')
    .reset_index()
)

# FAST features plus the per-shard gorank column.
fastgo = fast.merge(gorank, on='shard')
fastgo_sorted = fastgo.sort_values(['query', 'shard'])
fastgo_sorted.to_csv(f'{feature_dir}/fastgo.features.csv', index=False)

## FAST B=10

In [52]:
# B=10 feature table: per-bucket ground truth joined with the same feature
# families as FAST. Each merge uses pandas' default join on the columns the two
# frames have in common.
b10 = (
    gt_b10[['query', 'shard', 'bucket', 'shard_score']]
    .merge(shard_popularity[['shard', 'popularity']])
    .merge(taily_features[['query', 'shard', 'taily', 'invrank', 'binrank']])
    .merge(champion_lists)
    .merge(ql)
    .merge(qts)
    .merge(bilogfq)
)
# Shift zero-based query numbers back to TREC topic ids.
b10 = b10.assign(query=b10['query'] + 701)

b10_sorted = b10.sort_values(['query', 'shard', 'bucket'])
b10_sorted.to_csv(f'{feature_dir}/b10.features.csv', index=False)

# Evaluation

In [None]:
def load_mean_eval(path):
    """Read one evaluation CSV and average every column per budget."""
    return pd.read_csv(path).groupby('budget').mean().reset_index()


fastnn = load_mean_eval('/data/michal/experiments/oss/gov2-dai/eval/fast.NN.eval.csv')
fastgonn = load_mean_eval('/data/michal/experiments/oss/gov2-dai/eval/fastgo.NN.eval.csv')
b10nn = load_mean_eval('/data/michal/experiments/oss/gov2-dai/eval/b10.NN.eval.csv')

# P@10 against the number of shards, one curve per model. The plotting order
# is kept so that line colours and legend order do not change.
curves = [
    (fastgonn, 'B=1 FAST + With global ordering'),
    (fastnn, 'B=1 FAST'),
    (b10nn, 'B=10 FAST'),
]
fig, ax = plt.subplots(figsize=(16, 8))
for eval_frame, label in curves:
    ax.plot(eval_frame['shards'], eval_frame['p10'], label=label)
ax.legend()
ax.set_xlabel('shards')
ax.set_ylabel('P@10')
plt.show()