In [59]:
import functools
import operator
import os
import shlex
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.preprocessing import StandardScaler

from dataproc.data import to_svmrank

In [18]:
# Directory holding the training feature files that the cells below read and write.
# A hard-coded absolute path ties the notebook to one machine, so the location can be
# overridden with the DAI_FEATURE_DIR environment variable; the default is unchanged.
feature_dir = os.environ.get('DAI_FEATURE_DIR',
                             '/data/michal/experiments/oss/cw09b-dai/features/train')

In [None]:
# Load the results file that every per-shard result list is joined against below.
# Only the four columns listed here are read from the Parquet file.
exhaustive_table = pq.read_table(
    f'{feature_dir}/train.OR.10k.results',
    columns=['query', 'rank', 'gdocid', 'score'],
)
exhaustive_results = exhaustive_table.to_pandas()
del exhaustive_table  # the Arrow table is not needed once converted to pandas

# Ground Truth

## FAST

In [42]:
# Ground truth per (query, shard): the number of a shard's results for a query that
# also occur in `exhaustive_results`. One result file is read per shard id in 0..122.
ground_truths = []
for shard in range(123):
    shard_results = pq.read_table(f'{feature_dir}/train.OR.10k#{shard}.results-1',
                                  columns=['query', 'gdocid']).to_pandas()
    # The left join keeps every shard result; `rank` is NaN for documents missing from
    # the exhaustive results and count() skips NaN, so the count is the overlap size.
    ground_truths.append(pd.merge(shard_results, exhaustive_results, on=['query', 'gdocid'], how='left')
                         .groupby('query')['rank']
                         .count()
                         .reset_index()
                         .assign(shard=shard))
    del shard_results
gt = pd.concat(ground_truths).rename(columns={'rank': 'shard_score'}).sort_values(['query', 'shard'])
# Rank the shards of each query by descending score. method='first' breaks ties by row
# order, which is shard order after the sort above. Only `shard_score` is ranked; the
# original ranked every column and then discarded all but this one.
gt['rank'] = gt.groupby('query')['shard_score'].rank(method='first', ascending=False)
# `np.int` was a plain alias of the builtin `int` and was removed in NumPy 1.24.
gt = gt.astype({'rank': int})
gt.to_csv(f'{feature_dir}/ground_truth.csv', index=False)
del ground_truths

## B = 10

In [56]:
# Ground truth per (query, shard, bucket): the number of results in each bucket of a
# shard that also occur in `exhaustive_results`. One file is read per shard id in 0..122.
ground_truths = []
for shard in range(123):
    shard_results = pq.read_table(f'{feature_dir}/train.OR.10k#{shard}.results-10',
                                  columns=['query', 'bucket', 'gdocid']).to_pandas()
    # The left join keeps every shard result; `rank` is NaN for documents missing from
    # the exhaustive results and count() skips NaN, so the count is the overlap size.
    ground_truths.append(pd.merge(shard_results, exhaustive_results, on=['query', 'gdocid'], how='left')
                         .groupby(['query', 'bucket'])['rank']
                         .count()
                         .reset_index()
                         .assign(shard=shard))
    del shard_results
gt_b10 = pd.concat(ground_truths).rename(columns={'rank': 'shard_score'}).sort_values(['query', 'shard', 'bucket'])
# Rank all (shard, bucket) rows of a query together by descending score. method='first'
# breaks ties by row order, i.e. (shard, bucket) order after the sort above. Only
# `shard_score` is ranked; the original ranked every column and discarded the rest.
gt_b10['rank'] = gt_b10.groupby('query')['shard_score'].rank(method='first', ascending=False)
# `np.int` was a plain alias of the builtin `int` and was removed in NumPy 1.24.
gt_b10 = gt_b10.astype({'rank': int})
gt_b10.to_csv(f'{feature_dir}/ground_truth_b10.csv', index=False)
del ground_truths

In [58]:
# Sanity check: column means of the bucketed ground truth, one row per bucket.
gt_b10.groupby('bucket').agg('mean')

Unnamed: 0_level_0,query,shard_score,shard,rank
bucket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5000.987855,0.766076,60.946607,603.689038
1,5000.623445,0.779839,60.946413,604.268424
2,5000.646211,0.757651,60.946521,606.876342
3,5000.995443,0.774476,60.946703,606.596544
4,5001.039243,0.798722,60.937937,607.023367
5,5000.827159,0.777437,60.95519,609.436472
6,5000.642178,0.781986,60.958894,611.042866
7,5001.071794,0.778437,60.958872,613.227744
8,5001.018779,0.781899,60.960126,619.108475
9,5000.636012,0.847903,60.826769,627.425227


# Shard Popularity

In [44]:
# Table joined on `gdocid` and grouped by `shard` in the next cell
# (presumably the document-to-shard assignment; confirm against the file).
clustering_strategy = pd.read_csv(filepath_or_buffer=f'{feature_dir}/strategy.csv')

In [47]:
# Shard popularity: attach a shard to every row of the exhaustive results via `gdocid`
# and count how many of those rows fall into each shard.
results_with_shard = exhaustive_results.merge(clustering_strategy, on='gdocid')
popularity_by_shard = results_with_shard.groupby('shard')['gdocid'].agg([('popularity', 'count')])
shard_popularity = popularity_by_shard.reset_index()
del results_with_shard, popularity_by_shard  # intermediates only
shard_popularity.to_csv(f'{feature_dir}/shard_popularity.csv', index=False)

# Term-Based Statistics

In [48]:
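# TODO: Taily features. The Train section reads `taily_features`, which is not defined anywhere in this notebook yet.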

In [50]:
champion_lists = pd.read_csv(f'{feature_dir}/champion-lists.csv')

# Query-likelihood features, one CSV per field, joined per (query, shard).
ql_body = pd.read_csv(f'{feature_dir}/query-likelihood.text.csv')
ql_title = pd.read_csv(f'{feature_dir}/query-likelihood.title.csv')
ql_anchor = pd.read_csv(f'{feature_dir}/query-likelihood.anchor.csv')
# The first join leaves clashing body columns unsuffixed on purpose: they then clash
# with the anchor columns in the second join and pick up '-body' / '-anchor' there.
# This assumes the three files share their feature column names; confirm against the CSVs.
ql = (ql_body
      .merge(ql_title, on=['query', 'shard'], suffixes=['', '-title'])
      .merge(ql_anchor, on=['query', 'shard'], suffixes=['-body', '-anchor']))
del ql_body, ql_title, ql_anchor

In [52]:
# Query-term statistics, one CSV per field, joined per (query, shard).
qts_body = pd.read_csv(f'{feature_dir}/query-term-stats.text.csv')
qts_title = pd.read_csv(f'{feature_dir}/query-term-stats.title.csv')
qts_anchor = pd.read_csv(f'{feature_dir}/query-term-stats.anchor.csv')
# The first join leaves clashing body columns unsuffixed on purpose: they then clash
# with the anchor columns in the second join and pick up '-body' / '-anchor' there.
# This assumes the three files share their feature column names; confirm against the CSVs.
qts = (qts_body
       .merge(qts_title, on=['query', 'shard'], suffixes=['', '-title'])
       .merge(qts_anchor, on=['query', 'shard'], suffixes=['-body', '-anchor']))
del qts_body, qts_title, qts_anchor

In [53]:
# Bigram log-frequency features; merged into the training table in the Train section.
bilogfq = pd.read_csv(filepath_or_buffer=f'{feature_dir}/bigram-log-frequency.text.csv')

# Cost Models

In [54]:
# Size-based cost model: each shard's cost is its size as a percentage of the total
# size of all shards, so the costs sum to 100.
shard_size_costs = pd.read_csv('/data/index/dai/shard-sizes.csv').assign(
    cost=lambda sizes: sizes['shard_size'] * 100 / sizes['shard_size'].sum()
)
# The raw sizes are dropped from the exported file; the in-memory frame keeps them.
(shard_size_costs
 .drop(columns=['shard_size'])
 .to_csv(f'{feature_dir}/shard-size-costs.csv', index=False))

# Train

In [None]:
# Build the training table: start from the ground-truth labels and join each feature
# table in turn. pd.merge is called without `on`, so every join uses whatever columns
# the two frames have in common.
# NOTE(review): `taily_features` is not defined anywhere in the visible notebook (the
# Taily cell is still a TODO), so this cell raises NameError on a fresh kernel.
df = functools.reduce(
    pd.merge,
    [
        shard_popularity[['shard', 'popularity']],
        taily_features[['query', 'shard', 'taily', 'invrank', 'binrank']],
        champion_lists,
        ql,
        qts,
        bilogfq,
    ],
    gt[['query', 'shard', 'shard_score']],
)
# Shift query ids up by one before export
# (presumably because the ranker expects ids starting at 1; confirm).
df['query'] += 1
to_svmrank(df.sort_values(['query', 'shard']), f'{feature_dir}/fast.features.svmrank')

In [None]:
def run_subprocess(cmd):
    """Run a command line given as a single string and wait for it to finish.

    The string is tokenised with ``shlex.split`` so quoted arguments (for example a
    path containing spaces) stay together; plain whitespace-separated commands are
    split exactly as before. No shell is involved.

    Args:
        cmd: The command and its arguments as one string.

    Returns:
        The ``subprocess.CompletedProcess`` for the finished command. The exit status
        is not checked here; inspect ``returncode`` on the result.
    """
    return subprocess.run(shlex.split(cmd))

# Train a ranking model on the exported features with the svm_rank_learn CLI.
# NOTE(review): -c is presumably the regularisation trade-off and -t 0 the linear
# kernel; confirm against the installed SVM-rank version.
model_path = f'{feature_dir}/fast.features.model'
learn_cmd = f'svm_rank_learn -c 1 -t 0 {feature_dir}/fast.features.svmrank {model_path}'
run_subprocess(learn_cmd)
# run_subprocess(f'svm_rank_classify {test_path} {model_path} {pred_path}')