# Intersect common `QID`s from all three logs and sample `n` conversations from the common qids

In [8]:
import pandas as pd
import csv 
import sys
import numpy as np

In [2]:

def read_synthlog(file_name, nrows=None):
    if file_name.endswith('gz'):
        compression = 'gzip'
    else :
        compression = None
    
    synth_df = pd.read_csv(file_name, sep='\t', compression=compression,
                            names=['prefix_str', 'query_id'],
                            nrows=nrows, quoting=csv.QUOTE_NONE)
    synth_df.dropna(subset=['prefix_str'], inplace=True)
    synth_df['prefix_len'] = synth_df.prefix_str.str.len()
    return synth_df[~(synth_df.prefix_len == 0)]

def find_synth_end(synth_log):
    """ Return a dataframe containing only the last interaction from each 
    conversation in synth_log.
    """
    end_df = synth_log.index.to_series().groupby(
                            synth_log['query_id']).last().reset_index(name='end_idx')
    synth_end = synth_log[synth_log.index.isin(end_df.end_idx)]
    return synth_end

def load_synthlog(file_name, nrows=None):
    synth_log = read_synthlog(file_name, nrows)
    synth_end = find_synth_end(synth_log)
    return synth_log, synth_end

def print_basic_stats(log_df, log_end):
    print("Number of conversations: ", len(log_end))
    print("Number of interactions: ", len(log_df))
    # conv_lendf = conversation_lengths(log_df)
    # print("Median length of conversations: ", conv_lendf.conv_len.median())
    # print("Median length of partial queries ", log_df.prefix_len.median())
    print("Partial query lengths stats\n", log_df.prefix_len.describe())
    print("Unique partial queries: ", len(log_df.prefix_str.unique()))
    print("Unique final partial queries: ", len(log_end.prefix_str.unique()))



In [3]:
nrows = 1000

bing_log, bing_end = load_synthlog('../../../synth_log/data/wiki-synthlog.tsv', nrows=nrows)
wikisynth, wiki_end = load_synthlog('../../../synth_log/data/wiki-synthlog.tsv', nrows=nrows)
cwebsynth, cweb_end = load_synthlog('../../../synth_log/data/cweb-synthlog.tsv', nrows=nrows)


In [4]:
print("Basic BingLog stats")
print("="*40)
print_basic_stats(bing_log, bing_end)
print("\n\nBasic WikiSynth stats")
print("="*40)
print_basic_stats(wikisynth, wiki_end)
print("\n\nBasic CwebSynth stats")
print("="*40)
print_basic_stats(cwebsynth, cweb_end)

Basic BingLog stats
('Number of conversations: ', 176)
('Number of interactions: ', 990)
('Partial query lengths stats\n', count    990.000000
mean       8.379798
std        7.541137
min        1.000000
25%        3.000000
50%        6.000000
75%       12.000000
max       61.000000
Name: prefix_len, dtype: float64)
('Unique partial queries: ', 861)
('Unique final partial queries: ', 172)


Basic WikiSynth stats
('Number of conversations: ', 176)
('Number of interactions: ', 990)
('Partial query lengths stats\n', count    990.000000
mean       8.379798
std        7.541137
min        1.000000
25%        3.000000
50%        6.000000
75%       12.000000
max       61.000000
Name: prefix_len, dtype: float64)
('Unique partial queries: ', 861)
('Unique final partial queries: ', 172)


Basic CwebSynth stats
('Number of conversations: ', 176)
('Number of interactions: ', 990)
('Partial query lengths stats\n', count    990.000000
mean       8.379798
std        7.541137
min        1.000000
25%    

## Find intersection of qids

In [5]:
def find_common_qids(bing_log, wikisynth, cwebsynth):
    common_df = wikisynth.merge(cwebsynth, on='query_id', how='inner')\
                    .merge(bing_log, on='query_id', how='inner')
    common_df.dropna(inplace=True)
    common_qids = common_df.query_id.unique()
    return common_qids

In [6]:
common_qids = find_common_qids(bing_log, wikisynth, cwebsynth)
print "Common qids ", len(common_qids)

Common qids  176


In [9]:
np.random.choice(common_qids, 5)

array(['2018-08-13_93_m', '2018-08-13_86_d', '2018-08-13_56_m',
       '2018-08-13_81_m', '2018-08-13_35_m'], dtype=object)

## Sort and sample the logs

In [10]:
def filter_and_sample(log_df, log_end, common_qids, n):
    qid_sample = np.random.choice(common_qids, n)
    log_df = log_df[log_df.query_id.isin(qid_sample)]
    log_end = find_synth_end(log_df)
    return log_df, log_end

In [11]:
bing_log, bing_end = filter_and_sample(bing_log, bing_end, common_qids, 10)
wikisynth, wiki_end = filter_and_sample(wikisynth, wiki_end, common_qids, 10)
cwebsynth, cweb_end = filter_and_sample(cwebsynth, cweb_end, common_qids, 10)

In [12]:
print("Basic BingLog stats")
print("="*40)
print_basic_stats(bing_log, bing_end)
print("\n\nBasic WikiSynth stats")
print("="*40)
print_basic_stats(wikisynth, wiki_end)
print("\n\nBasic CwebSynth stats")
print("="*40)
print_basic_stats(cwebsynth, cweb_end)

Basic BingLog stats
('Number of conversations: ', 10)
('Number of interactions: ', 83)
('Partial query lengths stats\n', count    83.000000
mean     10.855422
std       9.162666
min       1.000000
25%       3.000000
50%       9.000000
75%      16.000000
max      37.000000
Name: prefix_len, dtype: float64)
('Unique partial queries: ', 80)
('Unique final partial queries: ', 10)


Basic WikiSynth stats
('Number of conversations: ', 10)
('Number of interactions: ', 53)
('Partial query lengths stats\n', count    53.000000
mean      7.981132
std       7.617012
min       1.000000
25%       3.000000
50%       6.000000
75%      10.000000
max      30.000000
Name: prefix_len, dtype: float64)
('Unique partial queries: ', 53)
('Unique final partial queries: ', 10)


Basic CwebSynth stats
('Number of conversations: ', 10)
('Number of interactions: ', 44)
('Partial query lengths stats\n', count    44.000000
mean      7.659091
std       6.072979
min       1.000000
25%       3.000000
50%       5.500000

## Export 

In [None]:
def export_logdf(log_df, file_name):
    file_stub = file_name.split('.')
    export_file = file_stub[0] + '-sample.tsv'
    log_df.to_csv(export_file, sep='\t')