# Synthesize search sessions from signals

This notebook synthesizes search sessions from the click-through rate (CTR) of the documents clicked in each query's search results. The assumption is that ordering a query's results by CTR roughly captures, in aggregate, the source search system's relevance ranking (including position bias and all other biases).

You can then check whether a document's CTR is above or below the average for its rank position (using a z-score), and use that z-score to translate the document to any other position.

This is intended for creating fake search session data for the examples in AI-Powered Search; it is not a replacement for logging real search sessions in your own search system.

In [1]:
! cd ../../data/retrotech && head signals.csv

import random
import pandas 
import numpy
import sys
sys.path.append('../..')
from aips import *
from ch11.session_gen import SessionGenerator
import os
from IPython.display import display,HTML

# Optional seeding, left disabled exactly as in the original run.
# Uncommenting these lines seeds Python's and NumPy's global random
# generators (presumably making generated sessions repeatable -- confirm
# that SessionGenerator draws from these global generators).
# seed = 8675309
# random.seed(seed)
# numpy.random.seed(seed)

# How many docs in one search page view?
DOCS_PER_SESSION = 15
# How many sessions to generate for each query?
NUM_SESSIONS = 5000

# Generate search sessions for these queries
QUERIES_TO_SIMULATE = [
    'dryer',
    'iphone',
    'ipad',
    'transformers dark of the moon',
]

"query_id","user","type","target","signal_time"
"u2_0_1","u2","query","nook","2019-07-31 08:49:07.3116"
"u2_1_2","u2","query","rca","2020-05-04 08:28:21.1848"
"u3_0_1","u3","query","macbook","2019-12-22 00:07:07.0152"
"u4_0_1","u4","query","Tv antenna","2019-08-22 23:45:54.1030"
"u5_0_1","u5","query","AC power cord","2019-10-20 08:27:00.1600"
"u6_0_1","u6","query","Watch The Throne","2019-09-18 11:59:53.7470"
"u7_0_1","u7","query","Camcorder","2020-02-25 13:02:29.3089"
"u9_0_1","u9","query","wireless headphones","2020-04-26 04:26:09.7198"
"u10_0_1","u10","query","Xbox","2019-09-13 16:26:12.0132"


In [2]:
# Build the session generator from the raw retrotech signals file.
# NOTE(review): min_query_count presumably restricts the generator to
# queries seen at least that many times -- confirm in ch11.session_gen.
session_gen = SessionGenerator(
    signals_path='../../data/retrotech/signals.csv',
    min_query_count=100,
)

# Generate one session for the example query; the returned value is
# discarded. Presumably this call is what populates `random_rankings`
# for the query -- verify against SessionGenerator.
session_gen(
    'transformers dark of the moon',
    num_docs=DOCS_PER_SESSION,
)

# Display the per-query entry of `random_rankings` as the cell output.
session_gen.random_rankings['transformers dark of the moon']

  signals = pandas.read_csv(signals_path)
  pop_query_events = signals[signals['type'] == 'query'][signals['target'].isin(popular_queries)]
  canonical_rankings[canonical_rankings['rank'] == i]['ctr'].mad()
  canonical_rankings[canonical_rankings['rank'] == i]['ctr'].mad()
  canonical = self.canonical_rankings[self.canonical_rankings['query'] == query][self.canonical_rankings['rank'] < num_docs]


Unnamed: 0,posn_ctr_mean,posn_ctr_std,dest_rank,posn_ctr_mad,posn_ctr_median
70426,0.118271,0.071206,1,0.054286,0.105263
70427,0.029738,0.018342,6,0.014968,0.029126
70428,0.061092,0.034642,3,0.027316,0.056936
70429,0.047945,0.028633,4,0.022603,0.044779
70430,0.01545,0.010633,10,0.009189,0.013268
70431,0.223624,0.179218,0,0.134325,0.165036
70432,0.08204,0.047044,2,0.03705,0.074324
70433,0.038391,0.023441,5,0.018783,0.03658
70434,0.020638,0.013713,8,0.011544,0.019349
70435,0.01774,0.011945,9,0.010182,0.016393


# Randomly sample source signals, generate new sessions

In [3]:
from time import perf_counter

# Generate NUM_SESSIONS sessions per query and write them to
# "<query>_sessions.gz" (a gzip-compressed CSV) in the working directory.
# NOTE(review): the query list is hard-coded here; QUERIES_TO_SIMULATE from
# the configuration cell is not used by this loop.
for query in ['transformers dark of the moon']:

    session_dfs = []
    t1_start = perf_counter()
    for i in range(NUM_SESSIONS):
        # Each call returns one session as a DataFrame (concatenated below).
        new_session = session_gen(query, use_median=True, dampen=1.0, num_docs=DOCS_PER_SESSION)
        session_dfs.append(new_session)

        # Report progress every 500 sessions, including the very first one.
        if i % 500 == 0:
            print("Created Sessions %s Last Query %s Elapsed %s" % (i, query, perf_counter()-t1_start))

    # Stack all sessions, ordered by session id and then by displayed rank.
    sessions = pandas.concat(session_dfs).sort_values(['sess_id', 'dest_rank'])

    # Export only the session columns, exposing 'dest_rank' as 'rank'.
    exported = sessions[['sess_id', 'query', 'dest_rank', 'clicked_doc_id', 'clicked']]
    exported = exported.rename(columns={'dest_rank': 'rank'})
    exported.to_csv("%s_sessions.gz" % query, compression='gzip', index=False)

  canonical = self.canonical_rankings[self.canonical_rankings['query'] == query][self.canonical_rankings['rank'] < num_docs]


Created Sessions 0 Last Query transformers dark of the moon Elapsed 0.02069779997691512
Created Sessions 500 Last Query transformers dark of the moon Elapsed 7.948518699966371
Created Sessions 1000 Last Query transformers dark of the moon Elapsed 16.04209959995933
Created Sessions 1500 Last Query transformers dark of the moon Elapsed 24.25959489995148
Created Sessions 2000 Last Query transformers dark of the moon Elapsed 32.66837129998021
Created Sessions 2500 Last Query transformers dark of the moon Elapsed 40.64491539995652
Created Sessions 3000 Last Query transformers dark of the moon Elapsed 48.351152800023556
Created Sessions 3500 Last Query transformers dark of the moon Elapsed 55.67575059994124
Created Sessions 4000 Last Query transformers dark of the moon Elapsed 62.9664745000191
Created Sessions 4500 Last Query transformers dark of the moon Elapsed 70.29577289998997


In [4]:
# Per-query document table held by the generator (CTR, rank, per-position
# statistics and z-scores, as shown in the cell output).
gset = session_gen.canonical_rankings

# NOTE(review): despite its name, `orig_dryer` holds the rows for the
# 'transformers dark of the moon' query, not 'dryer'.
is_example_query = gset['query'] == 'transformers dark of the moon'
orig_dryer = gset.loc[is_example_query]

# Show the documents whose canonical rank is below 20.
orig_dryer.loc[orig_dryer['rank'] < 20]

Unnamed: 0,index,query,clicked_doc_id,click_count,tot_query_count,ctr,rank,posn_ctr_mean,posn_ctr_std,posn_ctr_median,posn_ctr_mad,ctr_std_z_score,ctr_mod_z_score
70426,71842,transformers dark of the moon,97360810042,99,147,0.673469,0,0.223624,0.179218,0.165036,0.134325,2.510042,3.785084
70427,71844,transformers dark of the moon,97363560449,19,147,0.129252,1,0.118271,0.071206,0.105263,0.054286,0.154203,0.441894
70428,71835,transformers dark of the moon,25192107191,6,147,0.040816,2,0.08204,0.047044,0.074324,0.03705,-0.876281,-0.904408
70429,71838,transformers dark of the moon,47875841420,6,147,0.040816,3,0.061092,0.034642,0.056936,0.027316,-0.585299,-0.59012
70430,71846,transformers dark of the moon,786936817218,4,147,0.027211,4,0.047945,0.028633,0.044779,0.022603,-0.724128,-0.777255
70431,71840,transformers dark of the moon,47875842335,2,147,0.013605,5,0.038391,0.023441,0.03658,0.018783,-1.057352,-1.223175
70432,71848,transformers dark of the moon,47875841406,2,147,0.013605,6,0.029738,0.018342,0.029126,0.014968,-0.879573,-1.036929
70433,71833,transformers dark of the moon,24543701538,1,147,0.006803,7,0.024395,0.015588,0.024221,0.013035,-1.128557,-1.336316
70434,71834,transformers dark of the moon,24543750949,1,147,0.006803,8,0.020638,0.013713,0.019349,0.011544,-1.008924,-1.086823
70435,71836,transformers dark of the moon,36725235564,1,147,0.006803,9,0.01774,0.011945,0.016393,0.010182,-0.915664,-0.941942


In [5]:
# Print every distinct query present in the canonical rankings, one per line.
distinct_queries = gset['query'].unique()
for query in distinct_queries:
    print(query)

#
*
1080p
1196648
1342081 1342106 1342115 1342124
24
300
3547042
360
360 elite
3d
3d glasses
3d movies
3d tv
3ds
50 cent
8800
Acer
Acer iconia
Adele
Alarm clock
Alienware
Alpine
Amp
Amplifier
Android tablet
Antenna
Apple
Apple computer
Apple iPad
Apple keyboard
Apple laptop
Apple laptops
Apple tv
Arkham city
Asus
Asus laptop
Asus transformer
Batman
Batman arkham city
Batman year one
Battlefield
Battlefield 3
Beats
Beats by Dre
Beats by dr dre
Beats by dre
Beats headphones
Big bang theory
Blackberry
Blackberry playbook
Blink 182
Blu ray
Blu ray player
Blue tooth
Bluetooth
Bluetooth headphones
Bluetooth headset
Boardwalk empire
Boost mobile
Bose
Bose headphones
Bridesmaids
Cable modem
Call of duty
Camcorder
Camera
Cameras
Canon
Canon 7d
Canon camera
Captain America
Captain america
Car alarm
Car audio
Car radio
Car speakers
Car stereo
Car stereos
Carter 4
Cell phones
Coldplay
Computer
Computer speakers
Computers
D7000
DVD player
Dark souls
Dazzle
Dead island
Dell
Dell laptop
Dell laptops
