# Query Suggestion Exploration

## Import Libraries

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np
import re
import string
import time as timer
# Pre-compiled character class that matches any single ASCII punctuation mark;
# CreateCandidates uses it to strip punctuation from an incoming query.
regex = re.compile(
    '[%s]' % (re.escape(string.punctuation),)
)

## Import query log data

In [2]:
# Tab-separated query-log files that are loaded and concatenated into `ql`.
files = [
    'data/Clean-Data-01.txt',
    'data/Clean-Data-02.txt',
    'data/Clean-Data-03.txt',
    'data/Clean-Data-04.txt',
    'data/Clean-Data-05.txt',
]

In [3]:
# Read every tab-separated log file listed in `files` and stack the rows into a
# single DataFrame. Each file keeps its own row labels at this point.
ql = pd.concat(
    [pd.read_csv(log_path, sep='\t') for log_path in files]
)

In [4]:
# Order the log by AnonID and, within each AnonID, by QueryTime, so that the
# rows of one AnonID are contiguous and (presumably) chronological.
# NOTE(review): QueryTime has not been parsed to datetime yet at this point, so
# it is compared in whatever type read_csv produced - confirm that this
# ordering matches chronological order for the data files.
ql = ql.sort_values(by=['AnonID', 'QueryTime'])

In [5]:
# Replace the row labels with a fresh 0..n-1 range in the current (sorted)
# order; drop=True discards the old labels instead of keeping them as a column.
ql = ql.reset_index(drop=True)

In [6]:
# Remove every row that has a missing value in any column. This runs after the
# index was reset, so the remaining row labels may contain gaps.
ql = ql.dropna()

In [7]:
# Parse QueryTime into pandas datetimes so that the time difference between two
# log entries can be computed by subtraction later on.
ql['QueryTime'] = pd.to_datetime(ql['QueryTime'])

## Define some constants

For the given formula, we need to find values of some constants

### max_freq

Maximum frequency of any query in QL

In [8]:
# Occurrence count of every distinct query string in the log
# (a Series indexed by query string).
max_freq_list = ql['Query'].value_counts()

In [9]:
# Occurrence count of the most frequent query; Freq divides by this value to
# normalise a candidate's frequency.
max_freq = max_freq_list.max()

### max_session_length

In [11]:
# Longest span, in seconds, between the first and the last row of any single
# AnonID in `ql`. Time divides by this value to normalise a time gap.
#
# The value is hard-coded (presumably taken from an earlier run) because the
# loop that derives it filters the whole DataFrame once per AnonID. The loop is
# kept here for reference; it starts from max_session_length = 0:
#
#     for s in ql['AnonID'].unique():
#         session = ql[ql['AnonID'] == s]
#         length = session.iloc[len(session)-1]['QueryTime'] - session.iloc[0]['QueryTime']
#         length =  length / np.timedelta64(1, 's')
#         if length > max_session_length:
#             max_session_length = length
max_session_length = 7946741.0

## Pre-process some stuff

### Inverted index

Maps each query string to the list of `(row index, AnonID, QueryTime)` entries recording where it occurs in the query log

In [12]:
# Inverted index: query string -> list of (row index, id, time) postings, in
# the order the postings were added.
queryInvertedIndex = {}


def createIndex(ngram, index, id, time):
    """Record one occurrence of the query string ngram in queryInvertedIndex.

    Args:
        ngram: Query string used as the index key.
        index: Row label of the occurrence in the query log.
        id: Identifier stored with the occurrence (the caller passes AnonID).
        time: Time stored with the occurrence (the caller passes QueryTime).

    Returns:
        1 if ngram already had postings before this call, 0 if this call
        created its posting list.
    """
    posting = (index, id, time)
    already_indexed = ngram in queryInvertedIndex
    if already_indexed:
        queryInvertedIndex[ngram].append(posting)
    else:
        queryInvertedIndex[ngram] = [posting]
    return 1 if already_indexed else 0

In [13]:
# Feed every log row into the inverted index, in row order. A plain loop is
# used instead of DataFrame.apply because createIndex is called purely for its
# side effect: apply would build a throwaway Series of return values, and
# pandas does not guarantee exactly one call per row for functions that have
# side effects.
for row_label, logged_query, anon_id, query_time in zip(
    ql.index, ql['Query'], ql['AnonID'], ql['QueryTime']
):
    createIndex(str(logged_query), row_label, anon_id, query_time)
# Kept so the cell still emits the same single-space line as before.
print(" ")

 


### QueryCandidates

Maps each query to the set of candidate queries from the log that consist of that query followed by exactly one more word

In [14]:
# Candidate table: query string -> set of candidate queries recorded for it.
queryCandidates = {}


def createCandidates(q, cq):
    """Register cq as a candidate completion of q in queryCandidates.

    Nothing is recorded when q and cq are the same string. Registering the
    same pair more than once has no further effect because candidates are
    stored in a set.

    Args:
        q: Query string used as the key.
        cq: Candidate query to store under q.
    """
    if q != cq:
        queryCandidates.setdefault(q, set()).add(cq)

In [15]:
# Register every logged query as a candidate of the query obtained by removing
# its last space-separated word (a query without a space maps to itself, which
# createCandidates ignores). A plain loop is used instead of DataFrame.apply
# because createCandidates is called purely for its side effect and only the
# Query column is needed.
for logged_query in ql['Query']:
    full_query = str(logged_query)
    createCandidates(full_query.rsplit(' ', 1)[0], full_query)
# Kept so the cell still emits the same single-space line as before.
print(" ")

 


## Creating the Query Ranking Score Function

In [16]:
def Freq(CQ):
    """Return the normalised frequency of the candidate query CQ.

    The number of postings of CQ in queryInvertedIndex is divided by max_freq.

    Raises:
        KeyError: If CQ has no entry in queryInvertedIndex.
    """
    postings = queryInvertedIndex[CQ]
    return len(postings) / max_freq

In [33]:
# Scratch state shared by Mod and Time: Mod rebinds all three names on every
# call, and Time reads whatever the most recent Mod call left behind.
q_sessions = {}       # session id -> time of the last listed posting of q
CQ_sessions = {}      # session id -> time of the first listed posting of CQ
common_sessions = {}  # after Mod: list of session ids where CQ came after q


def Mod(CQ, q):
    """Return the fraction of q's sessions in which CQ was issued after q.

    A session is identified by the second element of a posting in
    queryInvertedIndex, and the posting's third element is its time. For q the
    time of the last listed posting per session is kept; for CQ the time of
    the first listed posting per session is kept. A session counts when the
    CQ time is strictly later than the q time.

    Side effects:
        Rebinds the module-level q_sessions, CQ_sessions and common_sessions,
        which Time reads afterwards.

    Args:
        CQ: Candidate query string.
        q: Original query string.

    Returns:
        0 if q has no postings; otherwise the number of qualifying sessions
        divided by the number of sessions that contain q.
    """
    global q_sessions, CQ_sessions, common_sessions

    # Start from a clean slate so nothing leaks over from the previous pair.
    CQ_sessions = {}
    common_sessions = {}

    # Later postings overwrite earlier ones: the last listed time per session wins.
    q_sessions = {
        session: query_time
        for _, session, query_time in queryInvertedIndex.get(q, ())
    }
    if not q_sessions:
        return 0

    # setdefault keeps the first listed time per session.
    for _, session, query_time in queryInvertedIndex.get(CQ, ()):
        CQ_sessions.setdefault(session, query_time)

    one_second = np.timedelta64(1, 's')
    common_sessions = [
        session
        for session in q_sessions.keys() & CQ_sessions.keys()
        if (CQ_sessions[session] - q_sessions[session]) / one_second > 0
    ]
    return len(common_sessions) / len(q_sessions)

In [34]:
def Time(CQ, q):
    """Return the shortest q -> CQ time gap, normalised by max_session_length.

    This function does not use its arguments directly: it reads the
    module-level common_sessions, CQ_sessions and q_sessions, so those names
    must already hold the state for the (CQ, q) pair being scored.

    Args:
        CQ: Candidate query string (unused; see above).
        q: Original query string (unused; see above).

    Returns:
        0 if common_sessions is empty; otherwise the smallest time difference
        in seconds over the common sessions, divided by max_session_length.
    """
    if not common_sessions:
        return 0
    # Take the true minimum rather than starting from a large sentinel, which
    # would silently cap the result for very long gaps.
    shortest_gap = min(
        (CQ_sessions[session] - q_sessions[session]) / np.timedelta64(1, 's')
        for session in common_sessions
    )
    return shortest_gap / max_session_length

In [35]:
def Score(CQ, q):
    """Return the ranking score of candidate CQ as a suggestion for query q.

    The score is (freq + mod + time) / (1 - min(freq, mod, time)), where the
    three features come from Freq, Mod and Time.

    Args:
        CQ: Candidate query string.
        q: Original query string.

    Returns:
        The score as a number. When all three features equal 1 the
        denominator is zero; float('inf') is returned in that case (the limit
        of the formula) instead of raising ZeroDivisionError.
    """
    freq = Freq(CQ)
    # Mod has to run before Time: Time reads the module-level session state
    # that Mod rebinds for this (CQ, q) pair.
    mod = Mod(CQ, q)
    time_score = Time(CQ, q)

    denominator = 1 - min(freq, mod, time_score)
    if denominator == 0:
        return float('inf')
    return (freq + mod + time_score) / denominator

In [51]:
def CreateCandidates(q):
    """Return every known candidate for q, ordered from best to worst score.

    q is lower-cased and stripped of punctuation (via the module-level regex)
    before it is looked up in queryCandidates.

    Args:
        q: Query string to find suggestions for.

    Returns:
        The candidates of the normalised query sorted by descending Score, or
        an empty list when the normalised query has no candidates.
    """
    q = regex.sub('', q.lower())
    if q not in queryCandidates:
        return []
    candidate_scores = {CQ: Score(CQ, q) for CQ in queryCandidates[q]}
    return sorted(candidate_scores, key=candidate_scores.get, reverse=True)

In [52]:
# Rank the candidates of every known query once, up front, so that serving a
# suggestion later is a plain dictionary lookup.
pre_calculated_suggestions = {
    known_query: CreateCandidates(known_query)
    for known_query in queryCandidates
}

In [57]:
def GetCandidates(q, n=10):
    """Return up to n pre-computed suggestions for the query q.

    Args:
        q: Query string, looked up verbatim in pre_calculated_suggestions.
        n: Maximum number of suggestions to return.

    Returns:
        The first n entries of the stored suggestion list for q, or an empty
        list when q has no stored suggestions (instead of raising KeyError).
    """
    return pre_calculated_suggestions.get(q, [])[:n]

In [58]:
# Example lookup: suggestions for 'tiger' with the default limit n=10.
GetCandidates('tiger') 

['tiger art',
 'tiger woods',
 'tiger animal',
 'tiger bedding',
 'tiger direct',
 'tiger tattoos',
 'tiger cartoon',
 'tiger animals',
 'tiger army',
 'tiger stadium']

In [59]:
# Example lookup: suggestions for 'google' with the default limit n=10.
GetCandidates('google')

['google translator',
 'google eth',
 'google artists',
 'google scholar',
 'google auction',
 'google calendar',
 'google globe',
 'google mail',
 'google kids',
 'google notifier']

In [61]:
# Example lookup with a two-word query; suggestions extend it by one word.
GetCandidates("how to")

['how to bodybuild',
 'how to sail',
 'how to projects',
 'how to draw',
 'how to kiss',
 'how to masturbate',
 'how to knit',
 'how to dance',
 'how to crochet',
 'how to rap']

## Save and Load Dictionary from file

In [66]:
import pickle

In [67]:
# save it into a file; the with-block closes the file, which also guarantees
# the pickled data is flushed to disk before it is read back.
with open("query_suggestions.p", "wb") as suggestions_file:
    pickle.dump(pre_calculated_suggestions, suggestions_file)

In [68]:
# Reload the suggestion table from disk; the with-block closes the file.
# NOTE: pickle.load can execute arbitrary code, so only load files that were
# produced by this notebook or another trusted source.
with open("query_suggestions.p", "rb") as suggestions_file:
    pre_calculated_suggestions = pickle.load(suggestions_file)

In [71]:
def GetCandidates(q, n=10):
    """Return up to n pre-computed suggestions for the query q.

    Args:
        q: Query string, looked up verbatim in pre_calculated_suggestions.
        n: Maximum number of suggestions to return.

    Returns:
        The first n entries of the stored suggestion list for q, or an empty
        list when q has no stored suggestions (instead of raising KeyError).
    """
    return pre_calculated_suggestions.get(q, [])[:n]

In [72]:
# Example lookup served from the suggestion table reloaded from
# query_suggestions.p, with the default limit n=10.
GetCandidates('what is the meaning of') 

['what is the meaning of qiyas',
 'what is the meaning of lina',
 'what is the meaning of life',
 'what is the meaning of work',
 'what is the meaning of tender',
 'what is the meaning of recipients',
 'what is the meaning of substitution',
 'what is the meaning of companion',
 'what is the meaning of cool',
 'what is the meaning of philosophy']

## What's needed for a python file

In [73]:
import pickle

# Load the pre-computed suggestion table; the with-block closes the file.
# NOTE: pickle.load can execute arbitrary code, so only load files that come
# from a trusted source.
with open("query_suggestions.p", "rb") as suggestions_file:
    pre_calculated_suggestions = pickle.load(suggestions_file)

def GetCandidates(q, n=10):
    """Return up to n pre-computed suggestions for the query q.

    Args:
        q: Query string, looked up verbatim in pre_calculated_suggestions.
        n: Maximum number of suggestions to return.

    Returns:
        The first n entries of the stored suggestion list for q, or an empty
        list when q has no stored suggestions (instead of raising KeyError).
    """
    return pre_calculated_suggestions.get(q, [])[:n]

In [74]:
# Example lookup using only the standalone snippet above (pickle load plus
# GetCandidates), with the default limit n=10.
GetCandidates('what is the meaning of') 

['what is the meaning of qiyas',
 'what is the meaning of lina',
 'what is the meaning of life',
 'what is the meaning of work',
 'what is the meaning of tender',
 'what is the meaning of recipients',
 'what is the meaning of substitution',
 'what is the meaning of companion',
 'what is the meaning of cool',
 'what is the meaning of philosophy']