In [12]:
import re
import gensim
import os
import sys
import tarfile
import glob
import gzip
import pickle
from timeit import default_timer as timer
from nltk import word_tokenize
from nltk.corpus import stopwords
from datetime import datetime, timedelta
from nltk import distance 
import numpy as np
import pandas as pd
import fnmatch


In [37]:
"""
Import the background set, and put it to a pandas data frame. 
"""

root_dir  = os.path.abspath('..')
bg_set_dir = root_dir + '/data/background_set/'
dfs = []

"""
Load al the 10 files separetly, then append them to one df
"""

def make_data_frame(set_dir):
    for file in os.listdir(set_dir):
        if fnmatch.fnmatch(file, '*.out'):
            file_path =  set_dir + '/' + file
            dfs.append(pd.read_csv(file_path, delim_whitespace=True))

    df = dfs[0] # start
   
    for i in range(1,len(dfs)):
        df.append(dfs[i])

    # add col names
    df.columns = ["SessionId", "AnonID", "Query"]
    
    return df
    
bg_df = make_data_frame(root_dir + '/data/background_set/')
train_df = make_data_frame(root_dir + '/data/training_set/')
test_df = make_data_frame(root_dir + '/data/test_set/')
val_df = make_data_frame(root_dir + '/data/validation_set/')

In [166]:
"""
Demo for the apply function I made below

"""



s_id = 23422
query = '1482,244'

s_size = 4
# select the last query from a session
anchor_query =  bg_df[bg_df['SessionId'] == s_id].tail(1)

query_string =  anchor_query.iloc[0]['Query']
session_id = anchor_query.iloc[0]['SessionId']



# get all the session ID's where this query is occuring in
session_ids = bg_df.loc[bg_df['Query'] ==  query_string]['SessionId']

# make a df with all the sessions selected by the session ID's generated above
sessions = bg_df.loc[bg_df['SessionId'].isin(session_ids)]

# filter out the anocher query
sessions = sessions.loc[sessions['Query'] !=  query_string]

# get the total amount of queries over all the selected sessions 
amount_queries = sessions.shape[0]

# get the ferquentie how many time a query co-occur with the anoch query ordered
co_accur = sessions['Query'].value_counts()/ amount_queries
if co_accur.size > s_size:
    suggestions = co_accur.index.values


queries = [query_string]
queries.extend(suggestions[0:s_size]) 
session_id = [session_id  for i in range(len(queries))]


d = {"SessionID":session_id,"Query":queries}


df = pd.DataFrame(data=d)
df['Label'] = 0
df.loc[df['Query'] ==  query_string, ['Label']] = 1


df_empty = pd.DataFrame({'SessionID' : [], 'Query': [] , 'Label': []})
df_empty.append(df)

               Query  SessionID  Label
0           1482,244      23422      1
1  1482,244,1647,926      23422      0
2          12374,409      23422      0
3            213,244      23422      0
4              2,0,1      23422      0


Unnamed: 0,Label,Query,SessionID
0,1.0,1482244,23422.0
1,0.0,14822441647926,23422.0
2,0.0,12374409,23422.0
3,0.0,213244,23422.0
4,0.0,201,23422.0


In [185]:
def make_query_suggestions(session, bg_data, datat_frame, s_size=19):
    # select the last query from a session
   
    anchor_query =  session.tail(1)
    
    query_string =  anchor_query.iloc[0]['Query']
    session_id = anchor_query.iloc[0]['SessionId']
    
    # get all the session ID's where this query is occuring in
    session_ids = bg_data.loc[bg_data['Query'] ==  query_string]['SessionId']
    
    # make a df with all the sessions selected by the session ID's generated above
    sessions = bg_data.loc[bg_data['SessionId'].isin(session_ids)]
    
    # filter out the anocher query
    sessions = sessions.loc[sessions['Query'] !=  query_string]
    
    # get the total amount of queries over all the selected sessions 
    amount_queries = sessions.shape[0]
    
    co_accur = sessions['Query'].value_counts()/ amount_queries
    if co_accur.size > s_size:
        suggestions = co_accur.index.values
    
        queries = [query_string]
        queries.extend(suggestions[0:s_size]) 
        session_id = [session_id  for i in range(len(queries))]
    
        d = {"SessionID":session_id,"Query":queries}


        df = pd.DataFrame(data=d)
        df['Label'] = 0
        df.loc[df['Query'] ==  query_string, ['Label']] = 1
    
        datat_frame.append(df)


In [None]:
"""

For each sessie S= {Q1,.., QM} we want to preduct the target query QM. given the context Q1,...QM-1. 
We select 20 possible candidate queries, the true querie gets a label 1. 

How to select the 20 candidate queries? 

For each session in the training, validation and test set, we extract 20 queries that most 
likely follow the anchor query in the background data, i.e. with the highest ADJ score.
The session is included if and only if at least 20 queries have been extracted and the target query 
appears in the candidate list.

"""
query_suggestion_training = query_suggestion_test = query_suggestion_validation = pd.DataFrame({'SessionID' : [], 'Query': [] , 'Label': []})

grouped_training = train_df.groupby(['SessionId'])
test_training = test_df.groupby(['SessionId'])
val_training = val_df.groupby(['SessionId'])

grouped_training.apply(lambda session: make_query_suggestions(session, bg_df, query_suggestion_training, s_size=19))

test_training.apply(lambda session: make_query_suggestions(session, bg_df, query_suggestion_test, s_size=19))

val_training.apply(lambda session: make_query_suggestions(session, bg_df, query_suggestion_validation0, s_size=19))


In [None]:
"""
If we have for the test, training and validation set for every valid session the query suggestions. We need to make a 
feature vector for every query


17 features are used for ranking 

Candidate sollution:
    - For each candidate suggestion, we count how many times it follows 
    the anchor query in the background data and add this count as a feature.
    -  We use the frequency of the anchor query in the background data.
    -  We also add the Levenshtein distance between the anchor and the suggestion.
    - The suggestion length (charac- ters and words) and its frequency in the background set. 
    - we add 10 features corresponding to the character n-gram similarity between the suggestion and the 10 most recent queries in the context. 
    - We add the average Levenshtein distance between the suggestion and each query in the context
    - We use the scores estimated using the context-aware Query Variable Markov Model (QVMM)
    -  The proposed hierarchical recurrent encoder- decoder contributes one additional feature corresponding 
    to the log-likelihood of the suggestion given the context, as detailed in Section 3.4.

""""