In [1]:
import numpy as np
import cPickle
from collections import OrderedDict
import collections
from nltk.metrics import *
import operator
import os
from PST_engine import PSTInfer, PST
"""
    we assume this ipython notebook resides in the following git-repo directory structure:
    ir2
    |---src
        |----preprocessing
                ipython notebook
        |----data
             | -- bg_session.ctx
                  tr_session.ctx
                  ...
        |----baseline (directory of Allesandro programs)
        

"""

DATA_PATH = '../data/'
# raw session files, query words, tab separated queries, space separated words, one line=one session
bg_session_filename = os.path.join(DATA_PATH, 'bg_session.ctx')
val_session_filename = os.path.join(DATA_PATH,'val_session.ctx')
test_session_filename = os.path.join(DATA_PATH,'test_session.ctx')
tr_session_filename = os.path.join(DATA_PATH,'tr_session.ctx')
# query frequency dict of the background data
bg_query_freq_file = os.path.join(DATA_PATH, 'bg_query_freq.pkl')
# ADJ model filename of background data (generated with Allesandro programs)
bg_ADJ_model_filename = os.path.join(DATA_PATH, 'bg_session.ctx_ADJ.mdl')
# VMM model filename of background data (generated with Allesandro programs)
VMM_model_file = os.path.join(DATA_PATH, 'bg_session.ctx_VMM.mdl')
# after loading it once, we can save it to pickle and load pickle file next time...which is quicker
VMM_model_pickle = os.path.join(DATA_PATH, 'bg_pstreeVMM.pkl')
# candidate filename, used to store the dict that holds the sessions & candidate queries for test/val/train
tr_sess_candid_f = os.path.join(DATA_PATH, 'tr_suggest.pkl')
val_sess_candid_f = os.path.join(DATA_PATH, 'val_suggest.pkl')
test_sess_candid_f = os.path.join(DATA_PATH, 'test_suggest.pkl')

In [5]:
"""
Func to print the suggested query id's as strings using the id_to_query map
"""
def print_suggestion(suggestions):
    for suggest in suggestions:
        print id_to_query[suggest[0]]
        
        
"""
    Save a dictionary to file
"""
def save_pickle_dict(a_dict, output_file):
    f = open(output_file, 'wb')
    cPickle.dump(a_dict, f)
    print("Successfully saved dict to %s" % output_file)
    f.close()

"""
    make a inverted version of the query to id dict
""" 
def make_id_to_query_dict(q_to_id_dict):
    return {v: k for k, v in q_to_id_dict.iteritems()}

"""
Make a query frequency dictionary of the background data set
we need this dict for one of the features: 
    -- the frequency of an anchor query in the background data set
Input: session file with string queries
Output: dict with the query frequencies 
"""
def make_query_frequencies(session_file):
    query_freq = {}
    total_freq = 0
    for num, session in enumerate(session_file):
        session = session.strip().split('\t')
        for query in session:
            query_freq[query] = query_freq.get(query, 0.) + 1.
            total_freq += 1
    print("Successfully made background frequency dictionary")
    return query_freq

"""
    load the 2 dicts from the ADJ model that we generated with Allesandro programs
    we need these dicts to generate the candidate queries for a test/train/val files
"""
def load_ADJ_model_dicts(filename):
    print("Loading ADJ model from file %" % filename)
    input_handle = open(filename, 'r')
    tuple_dict = cPickle.load(input_handle)
    query_to_id = cPickle.load(input_handle)
    print("Successfully loaded ADJ model dicts")
    print("\t %d entries in tuple dict" % len(tuple_dict))
    print("\t %d entries in query_to_id dict" % len(query_to_id))
    
    return tuple_dict, query_to_id

"""
make a new dict with key anchor query, as value we have a new dict with keys previous query and 
their value count 

dict[anchor_query] = { previous_query: count_value}

"""
def make_search_dict(tuple_dict):
    search_dict = collections.defaultdict(dict)
    # use the keys (tuples with two query id's) of the tuple dict to make a new dict 
    tuple_pairs = tuple_dict.keys()
    
    print("Start making search dict...")
    for _tuple in tuple_pairs:
        search_dict[_tuple[1]][_tuple[0]] = tuple_dict[_tuple] 
    print("Successfully made search dict")
    del tuple_pairs
    
    return search_dict

# ADJ_tuple_dict, query_dict = load_ADJ_model_dicts(bg_ADJ_model_filename)
bg_query_freq = make_query_frequencies(open(bg_session_filename, 'r'))
save_pickle_dict(bg_query_freq, bg_query_freq_file)

# finally make queryID to query-words dict
id_to_query = make_id_to_query_dict(query_dict)
search_dict = make_search_dict(ADJ_tuple_dict)
print(" ---------->>> READY let's start <<<-----------")

Successfully made background frequency dictionary
Successfully saved dict to ../data/bg_query_freq.pkl
Start making search dict...
Successfully made search dict
 ---------->>> READY let's start <<<-----------


In [30]:
"""
Function that makes suggestions for a session

Input: session file, *.ctx
Output: dict with key:session_idx value: (target_query,anchor_query, session, suggestions)

"""

def print_suggestions(session, anchor_query, candidates):
    global id_to_query
    
    print("session ", session)
    print("anchor_query ", anchor_query)
    for query_1, query_2 in candidates:
        print("query 1 ", id_to_query[query_1])
        

def make_suggestions(session_file, min_sess_length=1, max_sess_length=50, num_suggestions=20, early_stop=False):
    global query_dict
    global search_dict
    # make a dict to save all the results
    suggestion_dict = {}
    c = 1
    num_sessions = 0
    # loop over every session in the *.ctx file
    for idx, line in enumerate(session_file):
        # queries are tab-separated 
        session = line.strip().split('\t')
        
        # we also have to limit the session length because we can't generate a HRED log-likehood score
        # for sessions that are too long (memory problems)
        if len(session) >= min_sess_length+1 and len(session) <= max_sess_length:
            target_query = session[-1] # target query is the last query Qm
            anchor_query = session[-2] # Anchor query is the query Qm-1
            context = session[:-1] # Qm-1 till Q1 are the context queries
            
            # find anchor in the background set
            if anchor_query in query_dict:
                key =  query_dict[anchor_query] # the key of the query in the bg-set 
                # check if target query and anchor query are in the background set
                if key in search_dict and target_query in query_dict:
                    """
                    We could use the search dict to find all the queries that follow the anchor query 
                    in the bg set, we use this queries as suggestions
                    """
                    suggestions = search_dict[key]
                    if len(suggestions) > num_suggestions: # we need at least 20 suggestions 
                        target_key = query_dict[target_query] # find the key of the target query
                        list_suggestions = [(key, suggestions[key] )for key in suggestions.keys()]
                        # sort list of tuples by second tuple entry which is the frequency count
                        # also reverse order so it is in descending order
                        sorted_suggestions = sorted(list_suggestions, key=lambda x: x[1])[::-1]
                        #take only the top 20 suggestions based on counts 
                        suggestions = sorted_suggestions[0:num_suggestions]
                        # final check, is the target query really in the set of suggestions? 
                        if target_key in (x[0] for x in suggestions): 
                            # we have a valid session, now we list all the suggestions and sort them
                            # save this in the dict key(idx):(target_query,anchor_query, session, suggestions)
                            suggestion_dict[idx] = (target_query,anchor_query, session, suggestions)
                            # print_suggestions(session, anchor_query, suggestions)
                            num_sessions += 1
        if num_sessions > 3 and early_stop:
            print("Break")
            break
            
    return suggestion_dict

In [31]:
def generate_suggestions(p_type="tr", min_sess_length=5, max_sess_length=50, 
                         num_suggestions=20, load_existing=False, early_stop=False):
    """
        find for a specific session file (train/test/val) all the corresponding
        suggestions and store result in a dict for later processing
        This procedures differs between experiments 4.4, 4.5, 4.6 in the paper
    """
    global test_session_filename, tr_session_filename, val_session_filename
    global tr_sess_candid_f, test_sess_candid_f, val_sess_candid_f
    
    if p_type == 'tr':
        # training
        session_file = tr_session_filename
        output_file = tr_sess_candid_f
    elif p_type == 'val':
        # validation
        session_file = val_session_filename
        output_file = val_sess_candid_f
    else:
        # test sessions
        session_file = test_session_filename
        output_file = test_sess_candid_f
        
    if not load_existing:
    
        print("Generating suggestion queries for session file %s" % session_file)
        suggestion_dict = make_suggestions(open(session_file, 'r'), 
                                           min_sess_length=min_sess_length,
                                           max_sess_length=max_sess_length,
                                           num_suggestions=num_suggestions,
                                           early_stop=early_stop)

        print("Successfully generated suggestions for %d sessions" % len(suggestion_dict))
        save_pickle_dict(suggestion_dict, output_file)
    else:
        print("Loading suggestion queries from file %s" % output_file)
        suggestion_dict = cPickle.load(open(output_file ,'rb'))
        print("Successfully loaded suggestions")
        
    return suggestion_dict


In [96]:
"""
    we use the following output files
    (1) training
        tr_sess_candid_f
    (2) validation
        val_sess_candid_f
        
    (3) test
        test_sess_candid_f
        
    if called with load_existing=True the dict will be loaded from an earlier saved pickle file
"""
suggestion_dict = generate_suggestions(p_type="tr", load_existing=False, early_stop=False)


Generating suggestion queries for session file ../data/tr_session.ctx
Successfully generated suggestions for 24032 sessions
Successfully saved dict to ../data/tr_suggest.pkl


In [9]:
def load_VMM_model(filename, load_saved_model=False):
    # load the VMM model made with Allesandro's Probabilistic Suffix Tree (PST)
    # currently the context scope is limited to D=2 which means the tuple dict contains
    # tuples with max lenght of 3 (so the memory span is look 2 queries ahead)
    global VMM_model_pickle
    
    if not load_saved_model:
        print("Loading VMM model from %s" % filename)
        print("Patient, this will take a while (approx 5 minutes)")
        pstree = PSTInfer()
        pstree.load(filename)
        save_pickle_dict(pstree, VMM_model_pickle)
    else:
        print("Loading VMM model from pickle %s" % VMM_model_pickle)
        pstree = cPickle.load(open(VMM_model_pickle, 'rb'))
        
    print("======== READY ===========")
    return pstree

pstreeVMM = load_VMM_model(VMM_model_file, load_saved_model=True)

Loading VMM model from pickle ../data/bg_pstreeVMM.pkl


In [10]:
def count_letter_ngram(sentence, n=3):
    """
    How many n-grams fits in this sentenec 
    """
    if len(sentence) < n:
        return set(sentence)
    local_counts = set()
    for k in range(len(sentence.strip()) - n + 1): 
        local_counts.add(sentence[k:k+n])
    return local_counts

def matches(ng1, ng2):
    """
    For both n-gram sets how many sim elements they contain
    """
    return len(ng1 & ng2)

def n_gram_sim(query1, query2,n=3):
    """
    return n-gram similarity between two queries 
    """
    return matches(count_letter_ngram(query1, n), count_letter_ngram(query2, n))

def make_n_gram_sim_features(context_queries,suggestion):
    """
    For every suggestion make the n-gram similarity for the context queries (at most 10)
    """
    n_sim = [0] * 10
    for idx, context_query in enumerate(context_queries):
        if idx >=10:
            """
            only do this for at most 10 context queries 
            """
            break
        n_sim[idx] = n_gram_sim(suggestion, context_query,n=3)
        
    return n_sim


def get_VMM_score(session, suggestion, no_normalize=False, fallback=False):
    global pstreeVMM
    """
    For every suggestion determine the VMM score (variable memory Markov score)
    """
    
    _, scores = pstreeVMM.rerank(session, suggestion, no_normalize=no_normalize, fallback=fallback)
    
    return scores[0]

In [97]:
def prepare_files_hred_score(suggestion_dict, out_dir, suffix='tr'):
    """
        in order to obtain the HRED scores from Allesandro's model we need to feed the score.py 
        with two input files (note raw data = the acutal words)
        (1) sessions, tab separated queries, space separated words
        (2) candidates belonging to that session, so on each line tab-separated the candidate queries 
        
        the procedures generates both output files based on the input suggestion dictionary.
        Remember, that dicionary contains per entry all necessary information:
        
        out_dir is just the directory where to write to
        
    """
    global pstreeVMM
    
    session_f = os.path.join(out_dir, suffix + "_hred_sess.ctx")
    candid_f = os.path.join(out_dir, suffix + "_hred_cand.ctx")
    print("Writing 2 output files")
    print(session_f)
    print(candid_f)
    
    with open(session_f, 'w') as sess, open(candid_f, 'w') as cand:
        for session_key in suggestion_dict.keys():
            # tuple 
            session_tuple = suggestion_dict[session_key]
            target_query = session_tuple[0]
            context_queries = session_tuple[2][:-1]
            anchor_query = session_tuple[1]
            anchor_query_id = pstreeVMM.query_to_id[anchor_query]
            suggestions = session_tuple[3]
            queries = []
            for idx, suggestion in enumerate(suggestions):
                suggestion_id = suggestion[0]
                query_words = pstreeVMM.id_to_query[suggestion_id]
                queries.append(query_words)

            sess.write("\t".join(context_queries) + "\n")
            cand.write("\t".join(queries) + "\n")
        
    print("-----> Ready <------")
"""
    make the files that we need to generate the HRED log-likelihood scores
"""      
prepare_files_hred_score(suggestion_dict, DATA_PATH, suffix='tr') 

Writing 2 output files
../data/tr_hred_sess.ctx
../data/tr_hred_cand.ctx
-----> Ready <------


In [94]:
"""
    count number of lines in a file
"""
def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

"""
Function that returens a feature vector for every suggestion 

Input: suggestion_dict
Output: per session a matrix [17,20] with the feature vectors 
"""

def make_suggestion_features(suggestion_dict, hred_ll_file, num_features=17, do_test=False):
    global pstreeVMM
    global bg_query_freq
    global query_dict
    
    num_of_candidates = 20
    # Number of lines in HRED file must be equal to, -1 because of header line
    lines_in_file = file_len(hred_ll_file)  - 1
    expected_count = len(suggestion_dict) * num_of_candidates
    print("Lines match? %d = %d" % (lines_in_file, expected_count))
    
    assert lines_in_file == expected_count
    
    c = 0
    """
        matrix_out is the final numpy matrix. The layout is as follows:
        dim0 = number of sessions which is actually equal to the size of the suggestions dict
               because we determined for each session from the tr, test, val session file if it passes
               the requirements, 20 suggestions
        dim1 = col0=anchor query ID
               col1=suggestion/candidate query ID
               col2-19: 18 features (the last column of these 18 is the HRED log-likelihood)
               col20: label, which basically is zero except for the target query ID, which is one of the 
                      suggestion queries.
    """
    
    feature_dim = num_features + 3
    matrix_out = np.zeros((len(suggestion_dict) * 20, feature_dim))
    session_id = 0
    sess_less_cand = 0
    
    with open(hred_ll_file, 'r') as hred_ll:
        # read header line
        print("HRED-header ", hred_ll.readline() )
        for session_key in suggestion_dict.keys():
            # tuple 
            session_tuple = suggestion_dict[session_key]
            target_query = session_tuple[0]
            target_id = query_dict[target_query]
            context_queries = session_tuple[2][:-1]
            anchor_query = session_tuple[1]
            anchor_query_id = pstreeVMM.query_to_id[anchor_query]
            suggestions = session_tuple[3]
            VMM_scores = []
            candidates = []
            # create an empty matrix for this session, which stores the num_of_candidates rows
            # we're doing this because at the end of the session we will sort this matrix
            # so that the target query (the positive candidate) is the first row in the session matrix
            session_matrix = np.zeros((num_of_candidates, feature_dim))
            if len(suggestions) == 20:
                for idx, suggestion in enumerate(suggestions):

                    suggestion_id = suggestion[0]
                    query_string = pstreeVMM.id_to_query[suggestion_id]

                    session_matrix[idx, 0] = anchor_query_id
                    session_matrix[idx, 1] = suggestion_id
                    """"
                    For each candidate suggestion, we count how many times it follows 
                    the anchor query in the background data and add this count as a feature.
                    """
                    follow_anchor_count = suggestion[1]
                    session_matrix[idx, 2] = follow_anchor_count
                    """
                    Additionally, we use the frequency of the anchor query in the background data.
                    """
                    bg_freq = bg_query_freq[query_string]
                    session_matrix[idx, 3] = bg_freq
                    """
                    We also add the Levenshtein distance between the anchor and the suggestion.
                    """
                    levenshtein_distance = edit_distance(anchor_query, query_string)
                    session_matrix[idx, 4] = levenshtein_distance
                    """
                    The suggestion length (characters and words)
                    """
                    chars_leng = len(query_string) 
                    session_matrix[idx, 5] = chars_leng
                    word_leng = len(query_string.split())
                    session_matrix[idx, 6] = word_leng

                    """
                    We add 10 features corresponding to the character n-gram similarity 
                    between the suggestion and the 10 most recent queries in the context.
                    """
                    n_gram_sim =  make_n_gram_sim_features(context_queries, query_string)
                    session_matrix[idx, 7:17] = n_gram_sim
                    candidates.append(query_string)

                    VMM_score = get_VMM_score(context_queries, [query_string])
                    VMM_scores.append(VMM_score)
                    session_matrix[idx, 17] = VMM_score

                    """
                    HRED Score
                    """
                    hred_score = float(hred_ll.readline())
                    session_matrix[idx, 18] = hred_score

                    if target_id == suggestion_id:
                        session_matrix[idx, 19] = 1
                    else:
                        session_matrix[idx, 19] = 0

                # ok, let's sort the session matrix first on the last column, the label (0/1) so that the target
                # query is the first row
                # session_matrix = session_matrix[session_matrix[:, feature_dim-1].argsort()[::-1]]
                # let's parse the session_matrix into our final output matrix
                start = session_id * feature_dim
                end   = start + feature_dim

                matrix_out[start:end, :] = session_matrix
                # if anchor_query == "jesse mccartney":
                    # print("session ", context_queries)
                    # print("anchor_query ", anchor_query)
                    # print("candidates ", candidates)
                    # print("VMM scores ", VMM_scores)
                    # print(matrix_out[session_id:20,feature_dim-1])
                    # break
                assert np.sum(matrix_out[start:end, 19] == 1) == 1, query_string + " " + str(target_id) + " " + str(anchor_query_id)
                session_id += 1
                if session_id % 1000 == 0:
                    print("Progress, session id %d" % session_id)
            else:
                sess_less_cand += 1
                
            if session_id > 10 and do_test:
                break

        
    print("Session with less than 20 candidates %d" % sess_less_cand)
    return matrix_out

In [99]:
"""
    Please note, the output files of HRED with the ll-score are called:
    Experiments 4.4.1 (base)
        tr_hred_score_exp4_4_1.f
        val_hred_score_exp4_4_1.f
        test_hred_score_exp4_4_1.f
"""

hred_ll_file = os.path.join(DATA_PATH, 'tr_hred_score_exp_4_4_1.f')
output_matrix = make_suggestion_features(suggestion_dict, hred_ll_file, num_features=17, do_test=False)
np.savez(os.path.join(DATA_PATH, "train_suggest_matrix"), output_matrix)

Lines match? 480640 = 480640
('HRED-header ', '0_HED_1479568981.18\n')
Progress, session id 1000
Progress, session id 2000
Progress, session id 3000
Progress, session id 4000
Progress, session id 5000
Progress, session id 6000
Progress, session id 7000
Progress, session id 8000
Progress, session id 9000
Progress, session id 10000
Progress, session id 11000
Progress, session id 12000
Progress, session id 13000
Progress, session id 14000
Progress, session id 15000
Progress, session id 16000
Progress, session id 17000
Progress, session id 18000
Progress, session id 19000
Progress, session id 20000
Progress, session id 21000
Progress, session id 22000
Progress, session id 23000
Progress, session id 24000
Session with less than 20 candidates 0


In [87]:
np.unique(output_matrix[:, 10])

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
        33.,  34.,  36.,  39.])