In [79]:
import numpy as np
import cPickle
from collections import OrderedDict
import collections
from nltk.metrics import *

In [4]:

# Load the background data model. The file holds two pickled objects written
# back to back: first the tuple dict, then the query -> id dict.
# Binary mode keeps the pickle stream intact on every platform, and the `with`
# block closes the handle as soon as both objects have been read.
# NOTE: unpickling can execute arbitrary code -- only load model files you trust.
with open('data/bg_session.ctx_ADJ.mdl', 'rb') as input_handle:
    # load the tuple dict and the query dict (order matters: same order as dumped)
    tuple_dict = cPickle.load(input_handle)
    query_to_id = cPickle.load(input_handle)

In [36]:
# Inverted lookup table: query id -> query string (query_to_id maps the other way).
id_to_query = dict((query_id, query) for query, query_id in query_to_id.iteritems())

In [67]:
def open_data():
    """
    Open the validation, training and background session files.

    A file handle can only be iterated (e.g. with enumerate) once; after a
    full pass it is exhausted, so call this function again to get fresh
    handles before looping over the data a second time.

    Output: (train_session, val_sessions, bg_session) open file handles
    """
    # Opened in the order validation, training, background.
    paths = ('data/val_session.ctx', 'data/tr_session.ctx', 'data/bg_session.ctx')
    val_sessions, train_session, bg_session = [open(path, 'r') for path in paths]
    return train_session, val_sessions, bg_session

In [6]:
# The keys of the tuple dict are tuples of two query ids; collect them in a list.
tuple_pairs = list(tuple_dict)

In [7]:
# Re-index the pair counts by the second query id of every pair:
#
#     search_dict[anchor_query_id] = {previous_query_id: count_value}
#
# i.e. the outer key is element 1 of the pair, the inner key is element 0.
# NOTE(review): make_suggestions uses search_dict[anchor] as the candidate
# suggestions for that anchor -- confirm that the pair orientation stored in
# tuple_dict matches that use.
search_dict = collections.defaultdict(dict)

for _tuple in tuple_pairs:
    previous_id, anchor_id = _tuple[0], _tuple[1]
    search_dict[anchor_id][previous_id] = tuple_dict[_tuple]

In [37]:
def print_suggestion(suggestions):
    """
    Print the suggested queries as strings, one per line.

    Input: iterable of suggestions whose first element is a query id
           (looked up in the id_to_query map)
    """
    for suggestion in suggestions:
        query_id = suggestion[0]
        print(id_to_query[query_id])

In [55]:
def make_suggestions(session_file, recent_queries=1,num_suggestions=20):
    """
    Function that makes suggestions for a session

    A session (one line of tab-separated queries) is kept only when
      * it has at least recent_queries + 1 queries, and never fewer than
        two (a target and an anchor are always needed),
      * the anchor query (second to last) is in query_to_id and its id is a
        key of search_dict,
      * the target query (last) is in query_to_id,
      * the anchor has MORE than num_suggestions candidates in search_dict,
      * the id of the target query is one of those candidates.

    Input: session file, *.ctx (any iterable of lines works)
    Output: dict with key:session_idx value: (target_query,anchor_query, session, suggestions)
            where suggestions is a list of num_suggestions (query_id, count)
            tuples, highest count first.

    NOTE(review): the target is checked against ALL candidates of the anchor,
    not only the top num_suggestions that are returned, so the target is not
    guaranteed to be in the returned list -- confirm this is intended.
    NOTE(review): the original comment asked for "at least" num_suggestions
    candidates while the test is strictly greater; the test is kept as-is.
    """
    # session[-2] is read below, so a session needs two queries even when
    # recent_queries is 0 (this used to raise IndexError).
    min_session_length = max(recent_queries + 1, 2)

    # dict to save all the results
    suggestion_dict = {}

    # loop over every session in the *.ctx file
    for idx, line in enumerate(session_file):
        # queries are tab-separated
        session = line.strip().split('\t')
        if len(session) < min_session_length:
            continue

        target_query = session[-1]  # target query is the last query Qm
        anchor_query = session[-2]  # anchor query is the query Qm-1

        # both the anchor and the target must be known in the background set
        if anchor_query not in query_to_id or target_query not in query_to_id:
            continue
        anchor_key = query_to_id[anchor_query]
        if anchor_key not in search_dict:
            continue

        # all queries paired with the anchor in the background set, with counts
        candidates = search_dict[anchor_key]
        if len(candidates) <= num_suggestions:
            continue
        if query_to_id[target_query] not in candidates:
            continue

        # Valid session: rank the candidates by count, highest first. The list
        # is sorted ascending and then reversed (instead of reverse=True) so
        # that the order of equal counts stays exactly as before.
        ranked = sorted(candidates.items(), key=lambda pair: pair[1])[::-1]
        suggestion_dict[idx] = (target_query, anchor_query, session,
                                ranked[:num_suggestions])
    return suggestion_dict

In [72]:
# Get fresh file handles: a handle that was already iterated to the end yields
# no more lines, so the files are re-opened before they are looped over again.
train_session, val_sessions, bg_session = open_data() # reload the data

In [56]:


# dicts with results: session index -> (target_query, anchor_query, session, suggestions)
# for the training sessions and the validation sessions.
# Each call iterates its file handle, so the handles must not have been consumed before.
suggestion_train = make_suggestions(train_session)
suggestion_val = make_suggestions(val_sessions)

In [73]:
# TODO (Jorg): we need the .ctx file of the background (bg) set:

def make_query_frequncies(session_file):
    """
    Count how often every query occurs in a session file.

    Input: session file with string queries (any iterable of lines, each line
           holding the tab-separated queries of one session)
    Output: dict with the query frequencies, query string -> count (as float)
    """
    query_freq = {}
    for session in session_file:
        # surrounding whitespace/newline is dropped before splitting on tabs
        for query in session.strip().split('\t'):
            query_freq[query] = query_freq.get(query, 0.) + 1.
    return query_freq

# Query frequencies over the background sessions (consumes the bg_session handle).
query_freq = make_query_frequncies(bg_session)

In [85]:
def count_letter_ngram(sentence, n=3):
    """
    Return the set of distinct character n-grams of a sentence.

    Despite the name this returns the n-grams themselves, not a count.

    Input: sentence (string), n = length of the n-grams
    Output: set of all substrings of length n of the stripped sentence; when
            the sentence is shorter than n, the set of its single elements
    """
    if len(sentence) < n:
        # Too short for even one n-gram: fall back to the distinct elements.
        return set(sentence)
    # Strip once and use the stripped text for BOTH the loop bound and the
    # slices; slicing the unstripped sentence shifted the n-grams whenever the
    # sentence started with whitespace.
    text = sentence.strip()
    return set(text[k:k + n] for k in range(len(text) - n + 1))

def matches(ng1, ng2):
    """
    Number of elements the two n-gram sets have in common.
    """
    shared = ng1 & ng2
    return len(shared)

def n_gram_sim(query1, query2,n=3):
    """
    n-gram similarity between two queries: the number of distinct character
    n-grams (see count_letter_ngram) that occur in both queries.
    """
    first_grams = count_letter_ngram(query1, n)
    second_grams = count_letter_ngram(query2, n)
    return matches(first_grams, second_grams)

def make_n_gram_sim_features(context_queries,suggestion):
    """
    For one suggestion make the n-gram similarity with the context queries.

    Input: context_queries (iterable of query strings), suggestion (query string)
    Output: list of exactly 10 similarities; position i holds the 3-gram
            similarity with the i-th used context query, unused positions are 0

    At most 10 context queries are used. With more than 10 the LAST 10 are
    taken (this case used to raise IndexError).
    NOTE(review): taking the last 10 assumes context_queries is ordered from
    oldest to most recent -- confirm against the caller.
    """
    max_context = 10
    # only do this for at most 10 context queries
    recent_context = list(context_queries)[-max_context:]

    n_sim = [0] * max_context
    for idx, context_query in enumerate(recent_context):
        n_sim[idx] = n_gram_sim(suggestion, context_query, n=3)
    return n_sim

In [86]:
def make_suggestion_features(suggestion_dict, num_features=17):
    """
    Function that computes the features for every suggestion of every session

    Input: suggestion_dict, session index ->
           (target_query, anchor_query, session, suggestions) as built by
           make_suggestions; every suggestion is a (query_id, count) tuple
    Output: nothing yet.

    TODO: the features are computed but not collected or returned; the
    intended output is a feature matrix per session. num_features is not used
    yet and the HRED score is still a placeholder.
    """
    for session_tuple in suggestion_dict.values():
        anchor_query = session_tuple[1]
        # the session without its last query (the target)
        context_queries = session_tuple[2][:-1]
        suggestions = session_tuple[3]

        for suggestion in suggestions:
            suggestion_id = suggestion[0]
            query_string = id_to_query[suggestion_id]

            # For each candidate suggestion, the count stored with it in the
            # background data (how often it is paired with the anchor query).
            follow_anchor_count = suggestion[1]

            # Frequency in the background data.
            # NOTE(review): the original comment describes the frequency of
            # the ANCHOR query, but the lookup uses the suggestion string --
            # confirm which one is intended.
            bg_freq = query_freq[query_string]

            # The Levenshtein distance between the anchor and the suggestion.
            levenshtein_distance = edit_distance(anchor_query, query_string)

            # The suggestion length (characters and words)
            chars_leng = len(query_string)
            word_leng = len(query_string.split())

            # 10 features corresponding to the character n-gram similarity
            # between the suggestion and the context queries. The suggestion
            # STRING is passed here; passing the (id, count) tuple compared
            # the context against the id and the count instead of the text.
            n_gram_features = make_n_gram_sim_features(context_queries, query_string)

            # HRED Score (not available yet)
            hred_score = None

In [87]:
# Compute the features for all training suggestions.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, so the
# run was stopped by hand -- presumably because it takes very long; confirm.
make_suggestion_features(suggestion_train, num_features=17)

KeyboardInterrupt: 