In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the train / test / validation CSV splits from one data directory.
# NOTE: `path` is an absolute, machine-specific location.
path = '/Users/kimts/Workspace/data/ubuntu/'
train_csv, test_csv, valid_csv = (
    os.path.join(path, filename)
    for filename in ("train.csv", "test.csv", "valid.csv")
)
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)
validation_df = pd.read_csv(valid_csv)
# One label per test row, all 0. The evaluation cells compare these labels with
# positions inside `test_df.iloc[x, 1:]`, so this presumably marks the first
# candidate column as the correct response -- TODO confirm against the dataset.
y_test = np.zeros(test_df.shape[0])

In [3]:
def evaluate_recall(y, y_test, k=1):
    """Return the fraction of examples whose label is among the first k predictions.

    Args:
        y: sequence of per-example predictions; each entry must support
            slicing (`[:k]`) and membership tests.
        y_test: sequence of labels, paired positionally with `y`.
        k: number of leading predictions inspected per example.

    Returns:
        float: number of hits divided by `len(y)`.

    Raises:
        ZeroDivisionError: if `y` is empty.
    """
    # zip() stops at the shorter input, but the denominator is always len(y).
    hits = sum(1 for ranked, target in zip(y, y_test) if target in ranked[:k])
    return hits / float(len(y))

In [4]:
def predict_random(context, utterances):
    """Return a uniformly random ranking of the candidate utterances.

    Args:
        context: unused; accepted so the signature matches the other predictors.
        utterances: sized collection of candidate responses.

    Returns:
        numpy.ndarray: every index in `range(len(utterances))` exactly once,
        in random order (empty when there are no candidates).
    """
    # Rank however many candidates were supplied instead of assuming exactly
    # 10; with 10 candidates this is still a random ordering of 0..9.
    return np.random.permutation(len(utterances))

In [5]:
# Baseline: give every test row a random ranking of its candidate columns
# (all columns after the first), then report recall at several cut-offs.
y_random = [
    predict_random(test_df.Context[row], test_df.iloc[row, 1:].values)
    for row in range(len(test_df))
]
for n in (1, 2, 5, 10):
    recall_at_n = evaluate_recall(y_random, y_test, n)
    print("Recall @ ({}, 10): {:g}".format(n, recall_at_n))

Recall @ (1, 10): 0.0991543
Recall @ (2, 10): 0.199894
Recall @ (5, 10): 0.501374
Recall @ (10, 10): 1


In [6]:
# Load pre-trained word vectors into a {token: vector} dictionary.
# Each line of the file is parsed as a token followed by its vector
# components, separated by whitespace.
w2v_path = '/Users/kimts/Workspace/code/chatbot-retrieval/data/glove.6B.100d.txt'
w2v = {}
with open(w2v_path, "rb") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()  # split once per line, on ASCII whitespace
        if not fields:
            continue  # tolerate blank lines instead of raising IndexError
        # Decode the token so the keys are text and can match tokenizer output
        # on both Python 2 and Python 3 (assumes the file is UTF-8 encoded).
        token = fields[0].decode("utf-8")
        # An explicit list keeps this a numeric 1-D array even where map() is lazy.
        w2v[token] = np.array([float(value) for value in fields[1:]])

In [7]:
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

class TFIDFPredictor:
    """Ranks candidate utterances by the dot product of their TF-IDF vectors
    with the TF-IDF vector of the context."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def train(self, data):
        """Fit the vectorizer on every Context and Utterance text in `data`.

        Args:
            data: object exposing `Context` and `Utterance` columns with a
                `.values` attribute (e.g. a pandas DataFrame).
        """
        self.vectorizer.fit(np.append(data.Context.values, data.Utterance.values))

    def predict(self, context, utterances):
        """Rank `utterances` from most to least similar to `context`.

        Args:
            context: a single text.
            utterances: iterable of candidate texts.

        Returns:
            numpy.ndarray: candidate indices ordered by descending score.
        """
        # Convert context and utterances into tfidf vectors (sparse matrices)
        vector_context = self.vectorizer.transform([context])
        vector_doc = self.vectorizer.transform(utterances)
        # Use the sparse matrix's own .dot(): NumPy's np.dot is not aware of
        # scipy sparse types. The result has one score per candidate.
        scores = vector_doc.dot(vector_context.T).toarray().ravel()
        # Sort by top results and return the indices in descending order
        return np.argsort(scores, axis=0)[::-1]

    
class TFIDFGlovePredictor:
    """Ranks candidate answers by cosine similarity between IDF-weighted
    averages of word vectors for the context and for each answer."""

    def __init__(self, word2vec):
        """
        Args:
            word2vec: mapping from token to its embedding vector (numpy array).
        """
        self.vectorizer = TfidfVectorizer()
        self.word2vec = word2vec

    def train(self, data):
        """Fit the vectorizer on `data` and derive per-token IDF weights.

        Args:
            data: object exposing `Context` and `Utterance` columns with a
                `.values` attribute (e.g. a pandas DataFrame).
        """
        self.vectorizer.fit(np.append(data.Context.values, data.Utterance.values))

        idf = self.vectorizer.idf_
        max_idf = max(idf)

        # Tokens missing from the fitted vocabulary fall back to the largest
        # IDF seen, i.e. they are weighted like the rarest known token.
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, idf[i]) for w, i in self.vectorizer.vocabulary_.items()])

        # `vectorizer.analyzer` is only the constructor setting; the callable
        # that tokenizes text is obtained from build_analyzer().
        self._analyze = self.vectorizer.build_analyzer()

        # Shape of the zero vector used for texts with no known token: taken
        # from the embeddings themselves, 100 only if the mapping is empty.
        self._vector_shape = (100,)
        for sample_vector in self.word2vec.values():
            self._vector_shape = np.shape(sample_vector)
            break

    def _embed(self, text):
        """Return the mean IDF-weighted word vector of `text` (zeros if no token is known)."""
        weighted = [self.word2vec[w] * self.word2weight[w]
                    for w in self._analyze(text) if w in self.word2vec]
        if not weighted:
            return np.zeros(self._vector_shape)
        return np.mean(weighted, axis=0)

    def predict(self, context, answer_list):
        """Rank `answer_list` from most to least similar to `context`.

        Must be called after `train`.

        Args:
            context: a single text.
            answer_list: sized collection of candidate answer texts.

        Returns:
            numpy.ndarray: candidate indices ordered by descending cosine
            similarity (empty when there are no candidates).
        """
        if len(answer_list) == 0:
            return np.array([], dtype=int)

        context_vector = self._embed(context)
        answer_vectors = np.array([self._embed(answer) for answer in answer_list])

        # Rank on the similarity values themselves: one row (the context),
        # one column per candidate answer.
        scores = cosine_similarity(context_vector.reshape((1, -1)), answer_vectors)
        scores = np.asarray(scores).flatten()
        # Sort by top results and return the indices in descending order
        return np.argsort(scores, axis=0)[::-1]

In [8]:
# Fit the TF-IDF predictor on the training split, rank the candidate columns
# (all columns after the first) of every test row, then report recall.
pred = TFIDFPredictor()
pred.train(train_df)
y = [
    pred.predict(test_df.Context[row], test_df.iloc[row, 1:].values)
    for row in range(len(test_df))
]
for n in (1, 2, 5, 10):
    recall_at_n = evaluate_recall(y, y_test, n)
    print("Recall @ ({}, 10): {:g}".format(n, recall_at_n))

Recall @ (1, 10): 0.495032
Recall @ (2, 10): 0.596882
Recall @ (5, 10): 0.766121
Recall @ (10, 10): 1


In [12]:
# Fit the IDF-weighted word-vector predictor on the training split, rank the
# candidate columns of every test row, then report recall.
# NOTE: this rebinds `y`, replacing the rankings stored by any earlier cell.
w2v_pred = TFIDFGlovePredictor(w2v)
w2v_pred.train(train_df)
y = [
    w2v_pred.predict(test_df.Context[row], test_df.iloc[row, 1:].values)
    for row in range(len(test_df))
]
for n in (1, 2, 5, 10):
    recall_at_n = evaluate_recall(y, y_test, n)
    print("Recall @ ({}, 10): {:g}".format(n, recall_at_n))

In [10]:
# Show the ranking stored for the first test row. Written as a call so it is
# valid on Python 2 and 3 and matches the other print(...) calls in this file.
print(y[0])

[6 7 0 4 1 2 8 3 9 5]


In [11]:
# Report recall for whichever rankings are currently bound to `y`.
for n in (1, 2, 5, 10):
    recall_at_n = evaluate_recall(y, y_test, n)
    print("Recall @ ({}, 10): {:g}".format(n, recall_at_n))

Recall @ (1, 10): 0.495032
Recall @ (2, 10): 0.596882
Recall @ (5, 10): 0.766121
Recall @ (10, 10): 1
