In [56]:
from sklearn import linear_model
import gensim
import operator
import string
import random
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [12]:
def load_data(filename):
    """Read labelled questions from a ``,,,``-separated text file.

    Every non-blank line is expected to look like ``<question> ,,, <label>``.
    The line is split on the first ``,,,`` only, so a label may itself contain
    the separator. Both parts are stripped of surrounding whitespace.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of ``(question, label)`` string tuples, in file order.

    Raises:
        ValueError: If a non-blank line does not contain the ``,,,`` separator.
    """
    res = []
    with open(filename, 'r') as f:
        for line_number, line in enumerate(f, 1):
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no example; skip them instead of failing on the unpack.
            if not line.strip():
                continue
            question, separator, label = line.partition(",,,")
            if not separator:
                # Same exception type the tuple-unpack used to raise, but with
                # enough context to locate the offending line.
                raise ValueError(
                    "%s:%d: expected '<question>,,,<label>', got %r"
                    % (filename, line_number, line))
            res.append((question.strip(), label.strip()))
    return res

In [57]:
def precision(y_true, y_pred, strategy='weighted'):
    """Return the precision of ``y_pred`` measured against ``y_true``.

    Thin wrapper around ``metrics.precision_score``; ``strategy`` is passed
    through unchanged as its ``average`` argument.
    """
    score = metrics.precision_score(y_true, y_pred, average=strategy)
    return score

def recall(y_true, y_pred, strategy='weighted'):
    """Return the recall of ``y_pred`` measured against ``y_true``.

    Thin wrapper around ``metrics.recall_score``; ``strategy`` is passed
    through unchanged as its ``average`` argument.
    """
    score = metrics.recall_score(y_true, y_pred, average=strategy)
    return score

def f1_score(y_true, y_pred, strategy='weighted'):
    """Return the F1 score of ``y_pred`` measured against ``y_true``.

    Thin wrapper around ``metrics.f1_score``; ``strategy`` is passed through
    unchanged as its ``average`` argument.
    """
    score = metrics.f1_score(y_true, y_pred, average=strategy)
    return score

In [59]:
def training_error(y_true, y_pred):
    """Summarise prediction quality as a dict of three scores.

    Calls ``precision``, ``recall`` and ``f1_score`` (in that order) with
    their default averaging strategy and returns the results under the keys
    ``"precision"``, ``"recall"`` and ``"f1-score"``.
    """
    scorers = (
        ("precision", precision),
        ("recall", recall),
        ("f1-score", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

In [42]:
# Paths and sizes shared by the cells below.
# NOTE(review): word_vector_path is not referenced by any live code visible in
# this notebook -- the load call below reads the hard-coded "data/glove.txt"
# instead. Confirm which file is intended.
word_vector_path = "data/glove_new.txt"
training_data_path = "data/train.txt"
# Length of the zero vector create_vector() starts from; assumed to equal the
# dimensionality of the loaded embeddings -- TODO confirm.
vector_dim = 50
# Disabled one-off conversion step (gensim's glove2word2vec script). As written
# it would produce "data/glove2.txt", which is also not the file loaded below.
# gensim.scripts.glove2word2vec.glove2word2vec(word_vector_path, "data/glove2.txt")
# Load the embeddings from a text-format (binary=False) word2vec file.
word_vector = gensim.models.KeyedVectors.load_word2vec_format("data/glove.txt", binary=False)

In [5]:
# Question-type label strings used by the classifier.
# Presumably these match the label column of the training file -- verify
# against data/train.txt.
WHEN_TYPE        = 'when'
WHAT_TYPE        = 'what'
WHO_TYPE         = 'who'
AFFIRMATIVE_TYPE = 'affirmation'
UNKNOWN_TYPE     = 'unknown'

# Complete label set; the LabelEncoder in the training cell is fitted on this
# list, so its integer codes are derived from exactly these strings.
ALL_TYPES = [WHEN_TYPE, WHAT_TYPE, WHO_TYPE, AFFIRMATIVE_TYPE, UNKNOWN_TYPE]

In [6]:
def create_vector(question, vectors=None, dim=None):
    """Turn a question string into a fixed-size feature vector.

    The question is split on single spaces and every token is lower-cased
    before lookup. The features are then built as follows:

    * first token not in the vocabulary -> all-zero vector;
    * exactly one token -> that token's vector;
    * second token not in the vocabulary -> the first token's vector;
    * question starts with "what is" -> mean of the vectors of *all* tokens
      that are in the vocabulary (unknown tokens are skipped);
    * anything else -> mean of the first two tokens' vectors.

    Vectors are accumulated with numpy rather than ``map(operator.add, ...)``
    so the function behaves the same on Python 2 and Python 3 and always
    returns an ``ndarray`` (never a plain list or a ``map`` object).

    Args:
        question: The question text.
        vectors: Mapping from token to embedding that raises ``KeyError`` for
            unknown tokens. Defaults to the module-level ``word_vector``.
        dim: Length of the returned vector. Defaults to the module-level
            ``vector_dim``.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``dim``.
    """
    if vectors is None:
        vectors = word_vector
    if dim is None:
        dim = vector_dim

    # str.split(" ") always yields at least one token ('' for an empty
    # string), so tokens[0] is safe; '' is simply treated as out-of-vocabulary
    # when the mapping does not contain it.
    tokens = [token.lower() for token in question.split(" ")]

    total = np.zeros(dim)
    try:
        total += vectors[tokens[0]]
        if len(tokens) == 1:
            return total
        total += vectors[tokens[1]]
    except KeyError:
        # Whatever was accumulated before the failed lookup: zeros if the
        # first token is unknown, the first token's vector if the second is.
        return total

    if tokens[:2] != ['what', 'is']:
        return total / 2.0

    # "what is ..." questions: the first two words carry no signal about the
    # answer type, so average over every known token of the question instead.
    total = np.zeros(dim)
    known = 0
    for token in tokens:
        try:
            total += vectors[token]
        except KeyError:
            continue
        known += 1
    if known == 0:
        return total
    return total / float(known)

In [60]:
# Load the labelled training questions and define a tiny hand-written sanity set.
train_data = load_data(training_data_path)
test_data = [
    ("What's your name?", "what"),
    ("When is the show happening?", "when"),
    ("Is there a cab available for airport?", "affirmation"),
    ("What time does the train leave?", "when"),
]

# One feature vector per training question.
question_vectors = np.asarray([create_vector(line[0]) for line in train_data])

# Fit the encoder on the full label set (not just the labels present in the
# training file) so the integer codes do not depend on the training data.
encoder = LabelEncoder()
encoder.fit(ALL_TYPES)
train_labels = encoder.transform([line[1] for line in train_data])

clf = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs')
clf.fit(question_vectors, train_labels)

# create_vector lower-cases every token itself, so the training features can
# be reused for the training-set predictions instead of being rebuilt, and no
# extra .lower() is needed on the test questions.
train_data_prediction = clf.predict(question_vectors)
test_data_prediction = encoder.inverse_transform(
    clf.predict([create_vector(line[0]) for line in test_data]))

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(training_error(train_labels, train_data_prediction))
print(test_data_prediction)

{'recall': 0.94351042367182247, 'f1-score': 0.94298249678919555, 'precision': 0.94323655353861802}
['what' 'when' 'affirmation' 'when']
