In [191]:
import pandas as pd
import numpy as np
import os
import sys
import nltk
from nltk.corpus import stopwords

In [26]:
# GLOBALS
LOCAL_DATA_ROOT = '/Users/varunn/Documents/projects/pretrained_word_vectors/'
GLOVE_PATH = LOCAL_DATA_ROOT + 'glove.6B/'
GLOVE_INP_FN = GLOVE_PATH + 'glove.6B.50d.txt'
OUT_PATH = '/Users/varunn/Documents/NLP-data/'
VOCAB_FILE = OUT_PATH+'vocabulary_file_w2v.txt'

In [9]:
count = 1
for line in open(GLOVE_INP_FN):
    if count <= 5:
        print(line)
    else:
        break
    count += 1

the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581

, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392

. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.423

# Entity Extraction

In [170]:
class Embedding(object):
    def __init__(self,vocab_file,vectors_file, vocab_flag=True):
        if vocab_flag:
            words = []
            with open(vocab_file, 'r') as f:
                lines = [x.rstrip().split('\n') for x in f.readlines()]
                lines = [x[0] for x in lines]
                for line in lines:
                    current_words = line.split(' ')
                    words = list(set(words) | set(current_words))
            

        with open(vectors_file, 'r') as f:
            vectors = {}
            for line in f:
                vals = line.rstrip().split(' ')
                vectors[vals[0]] = [float(x) for x in vals[1:]]
        
        
        if not vocab_flag:
            words = vectors.keys()
        
        vocab_size = len(words)
        vocab = {w: idx for idx, w in enumerate(words)}
        ivocab = {idx: w for idx, w in enumerate(words)}

        vector_dim = len(vectors[ivocab[0]])
        W = np.zeros((vocab_size, vector_dim))
        for word, v in vectors.items():
            if (word == '<unk>') | (word not in vocab):
                continue
            W[vocab[word], :] = v
            

        # normalize each word vector to unit variance
        W_norm = np.zeros(W.shape)
        d = (np.sum(W ** 2, 1) ** (0.5))
        W_norm = (W.T / d).T
        
        if vocab_flag:
            for i in range(W.shape[0]):
                x = W[i, :]
                if sum(x) == 0:
                    W_norm[i, :] = W[i, :]

        self.W = W_norm
        self.vocab = vocab
        self.ivocab = ivocab

In [181]:
def find_similar_words(embed,text,refs):

    C = np.zeros((len(refs),embed.W.shape[1]))

    for idx, term in enumerate(refs):
        if term in embed.vocab:
            C[idx,:] = embed.W[embed.vocab[term], :]


    tokens = text.split(' ')
    scores = [0.] * len(tokens)

    for idx, term in enumerate(tokens):
        if term in embed.vocab:
            vec = embed.W[embed.vocab[term], :]
            cosines = np.dot(C,vec.T)
            score = np.mean(cosines)
            scores[idx] = score
    
    print(scores)

    return tokens[np.argmax(scores)]

In [182]:
examples = ["i am looking for a place in the north of town",
            "looking for indian restaurants",
            "Indian wants to go to an italian restaurant",
            "show me chinese restaurants",
            "show me chines restaurants in the north",
            "show me a mexican place in the centre",
            "i am looking for an indian spot called olaolaolaolaolaola",
            "search for restaurants",
            "anywhere in the west",
            "anywhere near 18328",
            "I am looking for asian fusion food",
            "I am looking a restaurant in 29432",
            "I am looking for mexican indian fusion",
            "central indian restaurant"]

In [183]:
examples = [x.lower() for x in examples]

In [184]:
fn = open(OUT_PATH+'vocabulary_file_w2v.txt', 'w')
for example in examples:
    fn.write(example)
    fn.write('\n')
fn.close()

In [185]:
embed = Embedding(VOCAB_FILE, GLOVE_INP_FN, False)

In [186]:
print(embed.W.shape)
print(len(embed.vocab))

(400000, 50)
400000


In [204]:
test_example1 = 'looking for spanish restaurants'
test_example2 = 'looking for indian restaurants'
test_example3 = 'looking for south indian restaurants'
test_example4 = 'I want to find a chettinad restaurant'
test_example5 = 'chinese man looking for a indian restaurant'
refs = ["mexican","chinese","french","british","american"]
threshold = 0.2

In [205]:
# With stopwords
for example in [test_example1, test_example2, test_example3,
                test_example4, test_example5]:
    example = example.lower()
    print('text: ', example)
    print(find_similar_words(embed,example,refs))
    print('\n')

text:  looking for spanish restaurants
[0.5009760990196709, 0.5917668769198282, 0.651118669525621, 0.3510683709582437]
spanish


text:  looking for indian restaurants
[0.5009760990196709, 0.5917668769198282, 0.5738119115027638, 0.3510683709582437]
for


text:  looking for south indian restaurants
[0.5009760990196709, 0.5917668769198282, 0.5489459404006024, 0.5738119115027638, 0.3510683709582437]
for


text:  i want to find a chettinad restaurant
[0.36844053629464035, 0.4487847824103165, 0.560535816434504, 0.4564699816910438, 0.5600945573541087, -0.18182957623293405, 0.3889199208317097]
to


text:  chinese man looking for a indian restaurant
[0.6660608244765405, 0.512190629636525, 0.5009760990196709, 0.5917668769198282, 0.5600945573541087, 0.5738119115027638, 0.3889199208317097]
chinese




In [206]:
# With stopwords
stop = set(stopwords.words('english'))
for example in [test_example1, test_example2, test_example3,
                test_example4, test_example5]:
    print('text: ', example)
    example = " ".join([x.lower() for x in nltk.word_tokenize(example)
               if x not in stop])
    print(find_similar_words(embed,example,refs))
    print('\n')

text:  looking for spanish restaurants
[0.5009760990196709, 0.651118669525621, 0.3510683709582437]
spanish


text:  looking for indian restaurants
[0.5009760990196709, 0.5738119115027638, 0.3510683709582437]
indian


text:  looking for south indian restaurants
[0.5009760990196709, 0.5489459404006024, 0.5738119115027638, 0.3510683709582437]
indian


text:  I want to find a chettinad restaurant
[0.36844053629464035, 0.4487847824103165, 0.4564699816910438, -0.18182957623293405, 0.3889199208317097]
find


text:  chinese man looking for a indian restaurant
[0.6660608244765405, 0.512190629636525, 0.5009760990196709, 0.5738119115027638, 0.3889199208317097]
chinese




In [209]:
find_similar_words(embed, 'fish food', refs)

[0.3736672532256827, 0.45149458923102903]


'food'

# Intent Detection

In [210]:
import numpy as np

def sum_vecs(embed,text):

    tokens = text.split(' ')
    vec = np.zeros(embed.W.shape[1])

    for idx, term in enumerate(tokens):
        if term in embed.vocab:
            vec = vec + embed.W[embed.vocab[term], :]
    return vec


def get_centroid(embed,examples):

    C = np.zeros((len(examples),embed.W.shape[1]))
    for idx, text in enumerate(examples):
        C[idx,:] = sum_vecs(embed,text)

    centroid = np.mean(C,axis=0)
    assert centroid.shape[0] == embed.W.shape[1]
    return centroid


def get_intent(embed,text):
    intents = ['deny', 'inform', 'greet']
    vec = sum_vecs(embed,text)
    scores = np.array([ np.linalg.norm(vec-data[label]["centroid"]) for label in intents ])
    return intents[np.argmin(scores)]

In [211]:
data={
  "greet": {
    "examples" : ["hello","hey there","howdy","hello","hi","hey","hey ho"],
    "centroid" : None
  },
  "inform": {
    "examples" : [
      "i'd like something asian",
      "maybe korean",
      "what mexican options do i have",
      "what italian options do i have",
      "i want korean food",
      "i want german food",
      "i want vegetarian food",
      "i would like chinese food",
      "i would like indian food",
      "what japanese options do i have",
      "korean please",
      "what about indian",
      "i want some vegan food",
      "maybe thai",
      "i'd like something vegetarian",
      "show me french restaurants",
      "show me a cool malaysian spot"
    ],
    "centroid" : None
  },
  "deny": {
    "examples" : [
      "nah",
      "any other places ?",
      "anything else",
      "no thanks"
      "not that one",
      "i do not like that place",
      "something else please",
      "no please show other options"
    ],
    "centroid" : None
  }
}
intents = ['greet', 'inform', 'deny']

In [212]:
examples = []
for intent in intents:
    examples = list(set(examples) | set(data[intent]['examples']))

In [213]:
examples

['what mexican options do i have',
 'howdy',
 'hey',
 'hello',
 'something else please',
 'i do not like that place',
 'show me a cool malaysian spot',
 'hey there',
 'anything else',
 'hey ho',
 "i'd like something asian",
 'i would like chinese food',
 'maybe korean',
 'no please show other options',
 'what japanese options do i have',
 'no thanksnot that one',
 'what about indian',
 'show me french restaurants',
 'i want german food',
 'i want korean food',
 'hi',
 'what italian options do i have',
 'i would like indian food',
 "i'd like something vegetarian",
 'i want vegetarian food',
 'maybe thai',
 'korean please',
 'any other places ?',
 'nah',
 'i want some vegan food']

In [214]:
fn = open(VOCAB_FILE, 'w')
for example in examples:
    fn.write(example)
    fn.write('\n')
fn.close()

In [215]:
embed = Embedding(VOCAB_FILE, GLOVE_INP_FN, False)

In [216]:
for label in data.keys():
    data[label]["centroid"] = get_centroid(embed,data[label]["examples"])

In [217]:
data

{'greet': {'examples': ['hello',
   'hey there',
   'howdy',
   'hello',
   'hi',
   'hey',
   'hey ho'],
  'centroid': array([-0.11797034,  0.08401492,  0.03852101,  0.00721597, -0.03044195,
         -0.16339493, -0.05345686, -0.03382873, -0.14831921,  0.09320638,
         -0.05309111,  0.13648676, -0.04040799,  0.04370132,  0.15911899,
          0.06206383, -0.05080601,  0.18281922,  0.01270968,  0.00780746,
         -0.03542216,  0.11279987,  0.11048912,  0.07726852,  0.23467205,
         -0.23460631, -0.2877324 ,  0.07160929,  0.09418186, -0.34342254,
          0.31013594,  0.13606241, -0.05841406,  0.2051403 , -0.09657553,
         -0.11910098,  0.16374698, -0.15650952,  0.05164113, -0.07241123,
         -0.0364207 , -0.0508396 , -0.06296611, -0.0796311 ,  0.19861918,
         -0.04323929, -0.01389304, -0.17603198,  0.04052346,  0.23976296])},
 'inform': {'examples': ["i'd like something asian",
   'maybe korean',
   'what mexican options do i have',
   'what italian options do i 

In [218]:
for text in ["hey you","i am looking for chinese food","not for me"]:
    print("text : '{0}', predicted_label : '{1}'".format(text,get_intent(embed,text)))

text : 'hey you', predicted_label : 'greet'
text : 'i am looking for chinese food', predicted_label : 'inform'
text : 'not for me', predicted_label : 'deny'


In [221]:
text = "how do you do"
print("text : '{0}', predicted_label : '{1}'".format(text,get_intent(embed,text)))

text : 'how do you do', predicted_label : 'deny'
