In [1]:
import re

pattern = "do you remember .*"
message = "do you remember eating apples"

match = re.search(pattern, message)
if match:
  print("Matched")

Matched


In [2]:
pattern = "if (.*)"
message = "what would happen if bots take over the world"

match = re.search(pattern,message)
print(match.group(0))
print(match.group(1))

if bots take over the world
bots take over the world


In [3]:
import re
print(re.search(r"\b(hi|hello|hye)\b","hye there!") is not None)
print(re.search(r"(hi|hello|hye)","which one") is not None)
print(re.search(r"\b(hi|hello|hye)\b","which one") is not None)

True
True
False


In [4]:
pattern = re.compile('[A-Z]{1}[a-z]*')
message = "Marry is my Friend at Oxford"
pattern.findall(message)

['Marry', 'Friend', 'Oxford']

Intent Identification

In [5]:
keywords = {'goodbye': ['bye', 'farewell'],
 'greet': ['hello', 'hi', 'hey'],
 'thankyou': ['thank', 'thx']}

patterns = {}

for intent, keys in keywords.items():
  patterns[intent] = re.compile('|'.join(keys))

print(patterns)

{'goodbye': re.compile('bye|farewell'), 'greet': re.compile('hello|hi|hey'), 'thankyou': re.compile('thank|thx')}


In [6]:
responses = {'default': 'default message',
 'goodbye': 'goodbye for now',
 'greet': 'Hello you! :)',
 'thankyou': 'you are very welcome'}
 
def match_intent(message):
  matched_intent = None
  for intent, pattern in patterns.items():
    if pattern.search(message):
      matched_intent=intent
  return matched_intent

In [7]:
def respond(message):
  intent = match_intent(message)

  key = 'default'
  if intent in responses:
    key=intent
  return(responses[key])

In [8]:
bot_template = "BOT : {0}"
user_template = "USER : {0}"

def send_message(message):
  print(user_template.format(message))
  response = respond(message)
  print(bot_template.format(response))

# Send messages
send_message("hello!")
send_message("bye byeee")
send_message("thanks very much!")
send_message("yoo")

USER : hello!
BOT : Hello you! :)
USER : bye byeee
BOT : goodbye for now
USER : thanks very much!
BOT : you are very welcome
USER : yoo
BOT : default message


Entity Identification

In [9]:
def find_name(message):
  name=None

  name_keyword = re.compile(r"name|call")

  name_pattern = re.compile(r"[A-Z]{1}[a-z]*")

  if name_keyword.search(message):
    name_words = name_pattern.findall(message)
    if len(name_words)>0:
      name = ' '.join(name_words)
  return name

def respond(message):
  name = find_name(message)
  if name is None:
    return("Hi there!")
  else:
    return("Hello {0}".format(name))

send_message("my name is David Copperfield")
send_message("call me Ishmael")
send_message("People call me Cassandra")

USER : my name is David Copperfield
BOT : Hello David Copperfield
USER : call me Ishmael
BOT : Hello Ishmael
USER : People call me Cassandra
BOT : Hello People Cassandra


Word Vectors: Spacy

In [10]:
import spacy
nlp = spacy.load('en')

doc = nlp("Hye How can I help you")
for token in doc:
  print("{} : {}".format(token,token.vector[:3]))
print(nlp.vocab.vectors_length)

Hye : [ 1.04217    -2.4582524  -0.84306914]
How : [ 1.9784399  -0.69105196  2.3142889 ]
can : [ 1.3584795  1.1       -4.645312 ]
I : [ 1.4846243  3.2576845 -2.4165175]
help : [-0.5243782   3.151106    0.34150904]
you : [0.6228677 1.830877  1.1961658]
0


Cosine similatrity

In [11]:
doc = nlp('cat')
print(doc.similarity(nlp('can')))
print(doc.similarity(nlp('dog')))

0.2550576840046807
0.6549556828973659


  "__main__", mod_spec)
  "__main__", mod_spec)


Atis Dataset for intent identification

In [12]:
!wget -nc https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/standard_format/rasa/train.json -P data
!wget -nc https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/standard_format/rasa/test.json -P data

--2021-01-28 14:59:03--  https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/standard_format/rasa/train.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4586495 (4.4M) [text/plain]
Saving to: ‘data/train.json’


2021-01-28 14:59:04 (27.2 MB/s) - ‘data/train.json’ saved [4586495/4586495]

--2021-01-28 14:59:04--  https://raw.githubusercontent.com/howl-anderson/ATIS_dataset/master/data/standard_format/rasa/test.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 788180 (770K) [text/plain]
Saving to: ‘data/test.json’


In [13]:
import json

with open("data/train.json") as f:
  train_raw=json.load(f)
print(train_raw)

{'rasa_nlu_data': {'common_examples': [{'text': 'i want to fly from boston at 838 am and arrive in denver at 1110 in the morning', 'intent': 'flight', 'entities': [{'start': 19, 'end': 25, 'value': 'boston', 'entity': 'fromloc.city_name'}, {'start': 29, 'end': 35, 'value': '838 am', 'entity': 'depart_time.time'}, {'start': 50, 'end': 56, 'value': 'denver', 'entity': 'toloc.city_name'}, {'start': 60, 'end': 64, 'value': '1110', 'entity': 'arrive_time.time'}, {'start': 72, 'end': 79, 'value': 'morning', 'entity': 'arrive_time.period_of_day'}]}, {'text': 'what flights are available from pittsburgh to baltimore on thursday morning', 'intent': 'flight', 'entities': [{'start': 32, 'end': 42, 'value': 'pittsburgh', 'entity': 'fromloc.city_name'}, {'start': 46, 'end': 55, 'value': 'baltimore', 'entity': 'toloc.city_name'}, {'start': 59, 'end': 67, 'value': 'thursday', 'entity': 'depart_date.day_name'}, {'start': 68, 'end': 75, 'value': 'morning', 'entity': 'depart_time.period_of_day'}]}, {'tex

In [14]:
import pandas as pd
train_df = pd.DataFrame.from_dict(train_raw['rasa_nlu_data']['common_examples'])
print(train_df)

                                                   text  ...                                           entities
0     i want to fly from boston at 838 am and arrive...  ...  [{'start': 19, 'end': 25, 'value': 'boston', '...
1     what flights are available from pittsburgh to ...  ...  [{'start': 32, 'end': 42, 'value': 'pittsburgh...
2     what is the arrival time in san francisco for ...  ...  [{'start': 12, 'end': 24, 'value': 'arrival ti...
3               cheapest airfare from tacoma to orlando  ...  [{'start': 1, 'end': 9, 'value': 'cheapest', '...
4     round trip fares from pittsburgh to philadelph...  ...  [{'start': 1, 'end': 11, 'value': 'round trip'...
...                                                 ...  ...                                                ...
4973  what is the airfare for flights from denver to...  ...  [{'start': 37, 'end': 43, 'value': 'denver', '...
4974  do you have any flights from denver to baltimo...  ...  [{'start': 29, 'end': 35, 'value': 'denver

In [15]:
import json

with open("data/test.json") as f:
  test_raw=json.load(f)
print(test_raw)

{'rasa_nlu_data': {'common_examples': [{'text': 'i would like to find a flight from charlotte to las vegas that makes a stop in st. louis', 'intent': 'flight', 'entities': [{'start': 35, 'end': 44, 'value': 'charlotte', 'entity': 'fromloc.city_name'}, {'start': 48, 'end': 57, 'value': 'las vegas', 'entity': 'toloc.city_name'}, {'start': 79, 'end': 88, 'value': 'st. louis', 'entity': 'stoploc.city_name'}]}, {'text': 'on april first i need a ticket from tacoma to san jose departing before 7 am', 'intent': 'airfare', 'entities': [{'start': 3, 'end': 8, 'value': 'april', 'entity': 'depart_date.month_name'}, {'start': 9, 'end': 14, 'value': 'first', 'entity': 'depart_date.day_number'}, {'start': 36, 'end': 42, 'value': 'tacoma', 'entity': 'fromloc.city_name'}, {'start': 46, 'end': 54, 'value': 'san jose', 'entity': 'toloc.city_name'}, {'start': 65, 'end': 71, 'value': 'before', 'entity': 'depart_time.time_relative'}, {'start': 72, 'end': 76, 'value': '7 am', 'entity': 'depart_time.time'}]},

In [16]:
test_df = pd.DataFrame.from_dict(test_raw['rasa_nlu_data']['common_examples'])
print(test_df)

                                                  text  ...                                           entities
0    i would like to find a flight from charlotte t...  ...  [{'start': 35, 'end': 44, 'value': 'charlotte'...
1    on april first i need a ticket from tacoma to ...  ...  [{'start': 3, 'end': 8, 'value': 'april', 'ent...
2    on april first i need a flight going from phoe...  ...  [{'start': 3, 'end': 8, 'value': 'april', 'ent...
3    i would like a flight traveling one way from p...  ...  [{'start': 32, 'end': 39, 'value': 'one way', ...
4    i would like a flight from orlando to salt lak...  ...  [{'start': 27, 'end': 34, 'value': 'orlando', ...
..                                                 ...  ...                                                ...
888  please find all the flights from cincinnati to...  ...  [{'start': 33, 'end': 43, 'value': 'cincinnati...
889  find me a flight from cincinnati to any airpor...  ...  [{'start': 22, 'end': 32, 'value': 'cincinnati...
8

In [17]:
import numpy as np

X_train_shape = (len(train_df.text),96)
X_train = np.zeros(X_train_shape)
i=0
for sentence in train_df.text:
  X_train[i,:]=nlp(sentence).vector
  i=i+1

In [18]:
X_test_shape = (len(test_df.text),96)
X_test = np.zeros(X_test_shape)
i=0
for sentence in test_df.text:
  X_test[i,:]=nlp(sentence).vector
  i=i+1

Cosine similarity

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
test_message = "I would like to find a flight time from India to America"
test_x = nlp(test_message).vector

scores = [cosine_similarity(X_train[i,:].reshape(1,-1),test_x.reshape(1,-1)) for i in range(len(X_train))]

train_df.intent[np.argmax(scores)]

'flight'

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
test_message = "What is the arrival time in San francisco"
test_x = nlp(test_message).vector

scores = [cosine_similarity(X_train[i,:].reshape(1,-1),test_x.reshape(1,-1)) for i in range(len(X_train))]

train_df.intent[np.argmax(scores)]

'flight_time'

SVC

In [21]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
svc = SVC()
svc.fit(X_train,train_df.intent)
y_pred = svc.predict(X_test)
print(accuracy_score(y_pred,test_df.intent))

0.8521836506159015


Named Entity Recognition using Spacy

In [22]:
import spacy
nlp = spacy.load('en')
doc = nlp('My friend John worked at Google since 2004')
for ent in doc.ents:
  print(ent.text,ent.label_)

John PERSON
Google ORG
2004 DATE


In [23]:
include_entities = ['DATE', 'ORG', 'PERSON']

def extract_entities(message):
  ents = dict.fromkeys(include_entities)

  doc = nlp(message)
  for ent in doc.ents:
    if ent.label_ in include_entities:
      ents[ent.label_]=' & '.join(filter(None,[ents[ent.label_],ent.text]))
  return ents

print(extract_entities('friends called Mary who have worked at Google since 2010'))
print(extract_entities('people who graduated from MIT in 1999 are Mary and John'))

{'DATE': '2010', 'ORG': 'Google', 'PERSON': 'Mary'}
{'DATE': '1999', 'ORG': 'MIT', 'PERSON': 'Mary & John'}


Entity Extraction

In [24]:
doc = nlp('a flight to India from America')
India, America = doc[3],doc[5]
print("India: ",list(India.ancestors))
print("America: ",list(America.ancestors))

India:  [to, flight]
America:  [from, flight]


In [25]:
doc = nlp('a flight from America to India')
India, America = doc[5],doc[3]
print("India: ",list(India.ancestors))
print("America: ",list(America.ancestors))

India:  [to, flight]
America:  [from, flight]


In [26]:
doc = nlp("lets see that jacket in red and some blue jeans")

colors = [doc[5],doc[8]]
items = [doc[3],doc[9]]

for color in colors:
  for tok in color.ancestors:
    if tok in items:
      print("color {} belongs to item {}".format(color,tok))
      break

color red belongs to item jacket
color blue belongs to item jeans
