In [43]:
import spacy 
import pandas as pd 
import utils

from collections import Counter 
from IPython.display import display
from spacy.matcher import Matcher

# auto load the changes of referenced codes
%load_ext autoreload
%autoreload 2

# ebablbe auto-completion
%config Completer.use_jedi = False

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data

In [7]:
df_atis = pd.read_csv("./Chapter06/data/atis_intents.csv", header=None)
df_atis.columns = ['intent', 'text']

In [35]:
print('-'*50)
print(len(df_atis))

print('-'*50)
display(df_atis.head(5))

print('-'*50)
display(df_atis.describe().T)

print('-'*50)
df_atis.intent.unique()

print('-'*50)
df_atis.info()

--------------------------------------------------
4978
--------------------------------------------------


Unnamed: 0,intent,text
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...


--------------------------------------------------


Unnamed: 0,count,unique,top,freq
intent,4978,22,atis_flight,3666
text,4978,4634,what is fare code h,8


--------------------------------------------------
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4978 entries, 0 to 4977
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   intent  4978 non-null   object
 1   text    4978 non-null   object
dtypes: object(2)
memory usage: 77.9+ KB


查看数据内容。可以看到：

- 第一个用户说想定某个时间，从某地到某地ide航班？
- 第二个用户同第一个用户
- 第三个用户想问航班的到达时间。
- 第三根用户想问从某地到某地最便宜的机票。
- 第五个用户问从某地到某地低于1000美元的航班。

In [13]:
print(*list(df_atis.head().text), sep='\n')

 i want to fly from boston at 838 am and arrive in denver at 1110 in the morning
 what flights are available from pittsburgh to baltimore on thursday morning
 what is the arrival time in san francisco for the 755 am flight leaving washington
 cheapest airfare from tacoma to orlando
 round trip fares from pittsburgh to philadelphia under 1000 dollars


查看意图的分布。

In [36]:
df_grouped = df_atis.groupby(['intent']).size()
df_grouped

intent
atis_abbreviation                            147
atis_aircraft                                 81
atis_aircraft#atis_flight#atis_flight_no       1
atis_airfare                                 423
atis_airfare#atis_flight_time                  1
atis_airline                                 157
atis_airline#atis_flight_no                    2
atis_airport                                  20
atis_capacity                                 16
atis_cheapest                                  1
atis_city                                     19
atis_distance                                 20
atis_flight                                 3666
atis_flight#atis_airfare                      21
atis_flight_no                                12
atis_flight_time                              54
atis_ground_fare                              18
atis_ground_service                          255
atis_ground_service#atis_ground_fare           1
atis_meal                                      6
atis_quantity

在atis_abbreviation这种意图中，用户询问关于各种缩写的含义。

In [24]:
df_filter = df_atis.loc[(df_atis.intent=='atis_abbreviation')]
print(*list(df_filter.sample(10).text), sep='\n')

 what is the ap57 restriction
 what is ua
 what does nw stand for
 what are fare codes qw and qx
 what is sa
 what is the yn code
 what is fare code f
 what does ap57 mean
 what is fare code f
 what is fare code h


## Extracting named entities

需要注意的是，在书中使用如下代码获取文本（相当于df_atis的text列）

~~~shell
awk -F ',' '{print $2}' data/atis_intents.csv  > data/atis_utterances.txt
~~~

In [27]:
nlp  = spacy.load("en_core_web_md") 

corpus = open("Chapter06/data/atis_utterances.txt", "r").read().split("\n") 

all_ent_labels = [] 
for sentence in corpus: 
    doc = nlp(sentence.strip()) 
    ents = doc.ents 
    all_ent_labels += [ent.label_ for ent in ents] 

c = Counter(all_ent_labels) 
print(c) 

Counter({'GPE': 9219, 'DATE': 1454, 'TIME': 925, 'ORG': 425, 'CARDINAL': 275, 'ORDINAL': 201, 'FAC': 63, 'NORP': 60, 'MONEY': 52, 'PERSON': 17, 'PRODUCT': 14, 'LOC': 6, 'EVENT': 3})


上面的代码运行起来有些慢。采用如下代码快好几倍。

In [39]:
texts = list(df_atis.text)
texts = [text.strip() for text in texts]
docs = nlp.pipe(texts)

all_ent_labels = [] 
for doc in docs: 
    ents = doc.ents 
    all_ent_labels += [ent.label_ for ent in ents] 

c = Counter(all_ent_labels) 
print(c) 

Counter({'GPE': 9219, 'DATE': 1454, 'TIME': 925, 'ORG': 425, 'CARDINAL': 275, 'ORDINAL': 201, 'FAC': 63, 'NORP': 60, 'MONEY': 52, 'PERSON': 17, 'PRODUCT': 14, 'LOC': 6, 'EVENT': 3})


### Extracting named entities with Matcher
抽取的内容如下

In [63]:
def show_matches(nlp, doc, matches):
    for match_id, start, end in matches:
        pattern_name = nlp.vocab.strings[match_id]
        m_span = doc[start:end]  
        print(pattern_name, start, end, m_span.text)  
        
# 起始地址
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "ADP"}, {"ENT_TYPE": "GPE"}]
matcher.add("prepositionLocation", [pattern])

# 航班信息
pattern = [{"ENT_TYPE": "ORG", "OP": "+"}]
matcher.add("AirlineName", [pattern])

# 日期和时间
pattern = [{"ENT_TYPE":  {"IN": ["DATE", "TIME"]}}]
matcher.add("Datetime", [pattern])

# 缩写abbreviation
pattern1 = [{"TEXT": {"REGEX": "\w{1,2}\d{1,2}"}}]
pattern2 = [{"SHAPE": { "IN": ["x", "xx"]}}, {"SHAPE": {"IN": ["d", "dd"]}}]
pattern3 = [{"TEXT": {"IN": ["class", "code", "abbrev", "abbreviation"]}}, {"SHAPE": { "IN": ["x", "xx"]}}]
pattern4 = [{"POS": "NOUN", "SHAPE": { "IN": ["x", "xx"]}}]
matcher.add("abbrevEntities", [pattern1, pattern2, pattern3, pattern4])

texts = ["show me flights from denver to boston on tuesday",
         "i'm looking for a flight that goes from ontario to westchester and stops in chicago",
         "what flights arrive in chicago on sunday on continental",
         "yes i'd like a flight from long beach to st. louis by way of dallas",
         "what are the evening flights flying out of dalas",
         "what is the earliest united airlines flight flying from denver",
         'what does restriction ap 57 mean',
         'what does the abbreviation co mean',
         'what does fare code qo mean',
         'what is the abbreviation d10',
         'what does code y mean',
         'what does the fare code f and fn mean',
         'what is booking class c']
docs = nlp.pipe(texts)

for doc in docs:
    print('-'*50)
    spacy.displacy.render(doc, style='ent')
#     print([(ent.text, ent.label_) for ent in doc.ents])
    matches = matcher(doc)
    show_matches(nlp, doc, matches)

--------------------------------------------------


prepositionLocation 3 5 from denver
prepositionLocation 5 7 to boston
Datetime 8 9 tuesday
--------------------------------------------------


prepositionLocation 8 10 from ontario
prepositionLocation 14 16 in chicago
--------------------------------------------------


prepositionLocation 3 5 in chicago
Datetime 6 7 sunday
--------------------------------------------------


prepositionLocation 9 11 to st
prepositionLocation 15 17 of dallas
--------------------------------------------------


--------------------------------------------------


AirlineName 4 5 united
AirlineName 4 6 united airlines
AirlineName 5 6 airlines
prepositionLocation 8 10 from denver
--------------------------------------------------


abbrevEntities 3 5 ap 57
abbrevEntities 4 5 57
--------------------------------------------------


abbrevEntities 3 5 abbreviation co
abbrevEntities 4 5 co
--------------------------------------------------


abbrevEntities 3 5 code qo
abbrevEntities 4 5 qo
--------------------------------------------------


abbrevEntities 4 5 d10
--------------------------------------------------


abbrevEntities 2 4 code y
--------------------------------------------------


abbrevEntities 4 6 code f
abbrevEntities 7 8 fn
--------------------------------------------------


abbrevEntities 3 5 class c
abbrevEntities 4 5 c


### Using dependency trees for extracting entities

~~~
I want to fly to Munich tomorrow. 
~~~

对于上面的语句，上节的抽取方法可以抽取到destination city，但是如果是下面的语句，就无能为了。

~~~
I'm going to a conference in Munich. I need an air ticket.
My sister's wedding will be held in Munich. I'd like to book a flight
I want to book a flight to my conference without stopping at Berlin.
~~~

本节将尝试使用dependency trees来解析他们之间的关系。

In [67]:
texts = ["I'm going to a conference in Munich. I need an air ticket.",
         "My sister's wedding will be held in Munich. I'd like to book a flight",
         "I want to book a flight to my conference without stopping at Berlin."]

def reach_parent(source_token, dest_token):
    source_token = source_token.head
    while source_token != dest_token:
        if source_token.head == source_token:
            return None
        source_token = source_token.head
    return source_token

doc = nlp("I'm going to a conference in Munich.")
source_token = reach_parent(doc[-2], doc[3])
print(doc[-2], doc[3], source_token)

Munich to to


## Using dependency relations for intent recognition
### Linguistic primer

ATIS数据中包含了多个意图（intent），比如：

- book a flight
- purchase a meal on their already booked flight
- cancel their flight

可以发现这些意图可以表示为：动词（verb） + 对象（object）。在本节中将使用这个规则，将抽取：
- 及物动词/非及物动词（transitive/intransitive verbs）
- 直接宾语/间接宾语（direct/indirect objects）

下面是首先来理解及物动词/非及物动词。

- 及物动词
    ~~~
    I bought flowers.
    He loved his cat.
    He borrowed my book.
    ~~~

- 非及物动词
    ~~~
    Yesterday I slept for 8 hours.
    The cat ran towards me.
    When I went out, the sun was shining.
    Her cat died 3 days ago.
    ~~~
    
再来看直接宾语/间接宾语。

- 直接宾语
    ~~~
    I bought flowers.  I bought what? - flowers
    He loved his cat.  He loved who?  - his cat
    He borrowed my book. He borrowed what? - my book
    ~~~

- 间接宾语
    ~~~
    He gave me his book.  He gave his book to whom?  - me
    He gave his book to me. He gave his book to whom? -me
    ~~~
    
下面代码将展示直接宾语和间接宾语。更多相关知识，可以参见这本书[Linguistic Fundamentals for Natural Language Processing: 100 Essentials from Morphology and Syntax](https://dl.acm.org/doi/book/10.5555/2534456)。

- 直接宾语： 用dobj表示。
- 间接宾语： 用dative表示。

In [69]:
doc = nlp("He gave me his book.")
spacy.displacy.render(doc, style='dep')

doc = nlp("He gave his book to me. ")
spacy.displacy.render(doc, style='dep')

### Extracting transitive verbs and their direct objects

通过抽取transitive verbs和direct objects，我们可以识别用户意图（intent）。比如:

In [72]:
doc = nlp("find a flight from washington to sf")
spacy.displacy.render(doc, style='dep')
for token in doc:
    if token.dep_ == "dobj":
        print(token.head.text + token.text.capitalize())

findFlight


### Extracting multiple intents with conjunction relation

有些用户场景，覆盖了多个用户意图。比如：

~~~
show all flights and fares from denver to san francisco
~~~

使用token.conjuncts可以返回coordinated tokens。这样我们不难发现用户的意图是showFlights和showFares.

In [75]:
doc = nlp("show all flights and fares from denver to san francisco")
spacy.displacy.render(doc, style='dep')
for token in doc:
    if token.dep_ == "dobj":
        dobj = token.text
        conj = [t.text for t in token.conjuncts]
        verb = token.head
print(verb, dobj, conj)

show flights ['fares']


### Recognizing the intent using wordlists

In [91]:
doc = nlp("i want to make a reservation for a flight")
spacy.displacy.render(doc, style='dep')

dObj =None
tVerb = None

# Extract the direct object and its transitive verb
for token in doc:
    if token.dep_ == "dobj":
        dObj = token
        tVerb = token.head 

print(f'dObj: {dObj}')
print(f'tVerb: {tVerb}')
        
# Extract the helper verb
intentVerb = None
verbList = ["want", "like", "need", "order"]
if tVerb.text in verbList:
    intentVerb = tVerb
else:
    if tVerb.head.dep_ == "ROOT":
        intentVerb = tVerb.head
        
# Extract the object of the intent
intentObj = None
objList = ["flight", "meal", "booking"]
if dObj.text in objList:
    intentObj = dObj
else:
    for child in tVerb.children:
        if child.dep_ == "prep":
            intentObj = list(child.children)[0]
            break
        elif child.dep_ == "compound":
            intentObj = child
            break
            
print(intentVerb.text + intentObj.text.capitalize())            

dObj: reservation
tVerb: make
wantFlight


## Semantic similarity methods for semantic parsing

一般来说，有两种方式来识别语法的相似性。

- 使用同义词字典（synonyms dictionary）
- 使用基于词向量的语义相似度

本节将尝试这两种方法。

### Using synonyms lists for semantic similarity

In [102]:

verbSynsets = {
    "show": ["list"],
    "book": ["make a reservation", "buy", "reserve"]
} 

objSynsets = {
    "meal": ["food"],
    "plane": ["aircraft", "airplane"]
}    


def get_vert_object(doc):
    for token in doc:
        if token.dep_ == "dobj":
            obj = token.lemma_
            verb = token.head.lemma_
            break    
    return verb, obj

def synonym_replace(verb, obj, 
                    verbSynsets=verbSynsets, 
                    objSynsets=objSynsets):
    for key, synonyms in verbSynsets.items():
        if verb in synonyms:
            verb = key
            break
                
    for key, synonyms in objSynsets.items():
        if obj in synonyms:
            obj = key
            break                
        
    return verb, obj
    


doc1 = nlp("show me all aircrafts that cp uses")
doc2 = nlp("list all meals on my flight")

verb1, obj1 = get_vert_object(doc1)
verb2, obj2 = get_vert_object(doc2)

print(verb1+obj1.capitalize())
print(verb2+obj2.capitalize())

synonym_verb1, synonym_obj1 = synonym_replace(verb1, obj1)
synonym_verb2, synonym_obj2 = synonym_replace(verb2, obj2)

print('-'*50)
print(verb1+obj1.capitalize())
print(verb2+obj2.capitalize())

showAircraft
listMeal
--------------------------------------------------
showAircraft
listMeal


### Using word vectors to recognize semantic similarity

从下面的相似度，结果可以认定，这两个语句是统一场景的可能性非常低。

In [108]:
def get_vert_object(doc):
    for token in doc:
        if token.dep_ == "dobj":
            obj = token
            verb = token.head
            break    
    return verb, obj

verb1, obj1 = get_vert_object(doc1)
verb2, obj2 = get_vert_object(doc2)

print(obj1.similarity(obj2))
print(verb1.similarity(verb2))

0.15025872
0.33161193


## Putting it all together

In [117]:
matcher = Matcher(nlp.vocab)

doc = nlp("show me flights from denver to philadelphia on tuesday")
ents = doc.ents
print(f'ents = {ents}')

# 介词 + 地点
print('-'*50)
pattern = [{"POS": "ADP"}, {"ENT_TYPE": "GPE"}]
matcher.add("prepositionLocation", [pattern])
matches = matcher(doc)
show_matches(nlp, doc, matches)

# 直接宾语
print('-'*50)
for token in doc:
    if token.dep_ == "dobj":
        print(token.head.lemma_ + token.lemma_.capitalize())

ents = (denver, philadelphia, tuesday)
--------------------------------------------------
prepositionLocation 3 5 from denver
prepositionLocation 5 7 to philadelphia
--------------------------------------------------
showFlight
