In [1]:
import nltk
from nltk import word_tokenize
from nltk.data import load

# Fetch the NLTK resources used by this notebook: the tokenizer models,
# the POS tagger model and the tagset help files.
for resource in ('punkt', 'averaged_perceptron_tagger', 'tagsets'):
  nltk.download(resource)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets.zip.


In [97]:
# POS Tagging

# Tag inventory loaded from the 'upenn_tagset' help file, plus a zeroed counter per tag.
tagdict = load('help/tagsets/upenn_tagset.pickle')
pos_tags_list = [*tagdict.keys()]
pos_dict = {tag: 0 for tag in pos_tags_list}

# Terms of interest and the sample utterances analysed throughout the notebook.
flagged = ['name', 'address', 'email', 'phone number']
conversation = ['I can address the problem.', 'What is the address?', 'What is your phone number?', 'Our phone number is 9199199199.']

# One list of (token, tag) pairs per utterance.
tags = [nltk.pos_tag(word_tokenize(utterance)) for utterance in conversation]

tags

[[('I', 'PRP'),
  ('can', 'MD'),
  ('address', 'VB'),
  ('the', 'DT'),
  ('problem', 'NN'),
  ('.', '.')],
 [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('address', 'NN'), ('?', '.')],
 [('What', 'WP'),
  ('is', 'VBZ'),
  ('your', 'PRP$'),
  ('phone', 'NN'),
  ('number', 'NN'),
  ('?', '.')],
 [('Our', 'PRP$'),
  ('phone', 'NN'),
  ('number', 'NN'),
  ('is', 'VBZ'),
  ('9199199199', 'CD'),
  ('.', '.')]]

In [11]:
# Parse Tree
from nltk import Tree

# Chunk grammar: an NP is an optional determiner, any number of adjectives
# and a singular noun; VBD and IN tokens are each wrapped in their own chunk.
pattern = """NP: {<DT>?<JJ>*<NN>}
VBD: {<VBD>}
IN: {<IN>}"""

NPChunker = nltk.RegexpParser(pattern)

for tagged_sentence in tags:
  chunk_tree = NPChunker.parse(tagged_sentence)
  # Re-read the tree from its bracketed string form before drawing it
  # (presumably so leaves are shown as "word/TAG" text -- confirm).
  bracketed = str(chunk_tree)
  printable_tree = Tree.fromstring(bracketed)
  printable_tree.pretty_print()

                 S                           
   ______________|______________              
  |     |        |              NP           
  |     |        |         _____|______       
I/PRP can/MD address/VB the/DT     problem/NN

                S                       
    ____________|__________              
   |      |     |          NP           
   |      |     |     _____|______       
What/WP is/VBZ ?/. the/DT     address/NN

                S                               
    ____________|__________________________      
   |      |     |          NP              NP   
   |      |     |     _____|_____          |     
What/WP is/VBZ ?/. the/DT     phone/NN number/NN

                       S                          
    ___________________|_____________________      
   |       |           |           NP        NP   
   |       |           |           |         |     
Our/PRP$ is/VBZ 919-xxx-xxxx/JJ phone/NN number/NN



# Stanza

Stanza is the Stanford NLP Group's official Python NLP library. It supports running a variety of accurate natural language processing tools on 60+ languages and accessing the Java Stanford CoreNLP software from Python. 

For detailed information please visit https://stanfordnlp.github.io/stanza/.

In [30]:
!pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/e7/8b/3a9e7a8d8cb14ad6afffc3983b7a7322a3a24d94ebc978a70746fcffc085/stanza-1.1.1-py3-none-any.whl (227kB)
[K     |█▍                              | 10kB 18.1MB/s eta 0:00:01[K     |██▉                             | 20kB 2.2MB/s eta 0:00:01[K     |████▎                           | 30kB 2.8MB/s eta 0:00:01[K     |█████▊                          | 40kB 3.0MB/s eta 0:00:01[K     |███████▏                        | 51kB 2.5MB/s eta 0:00:01[K     |████████▋                       | 61kB 2.8MB/s eta 0:00:01[K     |██████████                      | 71kB 3.0MB/s eta 0:00:01[K     |███████████▌                    | 81kB 3.3MB/s eta 0:00:01[K     |█████████████                   | 92kB 3.3MB/s eta 0:00:01[K     |██████████████▍                 | 102kB 3.4MB/s eta 0:00:01[K     |███████████████▉                | 112kB 3.4MB/s eta 0:00:01[K     |█████████████████▎              | 122kB 3.4MB/s eta 0:00

In [35]:
import stanza

# Fetch the English models for the neural pipeline.
stanza.download(lang='en')

# Build the default English pipeline; `nlp` is reused by the cells below.
nlp = stanza.Pipeline(lang='en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 11.4MB/s]                    
2020-10-05 10:49:03 INFO: Downloading default packages for language: en (English)...
2020-10-05 10:49:05 INFO: File exists: /root/stanza_resources/en/default.zip.
2020-10-05 10:49:11 INFO: Finished downloading models and saved to /root/stanza_resources.
2020-10-05 10:49:11 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-10-05 10:49:11 INFO: Use device: cpu
2020-10-05 10:49:11 INFO: Loading: tokenize
2020-10-05 10:49:11 INFO: Loading: pos
2020-10-05 10:49:12 INFO: Loading: lemma
2020-10-05 10:49:12 INFO: Loading: depparse
2020-10-05 10:49:13 INFO: Loading: sentiment
2020-10-05 10:49:14 INFO: Loading: ner
2020-10-05 10:49:15 

In [40]:
doc = nlp("I can address the problem. What is your address? My address is xxx")

# Print the (word, head index, relation) triples of every sentence the
# pipeline produced, instead of hard-coding sentence indices 0..2, so the
# cell keeps working if the text is split into a different number of sentences.
for sentence in doc.sentences:
  sentence.print_dependencies()

('I', 3, 'nsubj')
('can', 3, 'aux')
('address', 0, 'root')
('the', 5, 'det')
('problem', 3, 'obj')
('.', 3, 'punct')
('What', 0, 'root')
('is', 1, 'cop')
('your', 4, 'nmod:poss')
('address', 1, 'nsubj')
('?', 1, 'punct')
('My', 2, 'nmod:poss')
('address', 4, 'nsubj')
('is', 4, 'cop')
('xxx', 0, 'root')


In [98]:
# Run the pipeline over the whole conversation joined into a single text.
doc = nlp(' '.join(conversation))

# Dump the per-word annotation dicts of each sentence for inspection.
for parsed_sentence in doc.sentences:
  pos_tags = parsed_sentence.to_dict()
  print(pos_tags)


[{'id': 1, 'text': 'I', 'lemma': 'I', 'upos': 'PRON', 'xpos': 'PRP', 'feats': 'Case=Nom|Number=Sing|Person=1|PronType=Prs', 'head': 3, 'deprel': 'nsubj', 'misc': 'start_char=0|end_char=1', 'ner': 'O'}, {'id': 2, 'text': 'can', 'lemma': 'can', 'upos': 'AUX', 'xpos': 'MD', 'feats': 'VerbForm=Fin', 'head': 3, 'deprel': 'aux', 'misc': 'start_char=2|end_char=5', 'ner': 'O'}, {'id': 3, 'text': 'address', 'lemma': 'address', 'upos': 'VERB', 'xpos': 'VB', 'feats': 'VerbForm=Inf', 'head': 0, 'deprel': 'root', 'misc': 'start_char=6|end_char=13', 'ner': 'O'}, {'id': 4, 'text': 'the', 'lemma': 'the', 'upos': 'DET', 'xpos': 'DT', 'feats': 'Definite=Def|PronType=Art', 'head': 5, 'deprel': 'det', 'misc': 'start_char=14|end_char=17', 'ner': 'O'}, {'id': 5, 'text': 'problem', 'lemma': 'problem', 'upos': 'NOUN', 'xpos': 'NN', 'feats': 'Number=Sing', 'head': 3, 'deprel': 'obj', 'misc': 'start_char=18|end_char=25', 'ner': 'O'}, {'id': 6, 'text': '.', 'lemma': '.', 'upos': 'PUNCT', 'xpos': '.', 'head': 3, 

In [103]:
# Verb differentiation for 'address'
# Implement the Skills Conflicting with the Developer Specifications method [Section 4.2 in the paper[1]]

# Lower-cased tokens that are flagged when they are used as a noun.
FLAGGED_NOUNS = {'address', 'phone', 'number'}
# Lower-cased second-person forms; only counted when tagged as a possessive modifier.
USER_POSSESSIVES = {'you', 'your'}


def is_sentence_flagged(words):
  """Return 1 if a flagged noun and a user possessive both occur in the sentence, else 0.

  Args:
    words: list of per-word dicts as returned by ``parsed_sentence.to_dict()``.
      Each dict is read for its 'text', 'upos' and 'deprel' entries.

  Returns:
    int: 1 when at least one word is a flagged noun (e.g. 'address' tagged
    NOUN, not VERB) and at least one word is 'you'/'your' with relation
    'nmod:poss'; 0 otherwise.
  """
  # Compare on the lower-cased token so sentence-initial forms such as
  # "Your" or "Address" are matched as well. `.get` tolerates entries that
  # carry no 'upos' / 'deprel' key.
  flag_noun = any(
      word['text'].lower() in FLAGGED_NOUNS and word.get('upos') == 'NOUN'
      for word in words)
  flag_object = any(
      word['text'].lower() in USER_POSSESSIVES and word.get('deprel') == 'nmod:poss'
      for word in words)
  return int(flag_noun and flag_object)


# One 0/1 entry per sentence of `doc`, in sentence order.
sentence_flagged = []

for parsed_sentence in doc.sentences:
  sentence_flagged.append(is_sentence_flagged(parsed_sentence.to_dict()))
    

# References:

* https://www.usenix.org/conference/usenixsecurity20/presentation/guo
* https://corenlp.run/
* https://github.com/stanfordnlp/stanza/