In [1]:
import numpy as np
import pandas as pd
import math, string, re
import sklearn
import random
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

import spacy
# Load the spaCy English pipeline from a path relative to the working directory.
# The resulting `nlp` object is what pos_tag_word() calls to obtain POS tags.
nlp = spacy.load('lib/en_core_web_sm-2.3.1/en_core_web_sm/en_core_web_sm-2.3.1/')

import nltk
# Fetch the NLTK resources; the recorded output shows these calls only report
# "already up-to-date" when the packages are present.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# NOTE(review): word_tokenize is not referenced in the cells visible here — confirm it is still needed.
from nltk import word_tokenize

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dinuka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Dinuka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def pos_tag_word(w,l='x'):
    """Build a ``(word, POS tag, label)`` triple for one word.

    The tag is the ``tag_`` of the first token spaCy produces for ``w``, so a
    multi-word string is tagged by its first word only. ``l`` is the label to
    attach; it defaults to ``'x'``, the label used for words outside entities.
    """
    first_token = nlp(w)[0]
    return w, first_token.tag_, l

def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a CRF feature dict.

    ``sent`` is a sequence of entries whose first two items are the word and
    its POS tag. The dict holds features of the token itself, followed by the
    same kind of features for the previous and the next token when they exist.
    A missing neighbour is flagged with ``'BOS'`` (no previous token) or
    ``'EOS'`` (no next token) instead.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features of the current token. Keys are added in a fixed order because
    # that order becomes the column order when the dicts are put in a DataFrame.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label (third item) of every ``(token, postag, label)`` entry."""
    tags = []
    for _token, _postag, tag in sent:
        tags.append(tag)
    return tags

def sent2tokens(sent):
    """Return the token (first item) of every ``(token, postag, label)`` entry."""
    words = []
    for word, _postag, _label in sent:
        words.append(word)
    return words

def _entity_string_to_dict(entity_string):
    entity_extract_pattern = re.compile(r'(?P<entity>\[(?P<value>.+?)\]\((?P<name_and_synonyms>.+?)\))')
    new_string = ''
    start = 0
    output = dict()
    output['entities'] = []
    for item in re.finditer(entity_extract_pattern, entity_string):
        d = dict()
        new_string += entity_string[start:item.start()]
        start = item.start()
        d['span_start'] = len(new_string)
        new_string += item.group('value')
        d['span_end'] = len(new_string)
        start += len(item.group('entity'))
        d['entity_value'] = item.group('value')

        syn_items = item.group('name_and_synonyms').split('|')
        d['entity_type'] = syn_items[0]
        d['synonyms'] = list()
        if len(syn_items) > 1:
            d['synonyms'] += [t for t in syn_items[1:]]

        output['entities'].append(d)
    new_string += entity_string[start:]
    output['statement'] = new_string
    return output


In [3]:
# Load the annotated utterances from a CSV in the working directory. As the
# preview shows, each row has an intent `label` and an `expression` whose
# entities are written as [value](EntityType|synonym|...).
# NOTE(review): the file name is spelled "evalution"; it has to match the file
# on disk, so it is left untouched here.
data = pd.read_csv("evalution_data_set.csv")
data.head()

Unnamed: 0,label,expression
0,intent_flight_search,flight search
1,intent_flight_search,can you help me to find a flight ?
2,intent_flight_search,please help me to search for a flight
3,intent_flight_search,I want a flight from [Colombo](Departure.Airpo...
4,intent_flight_search,We need a flight from [jfk](Departure.Airport....


In [4]:
# Parse every annotated expression exactly once. The cleaned statement is then
# read back from the parsed dict instead of running the parser a second time
# for each row.
data['entity'] = data['expression'].apply(_entity_string_to_dict)
data['statement'] = data['entity'].apply(lambda parsed: parsed['statement'])
data.head()


Unnamed: 0,label,expression,entity,statement
0,intent_flight_search,flight search,"{'entities': [], 'statement': 'flight search'}",flight search
1,intent_flight_search,can you help me to find a flight ?,"{'entities': [], 'statement': 'can you help me...",can you help me to find a flight ?
2,intent_flight_search,please help me to search for a flight,"{'entities': [], 'statement': 'please help me ...",please help me to search for a flight
3,intent_flight_search,I want a flight from [Colombo](Departure.Airpo...,"{'entities': [{'span_start': 21, 'span_end': 2...",I want a flight from Colombo
4,intent_flight_search,We need a flight from [jfk](Departure.Airport....,"{'entities': [{'span_start': 22, 'span_end': 2...",We need a flight from jfk airport


In [5]:
# Convert each parsed utterance into a list of (token, POS tag, label) triples.
sent_list = []
for ent in data['entity']:
    statement = ent['statement']
    word_list = []
    last_index = 0  # end offset of the previous entity within the statement
    for entobj in ent['entities']:
        # Words between the previous entity (or the sentence start) and this
        # entity are plain words and get the default label.
        for word in statement[last_index:entobj['span_start']].split():
            word_list.append(pos_tag_word(word))
        # The entity value is kept as ONE token, even if it contains spaces,
        # and is labelled with its entity type. Same helper as for plain words.
        word_list.append(pos_tag_word(entobj['entity_value'], entobj['entity_type']))
        last_index = entobj['span_end']
    # Whatever follows the last entity (or the whole statement if there is none).
    word_list.extend(pos_tag_word(word) for word in statement[last_index:].split())
    sent_list.append(word_list)




In [6]:
# Shuffle with a fixed seed so that, after Restart & Run All, the split and all
# scores computed from it are the same on every run. A private Random instance
# is used so the global `random` state is left alone.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(sent_list)

# NOTE: despite its name, `test_per` is the fraction of sentences that goes to
# the TRAINING set (the first `n` sentences); the remainder is the test set.
test_per = 0.7
n = int(test_per * len(sent_list))
train_se = sent_list[:n]  # a slice is already a new list, no extra copy needed
test_se = sent_list[n:]

# Show one tagged sentence as a sanity check of the token/POS/label triples.
tokenized_data = pd.DataFrame(sent_list[50], columns=['Token', 'POS', 'Label'])
tokenized_data

Unnamed: 0,Token,POS,Label
0,I'm,PRP,x
1,leaving,VBG,x
2,from,IN,x
3,LAX,NNP,Departure.Airport.AirportCode
4,to,IN,x
5,colombo,NNP,Arrival.Airport.AirportCode
6,on,IN,x
7,flight,NN,x
8,581,CD,FlightNumber


In [7]:
# Flatten all entities into one table, one row per entity, with the cleaned
# sentence it came from. Each row is a NEW dict: the entity dicts stored in
# data['entity'] are left unmodified (a Series.copy() would still share them).
entity = [
    {**en, 'sentence': parsed['statement']}
    for parsed in data['entity']
    for en in parsed['entities']
]

newdf = pd.DataFrame(entity)
newdf

Unnamed: 0,span_start,span_end,entity_value,entity_type,synonyms,sentence
0,21,28,Colombo,Departure.Airport.AirportCode,"[cmb, colombo, CMB]",I want a flight from Colombo
1,22,25,jfk,Departure.Airport.AirportCode,"[JFK, John F. Kennedy]",We need a flight from jfk airport
2,30,37,Calgary,Departure.Airport.AirportCode,"[calgary, yyc, YYC]",can you find me a flight from Calgary
3,19,26,colombo,Arrival.Airport.AirportCode,"[CMB, Colombo, cmb]",I want a flight to colombo
4,27,32,Dubai,Arrival.Airport.AirportCode,"[dubai, DXB, dxb]",please find me a flight to Dubai
...,...,...,...,...,...,...
114,53,61,Heathrow,Arrival.Airport.AirportCode,"[heathrow, LHR, lhr]",on next friday I'm taking flight 874 from Calg...
115,17,20,234,FlightNumber,"[600, 666, 923, 234]",we are on flight 234 from jfk airport to Calga...
116,26,29,jfk,Departure.Airport.AirportCode,"[JFK, John F. Kennedy]",we are on flight 234 from jfk airport to Calga...
117,41,48,Calgary,Arrival.Airport.AirportCode,"[calgary, yyc, YYC]",we are on flight 234 from jfk airport to Calga...


In [8]:
# Per sentence: a list of per-token feature dicts (X) and the matching list of
# per-token labels (y), for the training and the test split.
X_train = list(map(sent2features, train_se))
y_train = list(map(sent2labels, train_se))

X_test = list(map(sent2features, test_se))
y_test = list(map(sent2labels, test_se))


In [9]:
# Model settings gathered in one place so they are easy to spot and tune.
# c1 / c2 are the L1 / L2 regularisation coefficients (see the sklearn-crfsuite docs).
crf_settings = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_settings)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [10]:
# Entity labels only: 'x' is the label given to words outside any entity, and
# it would dominate the scores if it were included. Filtering (instead of
# list.remove) also works when 'x' is not among the learned classes.
labels = [label for label in crf.classes_ if label != 'x']
labels

['Departure.Airport.AirportCode',
 'Arrival.Airport.AirportCode',
 'FlightNumber',
 'Departure.EstimatedDate']

In [11]:
# Predict on the held-out sentences and score them. The figure is the
# support-weighted F1 over the entity labels (the 'x' label is excluded through
# `labels`), so it is reported as F1, not as accuracy.
y_pred = crf.predict(X_test)
print("Weighted F1 : {}".format(metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)))

Accuracy : 0.9707524985302763


In [12]:
# Per-label precision / recall / F1 on the held-out TEST sentences
# (y_test vs. the predictions made for X_test).
print('Test set classification report: \n\n{}'.format(metrics.flat_classification_report(
    y_test, y_pred, labels=labels, digits=3
)))

Train set classification report: 

                               precision    recall  f1-score   support

Departure.Airport.AirportCode      0.929     1.000     0.963        13
  Arrival.Airport.AirportCode      1.000     0.750     0.857         4
                 FlightNumber      1.000     1.000     1.000         6
      Departure.EstimatedDate      1.000     1.000     1.000        13

                    micro avg      0.972     0.972     0.972        36
                    macro avg      0.982     0.938     0.955        36
                 weighted avg      0.974     0.972     0.971        36





In [13]:
# A hand-written sentence to try the model on. "next monday" is passed as one
# token, so it is tagged as a single unit. The label slot is left empty
# because the labels are what is being predicted.
demo_words = ["we","are","planning","to","leave","next monday","on","flight","981"]
pred_data = [pos_tag_word(word, '') for word in demo_words]
pd.DataFrame(pred_data)

Unnamed: 0,0,1,2
0,we,PRP,
1,are,VBP,
2,planning,NN,
3,to,IN,
4,leave,VB,
5,next monday,JJ,
6,on,IN,
7,flight,NN,
8,981,CD,


In [14]:

# crf.predict expects a list of sentences, so the single demo sentence is
# wrapped in a one-element list.
feature_arr = [sent2features(pred_data)]
pd.DataFrame(feature_arr[0])


Unnamed: 0,bias,word.lower(),word[-3:],word[-2:],word.isupper(),word.istitle(),word.isdigit(),postag,postag[:2],BOS,...,+1:word.istitle(),+1:word.isupper(),+1:postag,+1:postag[:2],-1:word.lower(),-1:word.istitle(),-1:word.isupper(),-1:postag,-1:postag[:2],EOS
0,1.0,we,we,we,False,False,False,PRP,PR,True,...,False,False,VBP,VB,,,,,,
1,1.0,are,are,re,False,False,False,VBP,VB,,...,False,False,NN,NN,we,False,False,PRP,PR,
2,1.0,planning,ing,ng,False,False,False,NN,NN,,...,False,False,IN,IN,are,False,False,VBP,VB,
3,1.0,to,to,to,False,False,False,IN,IN,,...,False,False,VB,VB,planning,False,False,NN,NN,
4,1.0,leave,ave,ve,False,False,False,VB,VB,,...,False,False,JJ,JJ,to,False,False,IN,IN,
5,1.0,next monday,day,ay,False,False,False,JJ,JJ,,...,False,False,IN,IN,leave,False,False,VB,VB,
6,1.0,on,on,on,False,False,False,IN,IN,,...,False,False,NN,NN,next monday,False,False,JJ,JJ,
7,1.0,flight,ght,ht,False,False,False,NN,NN,,...,False,False,CD,CD,on,False,False,IN,IN,
8,1.0,981,981,81,False,False,True,CD,CD,,...,,,,,flight,False,False,NN,NN,True


In [15]:
# Tag the demo sentence; the result has one label list per input sentence.
# NOTE(review): this overwrites the `y_pred` computed for the test set, so the
# evaluation cells must not be re-run after this one without re-predicting.
y_pred = crf.predict(feature_arr)
print(y_pred)

[['x', 'x', 'x', 'x', 'x', 'Departure.EstimatedDate', 'x', 'x', 'FlightNumber']]
