# Capstone Project
### Chatbot for Airbnb Messages to Common Queries






#### 4. NLP and Modelling

#pip install nltk
#pip install sklearn
#nltk.download()



In [6]:
import pandas as pd

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle
import joblib

In [7]:
questions = pd.read_csv('questionClassified.txt', delimiter='\t', encoding="utf-8")

In [8]:
X = questions.body
y = questions.classification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)

In [9]:
#FINAL MODEL

sgd = Pipeline([('vect', CountVectorizer(tokenizer=LemmaTokenizer())),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5)),
               ])

model = sgd.fit(X, y)

# save the model to disk
filename = 'finalized_model.sav'
joblib.dump(model, filename)


['finalized_model.sav']

In [10]:
nb = Pipeline([('vect', CountVectorizer(tokenizer=LemmaTokenizer())),
#               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.23690205011389523
                         precision    recall  f1-score   support

           access issue       0.00      0.00      0.00         6
                address       0.00      0.00      0.00         8
    airbnb policy japan       0.00      0.00      0.00         8
             ammenities       0.00      0.00      0.00         9
             appliances       0.00      0.00      0.00        10
                   beds       0.00      0.00      0.00         6
    building directions       0.00      0.00      0.00         4
           cancellation       0.00      0.00      0.00         3
       check in process       0.13      1.00      0.23        31
      check out process       0.00      0.00      0.00         4
                chinese       0.00      0.00      0.00         2
               cleaning       0.00      0.00      0.00         1
         cleaning issue       0.00      0.00      0.00         1
   confirmation request       0.00      0.00      0.00      

  'precision', 'predicted', average, warn_for)


In [16]:
sgd = Pipeline([('vect', CountVectorizer(tokenizer=LemmaTokenizer())),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6)),
               ])

sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.5603644646924829
                         precision    recall  f1-score   support

           access issue       0.00      0.00      0.00         6
                address       0.83      0.62      0.71         8
    airbnb policy japan       0.50      0.38      0.43         8
             ammenities       0.50      0.78      0.61         9
             appliances       0.75      0.30      0.43        10
                   beds       0.80      0.67      0.73         6
    building directions       0.00      0.00      0.00         4
           cancellation       1.00      1.00      1.00         3
       check in process       0.44      0.77      0.56        31
      check out process       0.00      0.00      0.00         4
                chinese       1.00      0.50      0.67         2
               cleaning       0.00      0.00      0.00         1
         cleaning issue       0.00      0.00      0.00         1
   confirmation request       0.43      0.27      0.33       

In [17]:
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=300)),
               ])
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))



accuracy 0.530751708428246
                         precision    recall  f1-score   support

           access issue       0.40      0.33      0.36         6
                address       1.00      0.50      0.67         8
    airbnb policy japan       0.50      0.25      0.33         8
             ammenities       0.54      0.78      0.64         9
             appliances       0.50      0.10      0.17        10
                   beds       1.00      0.33      0.50         6
    building directions       0.00      0.00      0.00         4
           cancellation       1.00      0.33      0.50         3
       check in process       0.37      0.68      0.48        31
      check out process       1.00      0.25      0.40         4
                chinese       0.00      0.00      0.00         2
               cleaning       0.00      0.00      0.00         1
         cleaning issue       0.00      0.00      0.00         1
   confirmation request       0.50      0.45      0.48        

In [18]:
sgd.predict(["can you recommend any restaurants nearby?"])

array(['food recommendations'], dtype='<U23')

In [19]:
sgd.predict(["I will be arriving earlier. Can I leave my bags before 3pm?"])

array(['early luggage storage'], dtype='<U23')

In [20]:
sgd.predict(["can you speak chinese?"])

array(['chinese'], dtype='<U23')

In [21]:
sgd.predict(["I will be arriving earlier. Can I check in early?"])

array(['early check in'], dtype='<U23')

In [22]:
sgd.predict(["how do i get to your place from Haneda Airport?"])

array(['transport directions'], dtype='<U23')

In [23]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier #(setting multi_class = “one_vs_rest”)
from sklearn.svm import LinearSVC #(setting multi_class=”ovr”)
from sklearn.linear_model import LogisticRegressionCV, Perceptron, PassiveAggressiveClassifier

In [24]:
grad = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', GradientBoostingClassifier()),
               ])
grad.fit(X_train, y_train)

y_pred = grad.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.3895216400911162
                         precision    recall  f1-score   support

           access issue       0.50      0.33      0.40         6
                address       0.75      0.38      0.50         8
    airbnb policy japan       1.00      0.12      0.22         8
             ammenities       0.33      0.22      0.27         9
             appliances       0.50      0.30      0.37        10
                   beds       1.00      0.50      0.67         6
    building directions       0.00      0.00      0.00         4
           cancellation       0.00      0.00      0.00         3
       check in process       0.14      0.71      0.24        31
      check out process       0.38      0.75      0.50         4
                chinese       0.00      0.00      0.00         2
               cleaning       0.00      0.00      0.00         1
         cleaning issue       0.00      0.00      0.00         1
   confirmation request       0.17      0.09      0.12       

In [25]:
labels = questions.classification.value_counts().head(6).keys()

In [26]:
svm = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LinearSVC()),
               ])
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.5763097949886105
                         precision    recall  f1-score   support

           access issue       0.33      0.17      0.22         6
                address       0.83      0.62      0.71         8
    airbnb policy japan       0.50      0.25      0.33         8
             ammenities       0.64      0.78      0.70         9
             appliances       0.75      0.30      0.43        10
                   beds       1.00      0.50      0.67         6
    building directions       0.00      0.00      0.00         4
           cancellation       1.00      1.00      1.00         3
       check in process       0.43      0.71      0.54        31
      check out process       0.50      0.25      0.33         4
                chinese       1.00      0.50      0.67         2
               cleaning       0.00      0.00      0.00         1
         cleaning issue       0.00      0.00      0.00         1
   confirmation request       0.56      0.45      0.50       

In [27]:
perc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', Perceptron()),
               ])
perc.fit(X_train, y_train)

y_pred = perc.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.4533029612756264
                         precision    recall  f1-score   support

           access issue       0.29      0.33      0.31         6
                address       0.67      0.75      0.71         8
    airbnb policy japan       0.67      0.25      0.36         8
             ammenities       0.67      0.67      0.67         9
             appliances       1.00      0.20      0.33        10
                   beds       1.00      0.33      0.50         6
    building directions       0.00      0.00      0.00         4
           cancellation       1.00      0.67      0.80         3
       check in process       0.51      0.61      0.56        31
      check out process       0.00      0.00      0.00         4
                chinese       0.33      0.50      0.40         2
                cleaner       0.00      0.00      0.00         0
               cleaning       0.50      1.00      0.67         1
         cleaning issue       0.00      0.00      0.00       



In [28]:


pagc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', PassiveAggressiveClassifier()),
               ])
pagc.fit(X_train, y_train)

y_pred = pagc.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))



accuracy 0.5375854214123007
                         precision    recall  f1-score   support

           access issue       0.50      0.33      0.40         6
                address       0.80      0.50      0.62         8
    airbnb policy japan       0.40      0.25      0.31         8
             ammenities       0.64      0.78      0.70         9
             appliances       1.00      0.20      0.33        10
                   beds       1.00      0.33      0.50         6
    building directions       0.00      0.00      0.00         4
           cancellation       1.00      1.00      1.00         3
       check in process       0.43      0.68      0.53        31
      check out process       0.67      0.50      0.57         4
                chinese       1.00      0.50      0.67         2
               cleaning       1.00      1.00      1.00         1
         cleaning issue       0.00      0.00      0.00         1
   confirmation request       0.43      0.27      0.33       

In [29]:
faq = pd.read_csv('FAQ.txt', delimiter='\t', encoding="utf-8", header = None)
faq = faq.set_index(0).to_dict()[1]

In [30]:
question = input("Enter a question: ")
category = sgd.predict([question])
output = faq[category[0]]
print(output)

Enter a question:  how do i get to your place from the airport?


From Haneda Airport, take the Keikyu Line to Nihonbashi Station. Switch to the Tozai Line and head to Nakano Station.
From Narita Airport, take the Narita Express to Tokyo Station. Switch to the JR Chuo Line and head to Nakano Station.


In [103]:
faq['access issue']

'Please provide a phone number I can contact you on.'