In [1]:
import numpy as np
import pandas as pd

# Uncomment to see full cell text
pd.set_option('display.max_colwidth', 0)

faq = pd.read_csv('../data/interim/faq-text-separated.csv', keep_default_na=False)

# NLP Spacy

In [2]:
import string
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import spacy 


Resources:
https://spacy.io/usage/spacy-101#training
https://www.kaggle.com/nirant/hitchhiker-s-guide-to-nlp-in-spacy
https://towardsdatascience.com/machine-learning-for-text-classification-using-spacy-in-python-b276b4051a49

# Sentence Tokenizing

In [3]:
def preprocess(entry):
    entry = entry.replace('\n', ' ')
    entry = unicodedata.normalize("NFKD", entry)
    entry = entry.replace('?', '? ')
#     re.sub(r'.(?=[A-Z])', '. ', entry)
    return entry + ' '

raw = pd.read_excel('../data/raw/inquire-boulder-faq-text/inquire-boulder-active-faqs-2019-01-02.xls.xlsx')
raw = raw.drop(labels=('Active'), axis=1).dropna()
raw.FAQ = raw.FAQ.apply(preprocess)

In [13]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(raw.FAQ[14])

In [14]:
# Built-in sentence tokenizer
sentences = list(doc.sents)
print(len(sentences))

47


In [15]:
[print('%s\n-----\n' % (x)) for x in sentences]

Please call Code Enforcement at 303-441-1875 to ask questions or report a code violation.  
-----

Service requests submitted online will receive a response within four business days (7:00 am - 5:00 pm, except holidays).
-----

-----

The purpose of this chapter is to protect native wildife and the public health, safety and welfare by regulating the secure storage of trash and compost materials from being accessible to bears and other wildlife.
-----

Visit boulderwildlifeplan.net for additional information.
-----

Am I required to have a bear-resistant container?  
-----

If so, by when?
-----

Residents living in the Secure Trash Regulation Zone are required to have and properly use bear-resistant containers for compost and trash by June 15, 2016.
-----

Bear Continers and dumpsters need to be latched at all times.
-----

Click here for video on how to use the Western Disposal bear containers and dumpsters.
-----

Does the new ordinance only apply to trash and compost, or does it als

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [7]:
# Compare spaCy stopwords with nltk
from nltk.corpus import stopwords
sw = set(stopwords.words('English'))

print('spaCy stopwords: ', len(nlp.Defaults.stop_words))
print('NLTK stopwords: ', len(sw))
print('Words used by spaCy but not NLTK: ', len(nlp.Defaults.stop_words - sw))
print('Words used by NLTK but not spaCy: ', len(sw - nlp.Defaults.stop_words))
sw.update(nlp.Defaults.stop_words)
print('Combined stopwords: ', len(sw))

spaCy stopwords:  305
NLTK stopwords:  179
Words used by spaCy but not NLTK:  182
Words used by NLTK but not spaCy:  56
Combined stopwords:  361


# spaCy work stops here!

# Stemming

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

porter=PorterStemmer()

def stemSentence(sentence):
    sentence = " ".join(sentence)
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
    return stem_sentence

def text_process(text):
    # Remove punctuation
    text = text.replace('-', ' ')
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    nopunc = [word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    nopunc = stemSentence(nopunc)
    
    return nopunc

In [11]:
text_process('# Vectorization with Bag of Words')

['vector', 'bag', 'word']

# Lemmatization

In [12]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wordnet_lemmatizer = WordNetLemmatizer()

def lemSentence(sentence):
    sentence = " ".join(sentence)
    token_words=word_tokenize(sentence)
    token_words
    lem_sentence=[]
    for word in token_words:
        lem_sentence.append(wordnet_lemmatizer.lemmatize(word))
    return lem_sentence

def text_process(text):
    # Remove punctuation
    text = text.replace('-', ' ')
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    
    # Remove stopwords
    nopunc = [word.lower() for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    nopunc = lemSentence(nopunc)
    
    return nopunc

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/willscott/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
text_process('# Vectorization with Bag of Words')

['vectorization', 'bag', 'word']

# Vectorization with Bag of Words

In [14]:
faq['answer'].apply(text_process)

0      [generally, two, week]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       
1      [everyone, life, household, part, household, addition, people, listed, property, title, hous

In [15]:
# Might take a while...
bow_transformer = CountVectorizer(analyzer=text_process).fit(faq.question)

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

1088


In [16]:
# Let's take one question and get its bag-of-words counts as a vector, putting to use our new `bow_transformer`:
entry = faq['question'][0]
print(entry)

How long does it take to become income-certified? 


In [17]:
# Now let's see its vector representation:
bow = bow_transformer.transform([entry])
print(bow)
print(bow.shape)

  (0, 96)	1
  (0, 164)	1
  (0, 486)	1
  (0, 571)	1
  (0, 985)	1
(1, 1088)


This means that there are X unique words in message (after removing common stop words). 

In [18]:
print(bow_transformer.get_feature_names()[867])

right


Now we can use .transform on our Bag-of-Words (bow) transformed object and transform the entire DataFrame. Let's go ahead and check out how the bag-of-words counts for the entire FAQ corpus is a large, sparse matrix:

In [19]:
faq_bow = bow_transformer.transform(faq['question'])
print('Shape of Sparse Matrix: ', faq_bow.shape)
print('Amount of Non-Zero occurences: ', faq_bow.nnz)

Shape of Sparse Matrix:  (649, 1088)
Amount of Non-Zero occurences:  2842


In [20]:
sparsity = (100.0 * faq_bow.nnz / (faq_bow.shape[0] * faq_bow.shape[1]))
print('sparsity: {}% Non-zero values'.format(round(sparsity, ndigits=2)))

sparsity: 0.4% Non-zero values


In [21]:
tfidf_transformer = TfidfTransformer().fit(faq_bow)
tfidf = tfidf_transformer.transform(bow)
print(tfidf)

  (0, 985)	0.3771095145034031
  (0, 571)	0.40075395419269094
  (0, 486)	0.4356506169713436
  (0, 164)	0.5036830462713718
  (0, 96)	0.5036830462713718


In [22]:
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['boulder']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['county']])

3.287317620863257
6.090678001769792


In [23]:
faq_tfidf = tfidf_transformer.transform(faq_bow)
print(faq_tfidf.shape)

(649, 1088)


### Train the model

In [24]:
from sklearn.naive_bayes import MultinomialNB
faq_match_model = MultinomialNB().fit(faq_tfidf, faq['answer'])

In [25]:
print('predicted:', faq_match_model.predict(tfidf)[0])
print('expected:', faq.answer[0])

predicted: Generally up to two weeks. 
expected: Generally up to two weeks. 


### Evaluate the model

In [35]:
pred = faq_match_model.predict(faq_tfidf)
print(pred)

['Generally up to two weeks. '
 "Everyone who lives in the household or is part of the household, in addition to all people who will be listed on the property title. Household members should include all individuals: legal spouse, domestic partner, or common-law spouse; children(under 18 who reside with the applicant at least 50 percent of the time; and/or a significant other whom the applicant chooses to include as a member of his/her household, who will be occupying the house. The applicant's spouse must be included unless they are legally divorced or separated. Persons not counted include foster children, unborn children, and children under 18 who reside with the applicant less then 50 percent of the time. "
 "Annual income is defined as the anticipated total income for the next 12-month period received from all sources by each member (over the age of 18) of the household. It is assumed that today's circumstances will continue for the next 12 months. The applicant is required to veri

In [39]:
from sklearn.metrics import classification_report
print(classification_report(faq['answer'], pred).strip())

precision    recall  f1-score   support

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

  'precision', 'predicted', average, warn_for)


In [30]:
from sklearn.model_selection import train_test_split

faq_train, faq_test, topic_train, topic_test = \
train_test_split(faq['question'], faq['answer'], test_size=0.2)

print(len(faq_train), len(faq_test), len(faq_train) + len(faq_test))

519 130 649


In [31]:
from sklearn.pipeline import Pipeline

tfidf_transformer = TfidfTransformer().fit(faq_bow)

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', tfidf_transformer),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [32]:
pipeline.fit(faq_train,topic_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x109d9cd08>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None,...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [33]:
pred = pipeline.predict(faq_test)

In [34]:
print(classification_report(pred, faq_test))

                                                                                                                                                                                                                                                                                                                                                                            precision    recall  f1-score   support

                                                                                                                                                                                                                                                                                                                                                                   Airport       0.00      0.00      0.00         1
                                                                                                                                                                                               

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
