In [9]:
import pandas as pd
import numpy as np
import nltk
import pickle

In [10]:
from nltk.corpus import wordnet as wn, brown
from nltk.corpus import stopwords

stopwords = stopwords.words('english')
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()

def filter_terms(in_str, sub_list=None):
    if sub_list is None:
        return in_str

    for pattern in sub_list:
        in_str = re.sub("\\b" + pattern + "\\b", '', in_str)

    return in_str

def normalize(word):
    """Normalizes words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem_word(word)
    word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted

def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP'):
        yield subtree.leaves()
        
def get_terms(tree):
    for leaf in leaves(tree):
        term = [ w.lower() for w,t in leaf if acceptable_word(w) ]
        yield term

In [27]:
try:
    # data/bank_services.pickle or data/customer_service.pickle
    with open('data/customer_service.pickle', 'rb') as f:
        picked_data = pickle.load(f)
except NameError as e:
    print(e)

In [28]:
# Define Chunker Rules
# Taken from Su Nam Kim Paper...
noun_chunker = nltk.RegexpParser('''
    NP:
    {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
    {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
    {<VB.*>+<RB><JJ.*>+}
''')

# Key Phrases from Facebook
key_phrases = []
for i, comment in enumerate(picked_data):
    tree = noun_chunker.parse(comment)
    noun_phrase = [w for w in get_terms(tree)]
    if len(noun_phrase) > 0:
        for i, phrase in enumerate(noun_phrase):
            key_phrases.append(phrase)

In [29]:
key_phrases[:100]

[['informationbankb', 'customer', 'service', 'rep', 'mr'],
 ['whats'],
 ['name'],
 ['creditor'],
 ['youre'],
 ['stop', 'payment'],
 ['ron', 'child', 'supportshe', 'stops'],
 ['strbankeht'],
 ['eyes'],
 ['eyesmr'],
 ['cant'],
 ['hold'],
 ['child', 'support'],
 ['didnt'],
 ['sosmilingof', 'course'],
 [],
 ['transaction', 'happy', 'hump', 'day'],
 ['twithndl', 'provocateur'],
 ['worlds'],
 ['customer', 'service'],
 ['bankd'],
 ['exchange', 'theyd'],
 ['return'],
 ['replacement'],
 ['day', 'courier'],
 ['twithndlbankb'],
 ['page'],
 ['complaints'],
 ['complaints'],
 ['customer', 'service'],
 [],
 [],
 ['st', 'hand', 'bankb'],
 ['shits'],
 ['customers'],
 ['customers', 'theyve'],
 ['yrs'],
 ['twithndlbankb'],
 [],
 ['fraudulent', 'card'],
 ['paycheck'],
 ['pay', 'card'],
 ['fraud'],
 ['account'],
 ['bankb'],
 ['customer', 'service', 'representatives'],
 [],
 ['twithndl'],
 [],
 ['customer', 'service', 'experience'],
 ['additional', 'details'],
 ['feedbankb'],
 ['leadership', 'team'],
 ['con