In [59]:
import pandas as pd
import numpy as np
import nltk
import re
from tabulate import tabulate
import pickle

In [35]:
from nltk.corpus import wordnet as wn, brown
from nltk.corpus import stopwords

# NOTE: this rebinds the name `stopwords` from the NLTK corpus reader imported
# above to the English stopword collection returned by .words('english');
# the corpus reader is no longer reachable under that name after this line.
stopwords = stopwords.words('english')
# Shared stemmer / lemmatizer instances used by normalize().
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()

def filter_terms(in_str, sub_list=None):
    """Remove whole-word matches of each pattern in `sub_list` from `in_str`.

    Every pattern is treated as a regular expression and is wrapped in
    word-boundary anchors before being substituted with the empty string.
    When `sub_list` is None the input is returned unchanged.
    """
    boundary = "\\b"
    cleaned = in_str
    if sub_list is not None:
        for word_pattern in sub_list:
            anchored = boundary + word_pattern + boundary
            cleaned = re.sub(anchored, '', cleaned)
    return cleaned

def normalize(word):
    """Lowercase a word, then stem it and lemmatize the stem.

    Uses the module-level `stemmer` and `lemmatizer` instances and returns
    the resulting string.
    """
    word = word.lower()
    # `stem` is the public stemming method; the older `stem_word` helper is
    # not available on current NLTK PorterStemmer objects.
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Return True when `word` is 2-40 characters long and not a stopword.

    The stopword comparison is done on the lowercased word against the
    module-level `stopwords` collection.
    """
    if not 2 <= len(word) <= 40:
        # Length check fails: the stopword lookup is skipped entirely.
        return False
    return word.lower() not in stopwords

def leaves(tree):
    """Yield the leaves of every subtree of `tree` whose label is 'NP'."""
    def is_noun_phrase(node):
        return node.label() == 'NP'

    for phrase_node in tree.subtrees(filter=is_noun_phrase):
        yield phrase_node.leaves()
        
def get_terms(tree):
    """Yield one list per NP chunk of `tree`: its acceptable words, lowercased.

    Each chunk comes from leaves(tree) as (word, tag) pairs; words rejected
    by acceptable_word() are dropped, so a yielded list may be empty.
    """
    for tagged_leaf in leaves(tree):
        kept_words = []
        for token, _tag in tagged_leaf:
            if acceptable_word(token):
                kept_words.append(token.lower())
        yield kept_words

# Build POS Tagger

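The default `pos_tag` method from NLTK is not used here; instead, an n-gram backoff tagger is trained on the Brown corpus (and cached to disk) in the cell below.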

In [36]:
from nltk.corpus import brown

# Baseline tagger constructed with the single tag 'NN'.
default_tagger = nltk.DefaultTagger('NN')

# Tagged sentences of the Brown corpus; passed to build_backoff_tagger() below
# as training data when no cached tagger is available.
brown_tagged_sents = brown.tagged_sents()

def build_backoff_tagger(train_sents):
    """Train a trigram -> bigram -> unigram -> default('NN') backoff tagger.

    Each n-gram tagger is trained on `train_sents` and falls back to the
    previously built, lower-order tagger. The trigram tagger is returned.
    """
    tagger = nltk.DefaultTagger('NN')
    for ngram_cls in (nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger):
        tagger = ngram_cls(train_sents, backoff=tagger)
    return tagger
# Load the cached tagger if one exists; otherwise train it and write the cache.
# A missing cache file raises FileNotFoundError (not NameError), and a
# truncated/empty cache raises EOFError -- both mean "rebuild".
# NOTE: pickle.load executes arbitrary code from the file; only use a cache
# file that this notebook produced itself.
try:
    with open('ngram_tagger.pickle', 'rb') as f:
        ngram_tagger = pickle.load(f)
except (FileNotFoundError, EOFError):
    ngram_tagger = build_backoff_tagger(brown_tagged_sents)
    with open('ngram_tagger.pickle', 'wb') as f:
        pickle.dump(ngram_tagger, f)

In [37]:
# Load the raw posts (pipe-delimited, ISO-8859-1 encoded, indexed by AutoID).
dataset = pd.read_csv("dataset.txt", delimiter="|", encoding="ISO-8859-1", index_col="AutoID")

# We only care about banks a-d
# every other bank is irrelevant
relevant_banks = ["twit_hndl_BankA", "twit_hndl_BankB", 
                  "twit_hndl_BankC", "twit_hndl_BankD", 
                  "BankA", "BankB", "BankC", "BankD"]
print("before:", len(dataset))
# Keep rows whose text mentions any relevant bank. The names are escaped and
# OR-ed into one pattern so the test is a literal substring match done in a
# single vectorised pass; na=False drops rows with missing/non-string text
# instead of raising inside a per-row lambda.
bank_pattern = "|".join(re.escape(bank_name) for bank_name in relevant_banks)
dataset = dataset[dataset["FullText"].str.contains(bank_pattern, na=False)]
print("after:", len(dataset))

print(dataset.head())

before: 220377
after: 192180
             Date  Year  Month MediaType  \
AutoID                                     
1       8/26/2015  2015      8   twitter   
2        8/5/2015  2015      8   twitter   
3       8/12/2015  2015      8   twitter   
4        8/5/2015  2015      8   twitter   
5       8/12/2015  2015      8   twitter   

                                                 FullText  
AutoID                                                     
1       3 ways the internet of things will change Bank...  
2       BankB BankB Name downgrades apple stock to neu...  
3       BankB returns to profit on INTERNET/! board2? ...  
4       BankB tells advisers to exit paulson hedge fun...  
5       BankC may plead guilty over foreign exchange p...  


In [38]:
fb_mask = (dataset["MediaType"] == "facebook")
fb_data = dataset[fb_mask].reset_index()
try:
    with open('fb_tagged.pickle', 'rb') as f:
        fb_tagged = pickle.load(f)
    fb_tagged
except NameError:
    # POS tag tokens from comments
    %time fb_tagged = np.array([ngram_tagger.tag(nltk.tokenize.word_tokenize(comment)) for comment in fb_data["FullText"]])
    with open('fb_tagged.pickle', 'wb') as f:
        pickle.dump(fb_tagged, f)

In [39]:
# Common Bi-Grams
# (candidate topic phrases; only the entries placed in `topics` below are processed)
# ('financial', 'advisers'),
# ('wealth', 'managers'),
# ('bank', 'account'),
# ('debit', 'card'),
# ('credit', 'card'),
# ('checking', 'account'),
# ('close', 'account'),
# ('worst', 'bank'),
# ('data', 'breach'),
# ('bank', 'robbery'),
# ('new', 'bank'),
# ('cash', 'check'),
# ('direct', 'deposit'),
# ('open', 'account'),
# ('bank', 'card'),
# ('savings', 'account'),
# ('online', 'banking'),
# ('account', 'number'),
# ('asset', 'management')

# Topic strings matched against the comment text in the key-phrase cell.
topics = np.array(["customer service"])

In [61]:
# get key phrases for each topic
# `out` ends up as a 1-D object array with one entry per extracted noun phrase
# (each entry is a list of lowercased words), accumulated over all topics.
out = np.array([])

# Define Chunker Rules
# Taken from Su Nam Kim Paper...
# Built once up front: the grammar does not depend on the topic.
noun_chunker = nltk.RegexpParser('''
        NP:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
        {<VB.*>+<RB><JJ.*>+}
    ''')
# NOTE(review): the recorded run of this cell was interrupted by hand; if it is
# still slow, profile noun_chunker.parse() -- the cause is not confirmed here.

all_phrases = []
for topic in topics:
    # separate topics visually when printed
    print()
    print(topic)
    print("*" * 100)

    # Row positions of comments mentioning the topic (fb_data has a reset,
    # positional index, so these are valid positions into fb_tagged).
    filters = fb_data[fb_data["FullText"].str.contains(topic)].index.tolist()
    topic_comments = fb_tagged.take(filters, axis=0)
    print(len(topic_comments))

    # Key Phrases from Facebook
    # Plain lists are used for accumulation: np.append returns a new array
    # rather than modifying its argument, so the previous bare
    # `np.append(fb, ...)` / `np.append(out, ...)` calls discarded every result.
    fb = []
    for comment in topic_comments:
        tree = noun_chunker.parse(comment)
        noun_phrase = list(get_terms(tree))
        if noun_phrase:
            fb.extend(noun_phrase)
    all_phrases.extend(fb)

    print("completed iteration", len(fb))

# Pack the phrases into a 1-D object array slot by slot so that phrases of
# different lengths are stored as-is instead of being coerced into one shape.
out = np.empty(len(all_phrases), dtype=object)
for position, phrase in enumerate(all_phrases):
    out[position] = phrase


customer service
****************************************************************************************************
1034


KeyboardInterrupt: 

In [None]:
# Report how many entries the key-phrase cell accumulated in `out`.
print(len(out))