In [1]:
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
# Read the pipe-delimited text export into a frame indexed by its AutoID column.
dataset = pd.read_csv(
    "dataset.txt",
    index_col="AutoID",
    encoding="ISO-8859-1",
    delimiter="|",
)
# Show the first rows as the cell's output to sanity-check the parse.
dataset.head()

Unnamed: 0_level_0,Date,Year,Month,MediaType,FullText
AutoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,8/26/2015,2015,8,twitter,3 ways the internet of things will change Bank...
2,8/5/2015,2015,8,twitter,BankB BankB Name downgrades apple stock to neu...
3,8/12/2015,2015,8,twitter,BankB returns to profit on INTERNET/! board2? ...
4,8/5/2015,2015,8,twitter,BankB tells advisers to exit paulson hedge fun...
5,8/12/2015,2015,8,twitter,BankC may plead guilty over foreign exchange p...


In [3]:
def filter_terms(in_str, sub_list=None):
    """Delete whole-word occurrences of each pattern in ``sub_list`` from ``in_str``.

    Each entry of ``sub_list`` is wrapped in ``\\b`` word boundaries and handed
    to the regex engine as-is (so regex metacharacters in an entry are live).
    Entries are applied one after another, in order, and every match is
    replaced by the empty string; surrounding whitespace is left in place.

    Args:
        in_str: Text to clean.
        sub_list: Iterable of regex pattern strings to strip, or ``None`` to
            return ``in_str`` unchanged.

    Returns:
        The text with all matches removed.
    """
    cleaned = in_str
    if sub_list is not None:
        for term in sub_list:
            # Word boundaries keep a term from matching inside a longer word.
            bounded = re.compile("\\b" + term + "\\b")
            cleaned = bounded.sub('', cleaned)
    return cleaned

In [4]:
# Common Bi-Grams
# ('financial', 'advisers'),
# ('wealth', 'managers'),
# ('bank', 'account'),
# ('debit', 'card'),
# ('credit', 'card'),
# ('checking', 'account'),
# ('close', 'account'),
# ('worst', 'bank'),
# ('data', 'breach'),
# ('bank', 'robbery'),
# ('new', 'bank'),
# ('cash', 'check'),
# ('direct', 'deposit'),
# ('open', 'account'),
# ('bank', 'card'),
# ('savings', 'account'),
# ('online', 'banking'),
# ('account', 'number'),
# ('asset', 'management')

In [6]:
# customer service
# Select the comments that mention "customer service" (any casing) and split
# them by the platform recorded in the MediaType column.
s = re.compile("customer service", re.IGNORECASE)

# Compute the text match once and reuse it for both platforms.
# `.str.contains` yields a genuine boolean mask; na=False makes rows with
# missing text count as "no match" instead of raising inside a per-row search.
mentions_phrase = dataset["FullText"].str.contains(s.pattern, case=False, na=False)

fb_mask = (dataset["MediaType"] == "facebook") & mentions_phrase
# Optional extra filter (disabled): only keep long comments.
#        (dataset["FullText"].apply(lambda x: len(x)) > 1000)
tw_mask = (dataset["MediaType"] == "twitter") & mentions_phrase

print("FB Comments that meet criteria: %s" % fb_mask.sum())
print("TW Comments that meet criteria: %s" % tw_mask.sum())

# Placeholder tokens and stop words that could be stripped before printing
# (disabled; pairs with the commented-out filter_terms call below).
# sub_list = ["ADDRESS", "INTERNET", "Name",
#             "twit_hndl", "PHONE", "twit_hndl_BankA",
#             "twit_hndl_BankB", "twit_hndl_BankC", "twit_hndl_BankD",
#             "BankA", "BankB", "BankC", "BankD"]
# sub_list += nltk.corpus.stopwords.words('english')

# Print every matching Facebook comment, separated by blank lines.
# `.loc` selects rows and column in one step rather than chained indexing.
for comment in dataset.loc[fb_mask, "FullText"]:
    print(comment)
#     print(filter_terms(comment, sub_list))
    print()

FB Comments that meet criteria: 1045
TW Comments that meet criteria: 1175
( after processing my information)BankB customer service rep....... mr Name whats the name of the creditor that youre making a stop payment with. ron don don........ child support!(she stops typing and looks me strbankeht in the eyes and says to me while blinking her eyes)mr Name we cant put a hold on your child support. Name don...... i didnt think so.(smiling)of course i told her i was joking and she was still laughing when she completed my transaction. happy hump day

twit_hndl provocateur. possibly the worlds best customer service.... Name them to BankD an exchange. theyd not received the return but have sent the replacement on a next day courier. very impressed.

twit_hndl_BankB Name page has complaints after complaints about the customer service. Name i surprised? no i witnessed it 1st hand. BankB gives 2 shits about their customers. even customers theyve had for over 30 yrs.

twit_hndl_BankB thank you for 

## Key Phrases

In [8]:
from nltk.corpus import wordnet as wn, brown

def get_sents(collection):
    """Split raw text into sentences and tokenize each sentence into words.

    Args:
        collection: Raw text to split (the notebook passes one comment string
            per call).

    Returns:
        A list with one entry per detected sentence; each entry is the list of
        tokens ``nltk.word_tokenize`` produces for that sentence.
    """
    # nltk.sent_tokenize looks up NLTK's English Punkt sentence model itself,
    # so this function does not depend on a hard-coded pickle resource path
    # and does not re-request the tokenizer object on every call.
    sentences = nltk.sent_tokenize(collection)
    return [nltk.word_tokenize(sentence) for sentence in sentences]

def pos_tagger(train_sents):
    """Tag a sequence of tokens with NLTK's default part-of-speech tagger.

    Args:
        train_sents: The tokens to tag; forwarded unchanged to ``nltk.pos_tag``.

    Returns:
        The value returned by ``nltk.pos_tag`` (in this notebook's output, a
        list of ``(token, tag)`` tuples).
    """
    tagged_tokens = nltk.pos_tag(train_sents)
    return tagged_tokens

In [9]:
def tag_comments(comments):
    """POS-tag every sentence of every comment in ``comments``.

    The comments are first flattened into one list of tokenized sentences
    (via ``get_sents``) and then tagged in a single ``nltk.pos_tag_sents``
    call, so the tagger is set up once per batch instead of once per sentence.

    Args:
        comments: Iterable of raw comment strings.

    Returns:
        A flat list with one tagged sentence per entry, in the same order the
        per-sentence ``nltk.pos_tag`` loop would have produced.
    """
    sentences = [sent for comment in comments for sent in get_sents(comment)]
    return nltk.pos_tag_sents(sentences)


# Tag the customer-service comments of each platform.
fb_tagged = tag_comments(dataset.loc[fb_mask, "FullText"])
tw_tagged = tag_comments(dataset.loc[tw_mask, "FullText"])
print(fb_tagged[:10])
print()

[[('(', 'NN'), ('after', 'IN'), ('processing', 'NN'), ('my', 'PRP$'), ('information', 'NN'), (')', ':'), ('BankB', 'NNP'), ('customer', 'NN'), ('service', 'NN'), ('rep', 'NN'), ('...', ':'), ('...', ':'), ('.', '.'), ('mr', 'NN'), ('Name', 'NNP'), ('whats', 'NNS'), ('the', 'DT'), ('name', 'NN'), ('of', 'IN'), ('the', 'DT'), ('creditor', 'NN'), ('that', 'IN'), ('youre', 'NN'), ('making', 'VBG'), ('a', 'DT'), ('stop', 'NN'), ('payment', 'NN'), ('with', 'IN'), ('.', '.')], [('ron', 'NN'), ('don', 'VBD'), ('don', 'NN'), ('...', ':'), ('...', ':'), ('..', ':'), ('child', 'NN'), ('support', 'NN'), ('!', '.')], [('(', 'NN'), ('she', 'PRP'), ('stops', 'VBZ'), ('typing', 'VBG'), ('and', 'CC'), ('looks', 'NNS'), ('me', 'PRP'), ('strbankeht', 'VBD'), ('in', 'IN'), ('the', 'DT'), ('eyes', 'NNS'), ('and', 'CC'), ('says', 'VBZ'), ('to', 'TO'), ('me', 'PRP'), ('while', 'IN'), ('blinking', 'VBG'), ('her', 'PRP'), ('eyes', 'VBZ'), (')', ':'), ('mr', 'NN'), ('Name', 'NNP'), ('we', 'PRP'), ('cant', 'VBP'

In [10]:
def traverse(t):
    """Walk a parse tree depth-first and print each outermost ``NP`` subtree.

    Anything without a ``label()`` method (e.g. a leaf token) is ignored.
    When a node labelled ``'NP'`` is found it is printed and its children are
    not visited; any other labelled node has each of its children visited.

    Args:
        t: A tree node or leaf.

    Returns:
        None. Matching subtrees are written to stdout.
    """
    try:
        node_label = t.label()
    except AttributeError:
        # Leaves carry no label; nothing to report.
        return

    if node_label == 'NP':
        print(t)
        return

    # Not a chunk of interest: keep searching below this node.
    for subtree in t:
        traverse(subtree)

# Chunk grammar over POS tags. All three rules emit chunks labelled "NP":
#   1. one or more noun-like tags, then VBZ, an optional DT or RB, then an
#      adjective tag        (e.g. "customer service is terrible")
#   2. one or more verb tags, then RB, then an adjective tag
#                            (e.g. "were very helpful")
#   3. one or more adjective tags followed by one or more noun tags
#                            (e.g. "bad customer service")
_CHUNK_GRAMMAR = '''
NP: {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*>)}
    {<VB.*>+<RB><JJ.*>}
    {<JJ.*>+<NN.*>+}
'''

noun_chunker = nltk.RegexpParser(_CHUNK_GRAMMAR)

In [12]:
# Chunk the first 100 tagged Facebook sentences and print the NP chunks found.
for tagged_sentence in fb_tagged[:100]:
    chunk_tree = noun_chunker.parse(tagged_sentence)
    traverse(chunk_tree)

(NP best/JJS customer/NN service/NN)
(NP next/JJ day/NN courier/NN)
(NP last/JJ paycheck/NN)
(NP additional/JJ details/NNS)
(NP horrible/JJ horrible/JJ bank/NN)
(NP customer/NN service/NN is/VBZ terrible/JJ)
(NP bad/JJ customer/NN service/NN)
(NP BankD/NNP bank/NN customer/NN service/NN is/VBZ the/DT worst/JJS)
(NP terrible/JJ experience/NN)
(NP terrible/JJ experience/NN)
(NP actual/JJ teller/NN)
(NP cash/NN is/VBZ no/DT good/JJ)
(NP inconsiderate/JJ customer/NN service/NN service/NN)
(NP whole/JJ embarrassing/NN ordeal/NN)
(NP healthy/JJ person/NN)
(NP horrible/JJ customer/NN service/NN service/NN)
(NP horrible/JJ customer/NN service/NN service/NN)
(NP good/JJ customer/NN service/NN)
(NP past/JJ several/JJ different/JJ banking/NN companies/NNS)
(NP drive/JJ throughs/NNS)
(NP more/JJR convenient/JJ hours/NNS)
(NP drive/JJ thru/NN)
(NP enough/JJ people/NNS)
(NP western/JJ cape/NN)
(NP
  other/JJ
  customer/NN
  service/NN
  advisors/NNS
  emergency/NN
  services/NNS
  permanent/NN
  pos

In [13]:
# Chunk the first 100 tagged Twitter sentences and print the NP chunks found.
for tagged_sentence in tw_tagged[:100]:
    chunk_tree = noun_chunker.parse(tagged_sentence)
    traverse(chunk_tree)

(NP twit_hndl_BankB/NNP is/VBZ the/DT worst/JJS)
(NP horrible/JJ customer/NN service/NN service/NN)
(NP customer/NN service/NN line/NN line/NN is/VBZ tedious/JJ)
(NP usual-/JJ thanks/NNS)
(NP were/VBD very/RB helpful/JJ)
(NP worst/JJS customer/NN service/NN)
(NP great/JJ customer/NN service/NN)
(NP last/JJ month/NN)
(NP severe/JJ bug/NN)
(NP BankD/NNP is/VBZ so/RB attentive/JJ)
(NP twit_hndl_BankB/NNP has/VBZ the/DT worst/JJS)
(NP best/JJS online/NN customer/NN service/NN)
(NP service/NN is/VBZ worse/JJR)
(NP customer/NN service/NN is/VBZ human.-/JJ)
(NP best/JJS possible/JJ customer/NN service/NN)
(NP international/JJ collect/NN customer/NN service/NN line/NN)
(NP
  international/JJ
  collect/NN
  customer/NN
  service/NN
  customer/NN
  service/NN
  line/NN)
(NP best/JJS customer/NN service/NN)
(NP international/JJ collect/NN customer/NN service/NN line/NN)
(NP avail/JJ Name/NNP)
(NP local/JJ time/NN)
(NP good/JJ customer/NN service/NN)
(NP knowledgeable/JJ employees/NNS)
(NP best/JJ