In [15]:
import pandas as pd
import numpy as np
import nltk
import re

In [16]:
# Read the pipe-delimited text file (ISO-8859-1 encoded), using the AutoID
# column as the row index, then preview the first rows.
dataset = pd.read_csv(
    "dataset.txt",
    sep="|",
    encoding="ISO-8859-1",
    index_col="AutoID",
)
dataset.head()

Unnamed: 0_level_0,Date,Year,Month,MediaType,FullText
AutoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,8/26/2015,2015,8,twitter,3 ways the internet of things will change Bank...
2,8/5/2015,2015,8,twitter,BankB BankB Name downgrades apple stock to neu...
3,8/12/2015,2015,8,twitter,BankB returns to profit on INTERNET/! board2? ...
4,8/5/2015,2015,8,twitter,BankB tells advisers to exit paulson hedge fun...
5,8/12/2015,2015,8,twitter,BankC may plead guilty over foreign exchange p...


In [21]:
from nltk.corpus import wordnet as wn, brown

def filter_terms(in_str, sub_list=None):
    """Remove whole-word matches of each pattern from a string.

    Args:
        in_str: Text to filter.
        sub_list: Iterable of regular-expression fragments. Each one is
            removed wherever it matches as a whole word (delimited by
            ``\\b`` on both sides). ``None`` means "nothing to remove".

    Returns:
        ``in_str`` with every match replaced by the empty string. Surrounding
        whitespace is left untouched, so removed words leave a gap.
    """
    if sub_list is None:
        return in_str

    for pattern in sub_list:
        # Group the pattern so both word boundaries apply to the whole
        # pattern. Without the group, "cat|bat" would compile to
        # r"\bcat|bat\b" and strip "cat" out of "catalog".
        in_str = re.sub(r"\b(?:" + pattern + r")\b", '', in_str)

    return in_str

def get_sents(collection):
    """Split a text into sentences and tokenize each sentence into words.

    Args:
        collection: Raw text of a single document/comment.

    Returns:
        A list with one entry per sentence; each entry is the list of tokens
        returned by ``nltk.word_tokenize`` for that sentence. Non-string or
        blank input yields an empty list.
    """
    # Missing text (e.g. NaN from an empty CSV field) has nothing to tokenize.
    if not isinstance(collection, str) or not collection.strip():
        return []

    # nltk.sent_tokenize locates the English Punkt model itself, so this no
    # longer depends on the hard-coded 'tokenizers/punkt/english.pickle'
    # resource path (NOTE(review): newer NLTK releases appear to ship
    # punkt_tab instead of that pickle - confirm against the installed version).
    return [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(collection)]

def pos_tagger(train_sents):
    """Part-of-speech tag a token sequence.

    Thin wrapper: ``train_sents`` is forwarded unchanged to ``nltk.pos_tag``
    and that call's result is returned as-is.
    """
    tagged_tokens = nltk.pos_tag(train_sents)
    return tagged_tokens

def traverse(t):
    """Print every outermost subtree labelled 'NP', walking depth-first.

    Nodes without a ``label()`` method (leaves) are ignored. A node labelled
    'NP' is printed and not descended into; any other labelled node has each
    of its children visited in order.
    """
    try:
        node_label = t.label()
    except AttributeError:
        # Leaf: nothing to print and nothing to descend into.
        return

    if node_label == 'NP':
        print(t)
        return

    for subtree in t:
        traverse(subtree)

In [24]:
# Common bi-grams:
#   ('financial', 'advisers')
#   ('wealth', 'managers')
#   ('bank', 'account')
#   ('debit', 'card')
#   ('credit', 'card')
#   ('checking', 'account')
#   ('close', 'account')
#   ('worst', 'bank')
#   ('data', 'breach')
#   ('bank', 'robbery')
#   ('new', 'bank')
#   ('cash', 'check')
#   ('direct', 'deposit')
#   ('open', 'account')
#   ('bank', 'card')
#   ('savings', 'account')
#   ('online', 'banking')
#   ('account', 'number')
#   ('asset', 'management')

# Phrases to search for in the comments.
topics = [
    "financial advisers",
    "customer service",
    "credit card",
]

In [29]:
# Get key phrases for each topic.

# The chunk grammar is identical for every topic, so the parser is built once
# instead of on every loop iteration.
# Alternative grammar kept for reference: {<JJ.*>+<NN.*>+}
noun_chunker = nltk.RegexpParser('''
    NP: {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
        {<VB.*>+<RB><JJ.*>}
    ''')

# Media-type filters do not depend on the topic either.
is_facebook = dataset["MediaType"] == "facebook"
is_twitter = dataset["MediaType"] == "twitter"

for topic in topics:
    # separate topics visually when printed
    print()
    print()
    print("*" * 100)

    s = re.compile(topic, re.IGNORECASE)
    # Search the text once per topic and reuse the result for both media
    # types. Non-string cells (e.g. NaN from an empty field) count as
    # "no mention" instead of raising TypeError inside s.search().
    mentions_topic = dataset["FullText"].apply(
        lambda text: isinstance(text, str) and s.search(text) is not None
    ).astype(bool)
    fb_mask = is_facebook & mentions_topic
    tw_mask = is_twitter & mentions_topic

    print()
    print("Facebook Comments that mention %s: %s" % (topic, int(fb_mask.sum())))
    print("Twitter Comments that mention %s: %s" % (topic, int(tw_mask.sum())))

    # Tag all sentences of a media type in a single pos_tag_sents call rather
    # than one nltk.pos_tag call per sentence; the result has the same shape
    # (one list of (token, tag) pairs per sentence).
    fb_tagged = nltk.pos_tag_sents(
        [sent for comment in dataset.loc[fb_mask, "FullText"] for sent in get_sents(comment)]
    )
    tw_tagged = nltk.pos_tag_sents(
        [sent for comment in dataset.loc[tw_mask, "FullText"] for sent in get_sents(comment)]
    )

    # Chunk every tagged sentence and print the NP chunks, per media type.
    for source, tagged_sents in (("Facebook", fb_tagged), ("Twitter", tw_tagged)):
        print()
        print("Key Phrases from %s" % source)
        for line in tagged_sents:
            traverse(noun_chunker.parse(line))



****************************************************************************************************

Facebook Comments that mention financial advisers: 2140
Twitter Comments that mention financial advisers: 3

Key Phrases from Facebook
(NP is/VBZ still/RB plenty/JJ)
(NP is/VBZ still/RB plenty/JJ)
(NP tesco/NN is/VBZ still/RB unclear/JJ BankD/NNP)
(NP weakness/NN means/VBZ key/JJ issues/NNS)
(NP weakness/NN means/VBZ key/JJ issues/NNS)
(NP
  buyers/NNS
  INTERNET/NNP
  alpari/NN
  uk/NN
  has/VBZ
  potential/JJ
  buyers/NNS)
(NP forex/NN INTERNET/NN barclays/NNS sets/VBZ aside/JJ â/NNP)
(NP Name/NNP Name/NNP boosts/VBZ easy-access/JJ savings/NNS)
(NP
  airways/NNS
  Name/NNP
  owner/NN
  Name/NNP
  makes/VBZ
  third/JJ
  offer/NN)
(NP Name/NNP owner/NN Name/NNP makes/VBZ third/JJ offer/NN)
(NP bytemark/NN makes/VBZ first/JJ acquisition/NN)
(NP INTERNET/NN bytemark/NN makes/VBZ first/JJ acquisition/NN)
(NP sold/VBN rotten/RB retirement/JJ)
(NP sold/VBN rotten/RB retirement/JJ)
(NP dea

KeyboardInterrupt: 