In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from tabulate import tabulate

In [2]:
# Read the pipe-delimited, ISO-8859-1 encoded file and index rows by AutoID.
dataset = pd.read_csv(
    "dataset.txt",
    delimiter="|",
    encoding="ISO-8859-1",
    index_col="AutoID",
)
# Preview the first rows.
dataset.head()

Unnamed: 0_level_0,Date,Year,Month,MediaType,FullText
AutoID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,8/26/2015,2015,8,twitter,3 ways the internet of things will change Bank...
2,8/5/2015,2015,8,twitter,BankB BankB Name downgrades apple stock to neu...
3,8/12/2015,2015,8,twitter,BankB returns to profit on INTERNET/! board2? ...
4,8/5/2015,2015,8,twitter,BankB tells advisers to exit paulson hedge fun...
5,8/12/2015,2015,8,twitter,BankC may plead guilty over foreign exchange p...


In [5]:
from nltk.corpus import wordnet as wn, brown
from nltk.corpus import stopwords
# Shared lemmatizer and Porter stemmer instances used by normalize().
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
# English stop-word list. NOTE: this rebinds the imported ``stopwords`` corpus
# reader to a plain list, which is what acceptable_word() checks against.
stopwords = stopwords.words('english')

def filter_terms(in_str, sub_list=None):
    """Remove whole-word matches of the given regex fragments from a string.

    Args:
        in_str: Text to clean.
        sub_list: Iterable of regular-expression fragments. Each one is
            wrapped in word boundaries before matching. ``None`` leaves the
            text unchanged.

    Returns:
        The text with every match replaced by the empty string.
    """
    cleaned = in_str
    if sub_list is not None:
        for fragment in sub_list:
            # Anchor on word boundaries so only whole words are removed.
            bounded = re.compile("\\b" + fragment + "\\b")
            cleaned = bounded.sub('', cleaned)
    return cleaned

def pos_tagger(train_sents):
    """Return whatever ``nltk.pos_tag`` produces for the given tokens."""
    tagged_tokens = nltk.pos_tag(train_sents)
    return tagged_tokens

def normalize(word):
    """Normalizes words to lowercase and stems and lemmatizes it.

    Args:
        word: Token to normalize.

    Returns:
        The lowercased token after stemming with the module-level ``stemmer``
        and then lemmatizing with the module-level ``lemmatizer``.
    """
    word = word.lower()
    # ``stem_word`` is not available on current NLTK PorterStemmer objects
    # (AttributeError); ``stem`` is the public stemming entry point.
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word

def acceptable_word(word):
    """Tell whether a word is 2-40 characters long and not a stopword.

    The stopword comparison is done on the lowercased word.
    """
    # Reject words outside the accepted length range first.
    if len(word) < 2 or len(word) > 40:
        return False
    return word.lower() not in stopwords

def leaves(tree):
    """Yield the leaf list of every subtree labelled 'NP' in a chunk tree."""
    def is_noun_phrase(node):
        return node.label() == 'NP'

    for noun_phrase in tree.subtrees(filter=is_noun_phrase):
        yield noun_phrase.leaves()
        
def get_terms(tree):
    """Yield, for each noun-phrase chunk, its acceptable words in lowercase.

    A chunk whose words are all rejected still yields an empty list.
    """
    for chunk in leaves(tree):
        kept = []
        for token, _tag in chunk:
            if acceptable_word(token):
                kept.append(token.lower())
        yield kept

In [4]:
# Common bi-grams (presumably candidate topics to search for — TODO confirm):
# ('financial', 'advisers'),
# ('wealth', 'managers'),
# ('bank', 'account'),
# ('debit', 'card'),
# ('credit', 'card'),
# ('checking', 'account'),
# ('close', 'account'),
# ('worst', 'bank'),
# ('data', 'breach'),
# ('bank', 'robbery'),
# ('new', 'bank'),
# ('cash', 'check'),
# ('direct', 'deposit'),
# ('open', 'account'),
# ('bank', 'card'),
# ('savings', 'account'),
# ('online', 'banking'),
# ('account', 'number'),
# ('asset', 'management')

# Topics to extract key phrases for; each entry is matched against the comment
# text as a case-insensitive regular expression.
topics = ["customer service"]

In [7]:
# get key phrases for each topic

# Upper bound on how many matching comments per source are tagged and chunked.
MAX_COMMENTS_PER_SOURCE = 100

# Define Chunker Rules. The grammar does not depend on the topic, so the
# parser is built once instead of on every loop iteration.
noun_chunker = nltk.RegexpParser('''
    NP: {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        {(<NN.*|N.*>+<VBZ><DT|RB>?<JJ.*><NN.*|N.*>+?)}
        {<VB.*>+<RB><JJ.*>+}
    ''')

# Alternative grammar taken from the Su Nam Kim paper (kept for reference):
#     NBAR:
#         {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
#     NP:
#         {<NBAR>}
#         {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...


def _tag_comments(comments):
    """Tokenize each comment and POS-tag its tokens; returns one tagged list per comment."""
    return [pos_tagger(nltk.tokenize.word_tokenize(comment)) for comment in comments]


def _key_phrase_rows(tagged_comments, topic, source):
    """Chunk each tagged comment and build ``[topic, source, phrases]`` table rows.

    Comments for which the chunker yields no noun phrases are skipped.
    """
    rows = []
    for tagged in tagged_comments:
        tree = noun_chunker.parse(tagged)
        # Named ``noun_phrases`` rather than ``np`` so the numpy alias imported
        # at the top of the notebook is not overwritten.
        noun_phrases = list(get_terms(tree))
        if noun_phrases:
            rows.append([topic, source, noun_phrases])
    return rows


for topic in topics:
    # separate topics visually when printed
    print()
    print()
    print("*" * 100)

    # Case-insensitive regex match computed once per topic; missing text
    # (NaN) counts as "no match" instead of raising inside the search.
    mentions_topic = dataset["FullText"].str.contains(topic, case=False, regex=True, na=False)
    fb_mask = (dataset["MediaType"] == "facebook") & mentions_topic
    tw_mask = (dataset["MediaType"] == "twitter") & mentions_topic

    print()
    print("Facebook Comments that mention %s: %s" % (topic, int(fb_mask.sum())))
    print("Twitter Comments that mention %s: %s" % (topic, int(tw_mask.sum())))

    # POS tag tokens from the first MAX_COMMENTS_PER_SOURCE matching comments
    fb_comments = dataset.loc[fb_mask, "FullText"].head(MAX_COMMENTS_PER_SOURCE)
    tw_comments = dataset.loc[tw_mask, "FullText"].head(MAX_COMMENTS_PER_SOURCE)
    fb_tagged = _tag_comments(fb_comments)
    tw_tagged = _tag_comments(tw_comments)

    # Key Phrases from Facebook
    fb = _key_phrase_rows(fb_tagged, topic, 'Facebook')
    print(tabulate(fb, headers=['Topic', 'Source', 'Noun Phrase']))

    # Key Phrases from Twitter
    tw = _key_phrase_rows(tw_tagged, topic, 'Twitter')
    print(tabulate(tw, headers=['Topic', 'Source', 'Noun Phrase']))



****************************************************************************************************

Facebook Comments that mention customer service: 1045
Twitter Comments that mention customer service: 1175
Topic             Source    Noun Phrase
----------------  --------  -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------