In [378]:
import pandas as pd
from pandas import option_context
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

In [379]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tawneykirkland/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## This is the main notebook for topic modeling and sentiment analysis (version 2)

In [380]:
df = pd.read_csv('train.csv')
df.shape

(107037, 24)

In [381]:
df_analysis = df[['username','tweet']].copy()
df_analysis.reset_index(drop=True,inplace=True)
df_analysis.dropna(inplace=True)

### Count words

In [382]:
create_compounds = (lambda x: 
                 x.replace('debit card', 'debitcard')
                 .replace('bank card', 'debitcard')
                 .replace('credit card', 'creditcard')
                 .replace('business account', 'budinessaccount')
                 .replace('savings account', 'savingsaccount')
                 .replace('bank account', 'bankaccount')
                 .replace('doge coin', 'dogecoin')
                 .replace('challenger bank', 'fintech')
                 .replace('business bank', 'businessbank')
                 .replace('virtual card', 'virtualcard')
                 .replace('rainbow card', 'debitcard')
                 .replace('metal card', 'metalaccount')
                 .replace('metal user', 'metalaccount')
                 .replace('revolut business', 'revolutbusiness')
                 .replace('google pay', 'googlepay')
                 .replace('cryptocurrency', 'crypto')
                 .replace('fraudolut', 'fraudulent')
                 .replace('premium account', 'premiumaccount')
                 .replace('premium user', 'premiumaccount')
                 .replace('premium plan', 'premiumaccount')
                 .replace('customer service', 'customerservice')
                 .replace('informationrmationrmationrmationrmation', 'information')
                 .replace('informationrmationrmationrmationrmationrmation', 'information')
                 .replace('supoort', 'support')
                 .replace('customer support', 'customerservice')
                 .replace('phone number', 'phonenumber')
                 .replace('social media', 'socialmedia')
                 .replace('app chat', 'appchat')
                 .replace('neobanking', 'fintech')
                 .replace('neobanks', 'fintech')
                 .replace('neobank', 'fintech')
                 .replace('transferthis', 'transfer')
                 .replace('application', 'app')
                 .replace('apple pay', 'applepay')
                 .replace('junior card', 'juniorcard')
                 .replace('send', 'sent')
                 .replace('pay', 'payment')
                 .replace('locked', 'lock'))


df_analysis['tweet_compound'] = df_analysis.tweet.map(create_compounds)

In [383]:
counter = Counter()
lemma = nltk.stem.WordNetLemmatizer()

for tweet in df_analysis['tweet_compound']:
    tweet = lemma.lemmatize(tweet)
    tweet = TextBlob(tweet).words  # tokenize words
    tweet = [w for w in tweet if w not in stop]

    counter += Counter(tweet)

In [384]:
len(counter)

32873

In [385]:
for phrase, count in counter.most_common(100):
    print('%20s %i' % ("".join(phrase), count))

             account 29213
                 app 16284
               money 15623
                chat 7770
               block 7054
                card 7027
                sent 5792
             support 5515
                lock 5409
               agent 4483
                bank 4126
                  dm 3786
               issue 3553
             problem 3362
     customerservice 3204
            response 3024
             message 2951
               funds 2859
            identity 2775
                guys 2725
               going 2723
               email 2648
              crypto 2640
               phone 2585
                 way 2558
           didentity 2542
             getting 2439
            transfer 2386
                  us 2380
             nothing 2353
             service 2325
             working 2310
             fintech 2309
                good 2278
                team 2275
            accounts 2207
           customers 2202
              number 2091
         

### Topic modeling

In [386]:
create_compounds = (lambda x: 
                 x.replace('debit card', 'debitcard')
                 .replace('bank card', 'debitcard')
                 .replace('credit card', 'creditcard')
                 .replace('business account', 'budinessaccount')
                 .replace('savings account', 'savingsaccount')
                 .replace('bank account', 'bankaccount')
                 .replace('doge coin', 'dogecoin')
                 .replace('challenger bank', 'fintech')
                 .replace('business bank', 'businessbank')
                 .replace('virtual card', 'virtualcard')
                 .replace('rainbow card', 'debitcard')
                 .replace('metal card', 'metalaccount')
                 .replace('metal user', 'metalaccount')
                 .replace('revolut business', 'revolutbusiness')
                 .replace('google pay', 'googlepay')
                 .replace('cryptocurrency', 'crypto')
                 .replace('fraudolut', 'fraudulent')
                 .replace('premium account', 'premiumaccount')
                 .replace('premium user', 'premiumaccount')
                 .replace('premium plan', 'premiumaccount')
                 .replace('customer service', 'customerservice')
                 .replace('informationrmationrmationrmationrmation', 'information')
                 .replace('informationrmationrmationrmationrmationrmation', 'information')
                 .replace('supoort', 'support')
                 .replace('customer support', 'customerservice')
                 .replace('phone number', 'phonenumber')
                 .replace('social media', 'socialmedia')
                 .replace('app chat', 'appchat')
                 .replace('neobanking', 'fintech')
                 .replace('neobanks', 'fintech')
                 .replace('neobank', 'fintech')
                 .replace('transferthis', 'transfer')
                 .replace('application', 'app')
                 .replace('apple pay', 'applepay')
                 .replace('junior card', 'juniorcard')
                 .replace('send', 'sent')
                 .replace('pay', 'payment')
                 .replace('locked', 'lock'))


df_analysis['tweet_compound'] = df_analysis.tweet.map(create_compounds)

In [387]:
stop = stopwords.words('english')
stop.extend(['ive', 'im', 'itd', 'youre', 'hi', 'hello', 'hey', 'eg', 'l', 'h','w','v','u',
             'please','get','dont','cant','help','dont','need','thanks','thank you',
             'revolut','since','back','access','days','one','months','want','blocked',
            'people','new','contact','use','cannot','weeks','revolut','hours','thank','someone',
             "'d", "'ll", "'re", "'s", "'ve", 'could', 'day', 'doe', 'ha', 'hour', 'might', 'month', 
              'must', "n't", 'sha', 'wa', 'week', 'wo', 'would','like','got','waiting',
            'answer','tried','trying','try','know','done','reply','last','via','wont','says',
            'able','keep','every','saying','say','havent','asked','many','well','let','reason',
             'time','live'])

In [388]:
# Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','pretty','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# Extend for most common irregular verbs (except pay,lose,send,buy,spend)

stop.extend(['say','make','go','take','come','see','know','get','got','give',
            'find','think','tell','become','show','leave','feel','put','bring',
            'begin','keep','hold','write','stand','hear','let','mean','set','meet',
            'run','sit','speak','lie','lead','read','grow','fall',
            'build','understand','draw','break','cut','rise','drive','wear',
            'choose'])

## Extend for most common irregular verb past tense and past participle

stop.extend(['said','made','gone','went','took','taken','came','saw','seen','knew',
             'known','gotten','gave','given','found','thought','became',''
            'find','think','told','became','showed','shown','left','felt','brought',
            'began','begun','kept','held','wrote','written','stood','heard','let','meant',
            'met','ran','sat','spoke','spoken','lay','lain','led','grew','grown','fell',
             'fallen','built','understood','drew','drawn','broke','broken','rose','risen',
            'drove','driven','wore','worn'])

## Extend for prepositions
stop.extend(['without','among'])

## Extend for verbs
stop.extend(['work','look','looking','seem','seems'])

In [391]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
    
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop,
                              min_df=2)
doc_word = vectorizer.fit_transform(df_analysis.tweet_compound)
doc_word.shape



(107036, 13162)

In [392]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(107036, 5)

In [393]:
topic_word = nmf_model.components_
topic_word.shape

(5, 13162)

In [394]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['account', 'lock', 'block', 'fund', 'unlock', 'sent'],
 ['app', 'phone', 'email', 'log', 'support', 'problem'],
 ['money', 'block', 'sent', 'bank', 'transfer', 'bankaccount'],
 ['card', 'bank', 'free', 'identity', 'payment', 'order'],
 ['chat', 'agent', 'support', 'issue', 'response', 'lock']]

- Component 1 (topic 1) seems to be about account queries
- Component 2 (topic 2) seems to be about app-related queries
- Component 3 (topic 3) seems to be about fund transfers
- Component 4 (topic 4) is about card payments / transactions?

In [395]:
doc_topic

array([[6.15987859e-02, 2.15874124e-05, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 8.62011104e-04, 1.24814716e-03,
        3.70814816e-02],
       [9.34999517e-04, 0.00000000e+00, 9.08439001e-03, 7.67642669e-03,
        8.31385367e-02],
       ...,
       [4.00014297e-05, 4.87274408e-04, 1.29056268e-04, 1.05300126e-03,
        2.17830436e-04],
       [6.53107017e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00],
       [2.23937401e-03, 5.72363549e-03, 2.99717460e-03, 3.45274035e-03,
        1.74047309e-02]])

In [396]:
doc_topic.argmax(axis=1)

array([0, 4, 4, ..., 3, 0, 4])

In [397]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [398]:
with option_context('display.max_colwidth', 600):
    display(df_analysis.sample(20))

Unnamed: 0,username,tweet,tweet_compound,topic
76235,fdberlking,definitely a big fan of disposable virtual card feature since i got it yesterday and tried it out a couple of times good job guys,definitely a big fan of disposable virtualcard feature since i got it yesterday and tried it out a couple of times good job guys,3
5550,adatin78,can you please stop your app continuously asking to add my card to google pay its really annoying,can you please stop your app continuously asking to add my card to googlepayment its really annoying,3
66743,kevkevdgl,any chance an agenty will get back to me,any chance an agenty will get back to me,4
42755,adrian41367576,my account is blocked and i can no longer make payments for days i wait for an agent to answer me and nothing happened please help me,my account is block and i can no longer make paymentments for days i wait for an agent to answer me and nothing happened please help me,0
67254,mobunlim,your response was again that i need to wait wich i have been now ran out of food cant wait longer,your response was again that i need to wait wich i have been now ran out of food cant wait longer,4
31908,akisv3,thailand anytime soon,thailand anytime soon,3
30015,guillegargonz,really your dm is just the compliance team is checking my account and i have to wait an undetermined time for them to contact me with my money on hold i have been waiting for over h and all you are saying is that i should keep waiting indefinitelysounds like mafia,really your dm is just the compliance team is checking my account and i have to wait an undetermined time for them to contact me with my money on hold i have been waiting for over h and all you are saying is that i should keep waiting indefinitelysounds like mafia,2
49142,tobimuelhauser,there is a design bug in the title on the notifications page in your app,there is a design bug in the title on the notifications page in your app,1
19134,eimeareile,oh now safety of finances is paramount i was thinking of moving my salary to revolut account but perhaps not,oh now safety of finances is paramount i was thinking of moving my salary to revolut account but perhaps not,0
63646,thebizonek,yes i cant sent you private message can u send me message please,yes i cant sent you private message can u sent me message please,4


### Sentiment analysis

In [399]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for tweet in df_analysis.tweet:
    sentiment.append(sid_obj.polarity_scores(tweet))

In [400]:
sentiment_df = pd.DataFrame(sentiment)

In [401]:
sentiment_df

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.722,0.278,0.4019
1,0.329,0.671,0.000,-0.9136
2,0.138,0.862,0.000,-0.7269
3,0.000,0.952,0.048,0.2023
4,0.139,0.696,0.165,-0.1531
...,...,...,...,...
107031,0.000,0.580,0.420,0.4404
107032,0.085,0.855,0.060,-0.2144
107033,0.000,0.000,1.000,0.6369
107034,0.000,0.896,0.104,0.2732


In [402]:
merged_df = pd.concat([df_analysis, sentiment_df], axis=1)

In [403]:
full_df = pd.concat([df,merged_df],axis=1)

### Analysis

In [404]:
## Add month feature to enable analysis over time
full_df['month'] = pd.DatetimeIndex(full_df['date']).month
full_df['day'] = pd.DatetimeIndex(full_df['date']).day

In [405]:
with option_context('display.max_colwidth', 600):
    display(full_df[(full_df['month']==11) & (full_df['day']==1) & (full_df['topic']==1)].sample(20))

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,username.1,tweet.1,tweet_compound,topic,neg,neu,pos,compound,month,day
17026,59481,1322898729796591618,1322898729796591618,2020-11-01 08:50:10 EST,2020-11-01,08:50:10,-500,311951399,conobrien3,hi how do you contact customer service on the junior app,...,conobrien3,hi how do you contact customer service on the junior app,hi how do you contact customerservice on the junior app,1.0,0.0,1.0,0.0,0.0,11,1
17025,59479,1322898752911351814,1322663923162730496,2020-11-01 08:50:15 EST,2020-11-01,08:50:15,-500,1037084895988908032,bozomedia,update it in whatever app store you use,...,bozomedia,update it in whatever app store you use,update it in whatever app store you use,1.0,0.0,1.0,0.0,0.0,11,1
16820,59046,1322966303997628416,1322961484108992513,2020-11-01 13:18:41 EST,2020-11-01,13:18:41,-500,572870197,mcardlepaula,ive deleted the app and reinstalled perhaps you guys should have a look for ios users thanks,...,mcardlepaula,ive deleted the app and reinstalled perhaps you guys should have a look for ios users thanks,ive deleted the app and reinstalled perhaps you guys should have a look for ios users thanks,1.0,0.0,0.847,0.153,0.4404,11,1
17192,59810,1322857308892913669,1322856065357254656,2020-11-01 06:05:34 EST,2020-11-01,06:05:34,-500,82331535,world_is_nuts,stop trying to pretend youre clever just update the app like everyone else has done revolut do employ computer experts you know theyre hardly scrolling through twitter looking for the fix,...,world_is_nuts,stop trying to pretend youre clever just update the app like everyone else has done revolut do employ computer experts you know theyre hardly scrolling through twitter looking for the fix,stop trying to pretend youre clever just update the app like everyone else has done revolut do employ computer experts you know theyre hardly scrolling through twitter looking for the fix,1.0,0.1,0.748,0.152,0.4404,11,1
16948,59308,1322920708582871046,1322663923162730496,2020-11-01 10:17:30 EST,2020-11-01,10:17:30,-500,593726528,brianjo42039768,my app keeps crashing after i use my fingerprint or passcode to log in,...,brianjo42039768,my app keeps crashing after i use my fingerprint or passcode to log in,my app keeps crashing after i use my fingerprint or passcode to log in,1.0,0.0,1.0,0.0,0.0,11,1
17159,59758,1322864653169549312,1322663923162730496,2020-11-01 06:34:45 EST,2020-11-01,06:34:45,-500,33457664,stedas,hi my revolut app is still crashing any updates on when will this issue be resolved,...,stedas,hi my revolut app is still crashing any updates on when will this issue be resolved,hi my revolut app is still crashing any updates on when will this issue be resolved,1.0,0.0,0.898,0.102,0.1779,11,1
17205,59828,1322852933428486146,1322852933428486146,2020-11-01 05:48:11 EST,2020-11-01,05:48:11,-500,431046100,freddiemcc1,every time i open my rev app this morning its crashing before im able to login please advise,...,freddiemcc1,every time i open my rev app this morning its crashing before im able to login please advise,every time i open my rev app this morning its crashing before im able to login please advise,1.0,0.0,0.881,0.119,0.3182,11,1
16881,59181,1322941600054579204,1322663923162730496,2020-11-01 11:40:31 EST,2020-11-01,11:40:31,-500,929817405047664642,zulsey,i reinstalled the app but now it wont allow me back in because i cant access the email on my phone that you need to authenticate joke,...,zulsey,i reinstalled the app but now it wont allow me back in because i cant access the email on my phone that you need to authenticate joke,i reinstalled the app but now it wont allow me back in because i cant access the email on my phone that you need to authenticate joke,1.0,0.067,0.839,0.094,0.2025,11,1
17262,59928,1322840906739916801,1322663923162730496,2020-11-01 05:00:24 EST,2020-11-01,05:00:24,-500,4776177291,slamjamde,i am on ios though,...,slamjamde,i am on ios though,i am on ios though,1.0,0.0,1.0,0.0,0.0,11,1
17211,59836,1322851249721503744,1322851249721503744,2020-11-01 05:41:29 EST,2020-11-01,05:41:29,-500,1085106822,mchal11,after the update the application now works well,...,mchal11,after the update the application now works well,after the update the app now works well,1.0,0.0,0.769,0.231,0.2732,11,1


In [406]:
full_df.groupby(['topic','month'])['compound'].mean()

topic  month
0.0    1        0.037839
       2        0.088104
       3        0.091985
       4        0.071265
       5        0.072332
       6        0.038354
       7        0.078414
       8        0.071208
       9        0.064293
       10       0.068454
       11       0.056945
       12       0.088749
1.0    1        0.054159
       2        0.124268
       3        0.109901
       4        0.085414
       5        0.080431
       6        0.035185
       7        0.109305
       8        0.094359
       9        0.094436
       10       0.114452
       11       0.085522
       12       0.097149
2.0    1        0.058671
       2        0.086376
       3        0.077127
       4        0.041302
       5        0.060680
       6        0.034097
       7        0.069240
       8        0.057424
       9        0.037712
       10      -0.024231
       11      -0.185129
       12      -0.016859
3.0    1        0.079337
       2        0.112528
       3        0.117433
       4    

In [407]:
full_df.groupby('topic')['topic'].count()

topic
0.0    20771
1.0    15999
2.0    15416
3.0    22019
4.0    32831
Name: topic, dtype: int64

In [408]:
## Export to csv for Tableau analysis
full_df.to_csv('datatableau.csv')

### TFI-DF

In [409]:
# function to display top n terms associated with each topic

def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [410]:
# tuning vectorizer params
tf_idf = TfidfVectorizer(stop_words=stop,
                         tokenizer=word_tokenize,
                         min_df=2)

# document-term matrix
doc_word2 = tf_idf.fit_transform(df_analysis.tweet_compound)
print(doc_word2.shape)



(107036, 14569)


In [411]:
n = 4 # number of topics

# model selection, fit/trans, and hyperparameter tuning
nmf_model_2 = NMF(n_components =n)

# doc-topic matrix
doc_topic2 = nmf_model_2.fit_transform(doc_word2)

# creating ids for each topic
topic_ids2 = ["topic"+str(val) for val in range(n)]

# topic-term matrix
topic_word2 = pd.DataFrame(nmf_model_2.components_.round(n),
             index = topic_ids2,
             columns = tf_idf.get_feature_names())

# prints top x words in each topic
display_topics(nmf_model_2, 
               tf_idf.get_feature_names(), 
               10) # number of top words/topic


Topic  0
account, block, lock, unlock, chat, agent, funds, problem, closed, response

Topic  1
dm, sent, followed, check, follow, issue, message, replied, problem, asap

Topic  2
app, chat, support, agent, card, working, log, problem, phone, issue

Topic  3
money, sent, bank, card, transfer, block, didentity, bankaccount, going, lost
