In [69]:
import pandas as pd
from pandas import option_context
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

## This is the main notebook for topic modeling and sentiment analysis (version 2)

In [70]:
df = pd.read_csv('04-data/train.csv')
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.shape

(96446, 24)

In [71]:
df_analysis = df[['username','tweet']].copy()
df_analysis.dropna(inplace=True)
df_analysis.reset_index(drop=True,inplace=True)

### Count words

In [72]:
stop = stopwords.words('english')
stop.extend(['hi', 'hey', 'hello','ha', 'followed','wa','dm','dont','cant','wont','get','still','like','need',
            'someone','people','im','ive','month','week','day','could','give','want','please','pls','since','one',
             'back','thanks','thank','take','doesnt','does', 'might', 'must','lock','access','sent','closed','use',
            'u','contact','new','email','time','message','tried','answer','waiting','issue','reason','going',
            'work','agent','bank','card','trying','even','every','problem','via','chat','support','log',
            'getting','anyone','reply','transfer','nothing'])

In [73]:
# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','pretty','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

stop.extend(['say','make','go','take','come','see','know','get','got','give',
            'find','think','tell','become','show','leave','feel','put','bring',
            'begin','keep','hold','write','stand','hear','let','mean','set','meet',
            'run','sit','speak','lie','lead','read','grow','fall',
            'build','understand','draw','break','cut','rise','drive','wear',
            'choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

#### Compounds

In [74]:
create_compounds = (lambda x: 
                 x.replace('debit card', 'debitcard')
                 .replace('rainbow card', 'debitcard')
                 .replace('bank card', 'debitcard')
                 .replace('revolut card', 'debitcard')
                 .replace('revcards', 'debitcard')
                 .replace('debitcardcrypto', 'debitcard crypto')
                 .replace('credit card', 'creditcard')
                 .replace('junior card', 'juniorcard')
                 .replace('revolut junior', 'juniorcard')
                 .replace('revolut youth', 'juniorcard')
                 .replace('junior accounts', 'juniorcard')
                 .replace('junior account', 'juniorcard')
                 .replace('business account', 'budinessaccount')
                 .replace('savings account', 'savingsaccount')
                 .replace('bank account', 'bankaccount')
                 .replace('premium account', 'premiumaccount')
                 .replace('revolut premium', 'premiumaccount')
                 .replace('premium user', 'premiumaccount')
                 .replace('premium plan', 'premiumaccount')
                 .replace('premium membership', 'premiumaccount')
                 .replace('premium member', 'premiumaccount')
                 .replace('premium paid', 'premiumaccount')
                 .replace('premium payment', 'premiumaccount')
                 .replace('mypremiumaccountworthnothing', 'my premiumaccount worth nothing')
                 .replace('premium service', 'premiumaccount service')
                 .replace('metal account', 'metalaccount')
                 .replace('metal plan', 'metalaccount')
                 .replace('metal customer', 'metalaccount')
                 .replace('metal card', 'metalaccount')
                 .replace('metal customers', 'metalaccount')
                 .replace('metal user', 'metalaccount')
                 .replace('business bank', 'businessbank')
                 .replace('virtual card', 'virtualcard')
                 .replace('revolut business', 'revolutbusiness')
                 .replace('google pay', 'googlepay')
                 .replace('apple pay', 'applepay')
                 .replace('apple wallet', 'applepay')
                 .replace('applepayment', 'applepay')
                 .replace('samsung pay', 'samsungpay')
                 .replace('cryptocurrency', 'crypto')
                 .replace('cryptorelated', 'crypto related')
                 .replace('cryptofriendly', 'crypto friendly')
                 .replace('criptos', 'crypto')
                 .replace('cripto', 'crypto')
                 .replace('cryptos', 'crypto')
                 .replace('cryptoasset', 'crypto')
                 .replace('doge coin', 'crypto')
                 .replace(' doge ', ' crypto ')
                 .replace('customer service', 'customerservice')
                 .replace('supoort', 'support')
                 .replace('customer support', 'customerservice')
                 .replace(' cs ', ' customerservice ')
                 .replace('phone number', 'phonenumber')
                 .replace('social media', 'socialmedia')
                 .replace('app chat', 'appchat')
                 .replace('challenger bank', 'fintech')
                 .replace('challengerbank', 'fintech')
                 .replace('neobanking', 'fintech')
                 .replace('neobanks', 'fintech')
                 .replace('neobank', 'fintech')
                 .replace('transferthis', 'transfer')
                 .replace('application', 'app')
                 .replace('locked', 'lock')
                 .replace('unlock', 'lock')
                 .replace('block', 'lock')
                 .replace('dark mode', 'darkmode')
                 .replace('dark theme', 'darkmode')
                 .replace('xfers', 'transfer')
                 .replace('xfer', 'transfer'))


df_analysis['tweet_compound'] = df_analysis.tweet.map(create_compounds)

#### Count

In [75]:
counter = Counter()
lemma = nltk.stem.WordNetLemmatizer()

for tweet in df_analysis['tweet_compound']:
    tweet = lemma.lemmatize(tweet)
    tweet = TextBlob(tweet).words  # tokenize words
    tweet = [w for w in tweet if w not in stop]

    counter += Counter(tweet)

In [76]:
len(counter)

32555

In [77]:
for phrase, count in counter.most_common(10):
    print('%20s %i' % ("".join(phrase), count))

             account 28770
             revolut 16841
                 app 16158
               money 15495
                help 12044
                days 5350
               would 3982
              months 3511
     customerservice 3296
               weeks 2995


### Topic modeling

In [78]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
    
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop,
                              min_df=2,
                              max_df = 0.9)

doc_word = vectorizer.fit_transform(df_analysis.tweet_compound)
doc_word.shape



(96446, 13051)

In [79]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(96446, 5)

In [80]:
topic_word = nmf_model.components_
topic_word.shape

(5, 13051)

In [81]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['account', 'fund', 'document', 'open', 'customerservice', 'information'],
 ['app', 'phone', 'update', 'id', 'working', 'login'],
 ['revolut', 'crypto', 'customer', 'child', 'fintech', 'payment'],
 ['money', 'send', 'pay', 'bankaccount', 'lost', 'transferred'],
 ['help', 'live', 'phone', 'hour', 'number', 'verify']]

- Component 0 (topic 1) seems to be about account queries (opening, accessing)
- Component 1 (topic 2) seems to be about app-related queries / issues
- Component 2 (topic 3) seems to be about fintech innovations
- Component 3 (topic 4) is about transfers and not being able to access accounts / money
- Component 4 (topic 5) is about general requests for support

In [82]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [83]:
with option_context('display.max_colwidth', 600):
    display(df_analysis.sample(5))

Unnamed: 0,username,tweet,tweet_compound,topic
68596,rafaelfcm,thank you steve for organizing another insightful conference and discuss today how and nik storonsky have developed exciting digital models to serve the financial needs of regular consumers,thank you steve for organizing another insightful conference and discuss today how and nik storonsky have developed exciting digital models to serve the financial needs of regular consumers,2
74046,walshultan,i stupidly tried to sign up the day before i turned i am now and it will not let me verify my identity,i stupidly tried to sign up the day before i turned i am now and it will not let me verify my identity,1
21491,webb_tobias,your now much worse app seems to have removed the option to contact customer service about a lost metal card why no cs for your best customers at the very least please advise,your now much worse app seems to have removed the option to contact customerservice about a lost metalaccount why no customerservice for your best customers at the very least please advise,1
73384,seepancu,i have the account blocked for almost days an no one is replying,i have the account lock for almost days an no one is replying,0
25538,ana_mariju,i do not know how to do it could you please help me,i do not know how to do it could you please help me,4


In [84]:
topic_names = {0: 'Account queries', 1: 'App queries', 2:'Fintech innovation',
              3: 'Unable to access funds / transfers', 4: 'Contact support'}

In [85]:
components_df = pd.DataFrame(nmf_model.components_, 
                             columns = vectorizer.get_feature_names()).T.rename(
                             columns=topic_names).reset_index()

In [86]:
components_df

Unnamed: 0,index,Account queries,App queries,Fintech innovation,Unable to access funds / transfers,Contact support
0,aa,0.000085,0.001092,0.000408,0.000000,0.000000
1,aaaaaand,0.000000,0.000588,0.000000,0.000000,0.000000
2,aaaand,0.000000,0.000000,0.000568,0.000000,0.000000
3,aaccess,0.000024,0.000000,0.000000,0.000000,0.003029
4,aacount,0.000322,0.000000,0.000134,0.000914,0.000517
...,...,...,...,...,...,...
13046,zloty,0.000378,0.000410,0.000179,0.000992,0.000000
13047,zone,0.001715,0.000317,0.000000,0.000661,0.000000
13048,zoom,0.000000,0.001235,0.000445,0.000487,0.000000
13049,zrx,0.000000,0.000995,0.001924,0.000000,0.000000


In [87]:
components_melt = components_df.melt(id_vars = 'index').rename(columns={'index': 'word(s)','variable':'topic'})
components_melt

Unnamed: 0,word(s),topic,value
0,aa,Account queries,0.000085
1,aaaaaand,Account queries,0.000000
2,aaaand,Account queries,0.000000
3,aaccess,Account queries,0.000024
4,aacount,Account queries,0.000322
...,...,...,...
65250,zloty,Contact support,0.000000
65251,zone,Contact support,0.000000
65252,zoom,Contact support,0.000000
65253,zrx,Contact support,0.000000


In [88]:
components_melt.to_csv('components_melt.csv')

### Sentiment analysis

In [89]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for tweet in df_analysis.tweet:
    sentiment.append(sid_obj.polarity_scores(tweet))

In [90]:
sentiment_df = pd.DataFrame(sentiment)
sentiment_df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.722,0.278,0.4019
1,0.329,0.671,0.0,-0.9136
2,0.138,0.862,0.0,-0.7269
3,0.0,0.952,0.048,0.2023
4,0.139,0.696,0.165,-0.1531


In [91]:
merged_df = pd.concat([df_analysis, sentiment_df], axis=1)

In [92]:
full_df = pd.concat([df,merged_df],axis=1)

### RF sentiment classifier

In [93]:
import pickle
filename = 'sentimentRF_model.sav'
loaded_model = pickle.load(open(filename, 'rb'))

In [94]:
# creating bag of words model 
cv1 = CountVectorizer(max_features=1000) 
  
X = cv1.fit_transform(df_analysis['tweet']).toarray() 
result = loaded_model.predict(X)

In [95]:
full_df['sentiment'] = result

In [96]:
full_df.sample(10)

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,year,username.1,tweet.1,tweet_compound,topic,neg,neu,pos,compound,sentiment
78078,187546,1243113154378432512,1243110140741660673,2020-03-26 05:50:46 EDT,2020-03-26,05:50:46,-500,1127215039906238464,alesdonoso,spanish iban and tons tons of investment features,...,2020,alesdonoso,spanish iban and tons tons of investment features,spanish iban and tons tons of investment features,2,0.0,1.0,0.0,0.0,0
69617,169906,1254373833102016513,1254373833102016513,2020-04-26 07:36:41 EDT,2020-04-26,07:36:41,-500,1254371791100928000,marymur65207816,hi i have been trying to resolve an issue on m...,...,2020,marymur65207816,hi i have been trying to resolve an issue on m...,hi i have been trying to resolve an issue on m...,0,0.052,0.904,0.044,0.1441,-1
67102,164787,1258146143814537216,1258145633954869250,2020-05-06 17:26:30 EDT,2020-05-06,17:26:30,-500,171440419,guillaumephotos,yeah definatly a scam trying to get my revplut...,...,2020,guillaumephotos,yeah definatly a scam trying to get my revplut...,yeah definatly a scam trying to get my revplut...,2,0.183,0.594,0.223,-0.0516,-1
37862,104418,1292066041909125120,1292066041909125120,2020-08-08 07:52:04 EDT,2020-08-08,07:52:04,-500,1292063314135179264,szvonovagmailc1,hello ive accidentally deleted app from my pho...,...,2020,szvonovagmailc1,hello ive accidentally deleted app from my pho...,hello ive accidentally deleted app from my pho...,1,0.127,0.737,0.136,0.1164,-1
83718,198569,1235910977025183747,1234887037687390213,2020-03-06 07:51:53 EST,2020-03-06,07:51:53,-500,457440945,parisbtm,hey guys at dont forget to have a life instead...,...,2020,parisbtm,hey guys at dont forget to have a life instead...,hey guys at dont forget to have a life instead...,1,0.0,0.878,0.122,0.1695,0
82845,196824,1237343475248631811,1237343475248631811,2020-03-10 07:44:07 EDT,2020-03-10,07:44:07,-500,1237342996263317504,thomas11724857,hi i just made a revolut and gor my wages oaid...,...,2020,thomas11724857,hi i just made a revolut and gor my wages oaid...,hi i just made a revolut and gor my wages oaid...,4,0.069,0.793,0.138,0.25,-1
12197,53058,1327008886923137024,1327008886923137024,2020-11-12 17:02:27 EST,2020-11-12,17:02:27,-500,128116734,3dvertex,literally times tying to verify id different l...,...,2020,3dvertex,literally times tying to verify id different l...,literally times tying to verify id different l...,1,0.096,0.807,0.098,0.0108,-1
90281,211045,1227975407099613185,1222215259110375427,2020-02-13 10:18:46 EST,2020-02-13,10:18:46,-500,1225435366766804994,bourgeoisyann1,helloooo can you answer,...,2020,bourgeoisyann1,helloooo can you answer,helloooo can you answer,0,0.0,1.0,0.0,0.0,0
87343,205660,1231664919143178241,1221552294212382721,2020-02-23 14:39:34 EST,2020-02-23,14:39:34,-500,2759813242,thx84,same here i got refunded for the previous tran...,...,2020,thx84,same here i got refunded for the previous tran...,same here i got refunded for the previous tran...,3,0.0,1.0,0.0,0.0,1
5020,38542,1336765891153965059,1330823473627803650,2020-12-09 15:13:19 EST,2020-12-09,15:13:19,-500,711917125116542977,piero_rubeca,good evening you can put me in a position to w...,...,2020,piero_rubeca,good evening you can put me in a position to w...,good evening you can put me in a position to w...,3,0.15,0.8,0.05,-0.5719,-1


### Intent classification

In [97]:
import pickle
filename1 = 'intentxgb_model.sav'
intent_model = pickle.load(open(filename1, 'rb'))

In [98]:
# creating bag of words model 
cv2 = CountVectorizer(max_features=880) 
  
X2 = cv2.fit_transform(df_analysis['tweet']).toarray() 
intent_result = intent_model.predict(X2)

In [99]:
full_df['intent'] = intent_result

In [100]:
with option_context('display.max_colwidth', 600):
    display(full_df[(full_df['topic']==2)].sample(2))

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,username.1,tweet.1,tweet_compound,topic,neg,neu,pos,compound,sentiment,intent
24219,77216,1310649247969218560,1310649247969218560,2020-09-28 14:35:06 EDT,2020-09-28,14:35:06,-500,2600733590,mrkushmasta,hi i am a premium member of revolut and have been trying to contact customer support for hours whats the point pf premium service if there no service at all please contact me because i have funds that have not been recieved thank you,...,mrkushmasta,hi i am a premium member of revolut and have been trying to contact customer support for hours whats the point pf premium service if there no service at all please contact me because i have funds that have not been recieved thank you,hi i am a premiumaccount of revolut and have been trying to contact customerservice for hours whats the point pf premiumaccount service if there no service at all please contact me because i have funds that have not been recieved thank you,2,0.087,0.811,0.101,0.1754,-1,0
33826,96264,1297417808075161601,1297417808075161601,2020-08-23 02:18:04 EDT,2020-08-23,02:18:04,-500,955311858596175872,sameer27038259,i was using revolut extensively until dec thereafter i submitted my name change new passport on email as directed by your team no success have now switched to monzo that is far superior my xrp is there in revolut n need to transfer can u help,...,sameer27038259,i was using revolut extensively until dec thereafter i submitted my name change new passport on email as directed by your team no success have now switched to monzo that is far superior my xrp is there in revolut n need to transfer can u help,i was using revolut extensively until dec thereafter i submitted my name change new passport on email as directed by your team no success have now switched to monzo that is far superior my xrp is there in revolut n need to transfer can u help,2,0.057,0.824,0.119,0.4943,-1,4


In [101]:
full_df.groupby('intent').count()

Unnamed: 0_level_0,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,year,username,tweet,tweet_compound,topic,neg,neu,pos,compound,sentiment
intent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,18367,18367,18367,18367,18367,18367,18367,18367,18367,18367,...,18367,18367,18367,18367,18367,18367,18367,18367,18367,18367
2,29923,29923,29923,29923,29923,29923,29923,29923,29923,29923,...,29923,29923,29923,29923,29923,29923,29923,29923,29923,29923
3,7571,7571,7571,7571,7571,7571,7571,7571,7571,7571,...,7571,7571,7571,7571,7571,7571,7571,7571,7571,7571
4,40585,40585,40585,40585,40585,40585,40585,40585,40585,40585,...,40585,40585,40585,40585,40585,40585,40585,40585,40585,40585


### Analysis - high level topics

In [102]:
## Add month feature to enable analysis over time
full_df['month'] = pd.DatetimeIndex(full_df['date']).month
full_df['day'] = pd.DatetimeIndex(full_df['date']).day

In [103]:
with option_context('display.max_colwidth', 600):
    display(full_df[(full_df['topic']==2)].sample(2))

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,tweet_compound,topic,neg,neu,pos,compound,sentiment,intent,month,day
60461,150295,1267053549168590849,1267050424009461760,2020-05-31 07:21:21 EDT,2020-05-31,07:21:21,-500,3019802613,flikq,are you a customer of revolut,...,are you a customer of revolut,2,0.0,1.0,0.0,0.0,0,4,5,31
67466,165516,1257726230885019654,1257726230885019654,2020-05-05 13:37:55 EDT,2020-05-05,13:37:55,-500,1239118647353847809,financeco1,the european neobank intends to take advantage of the crisis to make acquisitions and even possibly buy out competitors revolut business crisis,...,the european fintech intends to take advantage of the crisis to make acquisitions and even possibly buy out competitors revolutbusiness crisis,2,0.281,0.651,0.068,-0.802,0,2,5,5


In [104]:
full_df.groupby(['topic','month'])['compound'].mean()

topic  month
0      1       -0.013253
       2       -0.013951
       3       -0.010863
       4       -0.012134
       5        0.007910
       6       -0.045602
       7       -0.021706
       8       -0.025227
       9       -0.005690
       10       0.015154
       11       0.000724
       12       0.018909
1      1        0.057885
       2        0.063390
       3        0.079860
       4        0.027052
       5        0.054660
       6        0.036043
       7        0.058974
       8        0.071886
       9        0.041276
       10       0.069149
       11       0.040215
       12       0.055014
2      1        0.124337
       2        0.173101
       3        0.155744
       4        0.115257
       5        0.112584
       6        0.067807
       7        0.158784
       8        0.091477
       9        0.122732
       10       0.137600
       11       0.063308
       12       0.156095
3      1       -0.064614
       2       -0.059314
       3       -0.003534
       4    

In [105]:
full_df.groupby('topic')['topic'].count()/len(full_df)

topic
0    0.161075
1    0.270120
2    0.256579
3    0.160577
4    0.151650
Name: topic, dtype: float64

In [106]:
full_df.shape

(96446, 36)

In [107]:
# full_df.to_csv('check.csv')

In [109]:
## Export to csv for Tableau analysis
full_df.to_csv('datatableau3.csv')

In [110]:
# charles94240716

### TFI-DF

In [111]:
# function to display top n terms associated with each topic

def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [112]:
# tuning vectorizer params
tf_idf = TfidfVectorizer(stop_words=stop,
                         tokenizer=word_tokenize,
                         min_df= 2,
                         max_df= 0.9)

# document-term matrix
doc_word2 = tf_idf.fit_transform(df_analysis.tweet_compound)
print(doc_word2.shape)



(96446, 14482)


In [113]:
n = 5 # number of topics

# model selection, fit/trans, and hyperparameter tuning
nmf_model_2 = NMF(n_components =n)

# doc-topic matrix
doc_topic2 = nmf_model_2.fit_transform(doc_word2)

# creating ids for each topic
topic_ids2 = ["topic"+str(val) for val in range(n)]

# topic-term matrix
topic_word2 = pd.DataFrame(nmf_model_2.components_.round(n),
             index = topic_ids2,
             columns = tf_idf.get_feature_names())

# prints top x words in each topic
display_topics(nmf_model_2, 
               tf_idf.get_feature_names(), 
               5) # number of top words/topic


Topic  0
account, days, months, weeks, funds

Topic  1
help, live, verification, login, asap

Topic  2
app, phone, working, login, open

Topic  3
money, days, send, months, weeks

Topic  4
revolut, crypto, would, customerservice, team
