In [78]:
import pandas as pd
from pandas import option_context
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

## This is the main notebook for topic modeling and sentiment analysis (version 2)

In [79]:
df = pd.read_csv('train.csv')
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.shape

(96446, 24)

In [80]:
df_analysis = df[['username','tweet']].copy()
df_analysis.dropna(inplace=True)
df_analysis.reset_index(drop=True,inplace=True)

### Count words

In [81]:
stop = stopwords.words('english')
stop.extend(['hi', 'hey', 'hello','ha', 'followed','wa','dm','dont','cant','wont','get','still','like','need',
            'someone','people','im','ive','month','week','day','could','give','want','please','pls','since','one',
             'back','thanks','thank','take','doesnt','does', 'might', 'must','lock','access','sent','closed','use',
            'u','contact','new','email','time','message','tried','answer','waiting','issue','reason','going',
            'work','agent','bank','card','trying','even','every','problem','via','chat','support','log',
            'getting','anyone','reply','transfer','nothing'])

In [82]:
# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','pretty','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

stop.extend(['say','make','go','take','come','see','know','get','got','give',
            'find','think','tell','become','show','leave','feel','put','bring',
            'begin','keep','hold','write','stand','hear','let','mean','set','meet',
            'run','sit','speak','lie','lead','read','grow','fall',
            'build','understand','draw','break','cut','rise','drive','wear',
            'choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

#### Compounds

In [83]:
create_compounds = (lambda x: 
                 x.replace('debit card', 'debitcard')
                 .replace('rainbow card', 'debitcard')
                 .replace('bank card', 'debitcard')
                 .replace('revolut card', 'debitcard')
                 .replace('revcards', 'debitcard')
                 .replace('debitcardcrypto', 'debitcard crypto')
                 .replace('credit card', 'creditcard')
                 .replace('junior card', 'juniorcard')
                 .replace('revolut junior', 'juniorcard')
                 .replace('revolut youth', 'juniorcard')
                 .replace('junior accounts', 'juniorcard')
                 .replace('junior account', 'juniorcard')
                 .replace('business account', 'budinessaccount')
                 .replace('savings account', 'savingsaccount')
                 .replace('bank account', 'bankaccount')
                 .replace('premium account', 'premiumaccount')
                 .replace('revolut premium', 'premiumaccount')
                 .replace('premium user', 'premiumaccount')
                 .replace('premium plan', 'premiumaccount')
                 .replace('premium membership', 'premiumaccount')
                 .replace('premium member', 'premiumaccount')
                 .replace('premium paid', 'premiumaccount')
                 .replace('premium payment', 'premiumaccount')
                 .replace('mypremiumaccountworthnothing', 'my premiumaccount worth nothing')
                 .replace('premium service', 'premiumaccount service')
                 .replace('metal account', 'metalaccount')
                 .replace('metal plan', 'metalaccount')
                 .replace('metal customer', 'metalaccount')
                 .replace('metal card', 'metalaccount')
                 .replace('metal customers', 'metalaccount')
                 .replace('metal user', 'metalaccount')
                 .replace('business bank', 'businessbank')
                 .replace('virtual card', 'virtualcard')
                 .replace('revolut business', 'revolutbusiness')
                 .replace('google pay', 'googlepay')
                 .replace('apple pay', 'applepay')
                 .replace('apple wallet', 'applepay')
                 .replace('applepayment', 'applepay')
                 .replace('samsung pay', 'samsungpay')
                 .replace('cryptocurrency', 'crypto')
                 .replace('cryptorelated', 'crypto related')
                 .replace('cryptofriendly', 'crypto friendly')
                 .replace('criptos', 'crypto')
                 .replace('cripto', 'crypto')
                 .replace('cryptos', 'crypto')
                 .replace('cryptoasset', 'crypto')
                 .replace('doge coin', 'crypto')
                 .replace(' doge ', ' crypto ')
                 .replace('customer service', 'customerservice')
                 .replace('supoort', 'support')
                 .replace('customer support', 'customerservice')
                 .replace(' cs ', ' customerservice ')
                 .replace('phone number', 'phonenumber')
                 .replace('social media', 'socialmedia')
                 .replace('app chat', 'appchat')
                 .replace('challenger bank', 'fintech')
                 .replace('challengerbank', 'fintech')
                 .replace('neobanking', 'fintech')
                 .replace('neobanks', 'fintech')
                 .replace('neobank', 'fintech')
                 .replace('transferthis', 'transfer')
                 .replace('application', 'app')
                 .replace('locked', 'lock')
                 .replace('unlock', 'lock')
                 .replace('block', 'lock')
                 .replace('dark mode', 'darkmode')
                 .replace('dark theme', 'darkmode')
                 .replace('xfers', 'transfer')
                 .replace('xfer', 'transfer'))


df_analysis['tweet_compound'] = df_analysis.tweet.map(create_compounds)

#### Count

In [84]:
counter = Counter()
lemma = nltk.stem.WordNetLemmatizer()

for tweet in df_analysis['tweet_compound']:
    tweet = lemma.lemmatize(tweet)
    tweet = TextBlob(tweet).words  # tokenize words
    tweet = [w for w in tweet if w not in stop]

    counter += Counter(tweet)

In [85]:
len(counter)

32555

In [86]:
for phrase, count in counter.most_common(10):
    print('%20s %i' % ("".join(phrase), count))

             account 28770
             revolut 16841
                 app 16158
               money 15495
                help 12044
                days 5350
               would 3982
              months 3511
     customerservice 3296
               weeks 2995


### Topic modeling

In [87]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
    
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop,
                              min_df=2,
                              max_df = 0.9)

doc_word = vectorizer.fit_transform(df_analysis.tweet_compound)
doc_word.shape



(96446, 13051)

In [88]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(96446, 5)

In [89]:
topic_word = nmf_model.components_
topic_word.shape

(5, 13051)

In [90]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['account', 'fund', 'document', 'open', 'customerservice', 'information'],
 ['app', 'phone', 'update', 'id', 'working', 'login'],
 ['revolut', 'crypto', 'customer', 'child', 'fintech', 'payment'],
 ['money', 'send', 'pay', 'bankaccount', 'lost', 'transferred'],
 ['help', 'live', 'phone', 'hour', 'number', 'verify']]

- Component 0 (topic 1) seems to be about account queries
- Component 1 (topic 2) seems to be about app-related queries
- Component 2 (topic 3) seems to be about fintech innovations
- Component 3 (topic 4) is about transfers and not being able to access accounts / money
- Component 4 (topic 5) is about general requests for support

In [91]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [92]:
with option_context('display.max_colwidth', 600):
    display(df_analysis.sample(5))

Unnamed: 0,username,tweet,tweet_compound,topic
4662,laugucci,make a crypto wallet i want to make paystransfers,make a crypto wallet i want to make paystransfers,2
94567,triona_c,actually isnt the above false advertising while you dont charge a monthly atm fee you do charge at source no,actually isnt the above false advertising while you dont charge a monthly atm fee you do charge at source no,3
91100,farrellcoin,i was already following,i was already following,4
78632,hopla21368453,thank you with a little help everything is possible i chated with customer support thank you very much,thank you with a little help everything is possible i chated with customerservice thank you very much,4
83734,francoistango,little bugs in my code little bugs in my code found one fixed one little bugs in my code little bugs in my code,little bugs in my code little bugs in my code found one fixed one little bugs in my code little bugs in my code,1


In [93]:
topic_names = {0: 'Account queries', 1: 'App queries', 2:'Fintech innovation',
              3: 'Unable to access funds / transfers', 4: 'Contact support'}

In [94]:
components_df = pd.DataFrame(nmf_model.components_, 
                             columns = vectorizer.get_feature_names()).T.rename(
                             columns=topic_names).reset_index()

In [95]:
components_df

Unnamed: 0,index,Account queries,App queries,Fintech innovation,Unable to access funds / transfers,Contact support
0,aa,0.000085,0.001092,0.000408,0.000000,0.000000
1,aaaaaand,0.000000,0.000588,0.000000,0.000000,0.000000
2,aaaand,0.000000,0.000000,0.000568,0.000000,0.000000
3,aaccess,0.000024,0.000000,0.000000,0.000000,0.003029
4,aacount,0.000322,0.000000,0.000134,0.000914,0.000517
...,...,...,...,...,...,...
13046,zloty,0.000378,0.000410,0.000179,0.000992,0.000000
13047,zone,0.001715,0.000317,0.000000,0.000661,0.000000
13048,zoom,0.000000,0.001235,0.000445,0.000487,0.000000
13049,zrx,0.000000,0.000995,0.001924,0.000000,0.000000


In [114]:
components_melt = components_df.melt(id_vars = 'index').rename(columns={'index': 'word(s)','variable':'topic'})
components_melt

Unnamed: 0,word(s),topic,value
0,aa,Account queries,0.000085
1,aaaaaand,Account queries,0.000000
2,aaaand,Account queries,0.000000
3,aaccess,Account queries,0.000024
4,aacount,Account queries,0.000322
...,...,...,...
65250,zloty,Contact support,0.000000
65251,zone,Contact support,0.000000
65252,zoom,Contact support,0.000000
65253,zrx,Contact support,0.000000


In [98]:
components_melt.to_csv('components_melt.csv')

### Sentiment analysis

In [99]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for tweet in df_analysis.tweet:
    sentiment.append(sid_obj.polarity_scores(tweet))

In [100]:
sentiment_df = pd.DataFrame(sentiment)
sentiment_df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.722,0.278,0.4019
1,0.329,0.671,0.0,-0.9136
2,0.138,0.862,0.0,-0.7269
3,0.0,0.952,0.048,0.2023
4,0.139,0.696,0.165,-0.1531


In [101]:
merged_df = pd.concat([df_analysis, sentiment_df], axis=1)

In [102]:
full_df = pd.concat([df,merged_df],axis=1)

### Analysis - high level topics

In [103]:
## Add month feature to enable analysis over time
full_df['month'] = pd.DatetimeIndex(full_df['date']).month
full_df['day'] = pd.DatetimeIndex(full_df['date']).day

In [104]:
with option_context('display.max_colwidth', 600):
    display(full_df[(full_df['topic']==2)].sample(2))

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,username.1,tweet.1,tweet_compound,topic,neg,neu,pos,compound,month,day
77918,187302,1243121772674912256,1243121772674912256,2020-03-26 06:25:01 EDT,2020-03-26,06:25:01,-500,351257711,timhc22,not really many perks available to be used at the moment might just make it if covid finishes in months,...,timhc22,not really many perks available to be used at the moment might just make it if covid finishes in months,not really many perks available to be used at the moment might just make it if covid finishes in months,2,0.0,1.0,0.0,0.0,3,26
16994,62811,1321429150314516483,1321397770939412481,2020-10-28 08:30:35 EDT,2020-10-28,08:30:35,-500,725041628948381697,hedihans,a way to transfer crypto out,...,hedihans,a way to transfer crypto out,a way to transfer crypto out,2,0.0,1.0,0.0,0.0,10,28


In [105]:
full_df.groupby(['topic','month'])['compound'].mean()

topic  month
0      1       -0.013253
       2       -0.013951
       3       -0.010863
       4       -0.012134
       5        0.007910
       6       -0.045602
       7       -0.021706
       8       -0.025227
       9       -0.005690
       10       0.015154
       11       0.000724
       12       0.018909
1      1        0.057885
       2        0.063390
       3        0.079860
       4        0.027052
       5        0.054660
       6        0.036043
       7        0.058974
       8        0.071886
       9        0.041276
       10       0.069149
       11       0.040215
       12       0.055014
2      1        0.124337
       2        0.173101
       3        0.155744
       4        0.115257
       5        0.112584
       6        0.067807
       7        0.158784
       8        0.091477
       9        0.122732
       10       0.137600
       11       0.063308
       12       0.156095
3      1       -0.064614
       2       -0.059314
       3       -0.003534
       4    

In [106]:
full_df.groupby('topic')['topic'].count()

topic
0    15535
1    26052
2    24746
3    15487
4    14626
Name: topic, dtype: int64

In [107]:
full_df.shape

(96446, 34)

In [108]:
# full_df.to_csv('check.csv')

In [109]:
## Export to csv for Tableau analysis
full_df.to_csv('datatableau.csv')

In [110]:
# charles94240716

### TFI-DF

In [111]:
# function to display top n terms associated with each topic

def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [112]:
# tuning vectorizer params
tf_idf = TfidfVectorizer(stop_words=stop,
                         tokenizer=word_tokenize,
                         min_df= 2,
                         max_df= 0.9)

# document-term matrix
doc_word2 = tf_idf.fit_transform(df_analysis.tweet_compound)
print(doc_word2.shape)



(96446, 14482)


In [113]:
n = 5 # number of topics

# model selection, fit/trans, and hyperparameter tuning
nmf_model_2 = NMF(n_components =n)

# doc-topic matrix
doc_topic2 = nmf_model_2.fit_transform(doc_word2)

# creating ids for each topic
topic_ids2 = ["topic"+str(val) for val in range(n)]

# topic-term matrix
topic_word2 = pd.DataFrame(nmf_model_2.components_.round(n),
             index = topic_ids2,
             columns = tf_idf.get_feature_names())

# prints top x words in each topic
display_topics(nmf_model_2, 
               tf_idf.get_feature_names(), 
               5) # number of top words/topic


Topic  0
account, days, months, weeks, funds

Topic  1
help, live, verification, login, asap

Topic  2
app, phone, working, login, open

Topic  3
money, days, send, months, weeks

Topic  4
revolut, crypto, would, customerservice, team


### Fix split of data

In [31]:
# df_intent = pd.read_csv('training_base.csv',
#                        usecols = ['username','tweet','intent'])

# def remove_space(text):
#     text = text.strip()
#     text = re.sub("\s\s+", " ", text)
#     return text
    
# df_intent['tweet'] = df_intent.tweet.map(remove_space)
# merged_df['tweet'] = merged_df.tweet.map(remove_space)

In [32]:
merged_df

Unnamed: 0,username,tweet,tweet_compound,topic,neg,neu,pos,compound
0,richard64637000,i need help with account verification skaterbo...,i need help with account verification skaterbo...,4,0.000,0.722,0.278,0.4019
1,darley_darren,your agents are beyond useless lied to me and ...,your agents are beyond useless lied to me and ...,4,0.329,0.671,0.000,-0.9136
2,darley_darren,nd time revolut put me in a bad position and h...,nd time revolut put me in a bad position and h...,2,0.138,0.862,0.000,-0.7269
3,aoife_galvin,how can i talk to someone about currency excha...,how can i talk to someone about currency excha...,1,0.000,0.952,0.048,0.2023
4,cian47477384,how do i change my number if ive no access to ...,how do i change my number if ive no access to ...,1,0.139,0.696,0.165,-0.1531
...,...,...,...,...,...,...,...,...
97060,cristinayo12,omg months mine is locked for months and im tr...,omg months mine is lock for months and im tryi...,1,0.133,0.867,0.000,-0.3818
97061,towerspro,my account has been hacked and i cant contact ...,my account has been hacked and i cant contact ...,0,0.137,0.863,0.000,-0.4019
97062,zlymeda,good for you i was not even contacted via inap...,good for you i was not even contacted via inap...,0,0.085,0.855,0.060,-0.2144
97063,_saulmm,after sending them a bunch of documents they u...,after sending them a bunch of documents they l...,0,0.000,0.896,0.104,0.2732


In [33]:
# merged_df['key'] = merged_df['username'] + merged_df['tweet']
# d = merged_df[~merged_df['key'].isin(df_intent['username'] + df_intent['tweet'])].drop(['key'], axis=1)

In [34]:
# tr1, test1 = train_test_split(d,train_size=200,random_state=10)

In [35]:
#test1

Unnamed: 0,username,tweet,tweet_compound,topic,neg,neu,pos,compound
623,riaz65257734,can you help me get my money from revolut,can you help me get my money from revolut,4,0.000,0.748,0.252,0.4019
39752,denieseday,i cant log in either i can put my phone number...,i cant log in either i can put my phonenumber ...,1,0.000,0.905,0.095,0.2144
24780,manshuag,ive had the same issue i have k stuck in a bus...,ive had the same issue i have k stuck in a bud...,3,0.125,0.839,0.035,-0.5152
72773,princetubby01,and it goes quiet terrible support everywhere,and it goes quiet terrible support everywhere,1,0.287,0.463,0.250,-0.1027
76482,abbygreen3,id advise following for advice on good banksac...,id advise following for advice on good banksac...,1,0.000,0.811,0.189,0.4939
...,...,...,...,...,...,...,...,...
73101,elenidamis,i just did and of course no reply whatsoever i...,i just did and of course no reply whatsoever i...,4,0.136,0.864,0.000,-0.2960
35993,carlestt,how can i change my phone if i get an sms in m...,how can i change my phone if i get an sms in m...,1,0.102,0.837,0.060,-0.2263
32176,z3kron_,nah it was when i actually had an office,nah it was when i actually had an office,2,0.149,0.851,0.000,-0.1027
12965,aaronmccarthy33,any idea why i tried to take out euro from my ...,any idea why i tried to take out euro from my ...,3,0.064,0.936,0.000,-0.4215


In [36]:
# tr1.to_csv('training_8.csv')

In [37]:
# test1.to_csv('testing_base.csv')

In [38]:
#tr2, test2 = train_test_split(test1,train_size=300,random_state=10)

In [39]:
# tr2.to_csv('training_9.csv')