In [881]:
import pandas as pd
from pandas import option_context
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

## This is the main notebook for topic modeling and sentiment analysis (version 2)

In [882]:
df = pd.read_csv('train.csv')
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.shape

(99296, 24)

In [883]:
df_analysis = df[['username','tweet']].copy()
df_analysis.dropna(inplace=True)
df_analysis.reset_index(drop=True,inplace=True)

### Count words

In [884]:
stop = stopwords.words('english')
stop.extend(['hi', 'hey', 'hello','ha', 'followed','wa','dm','dont','cant','wont','get','still','like','need',
            'someone','people','im','ive','month','week','day','could','give','want','please','pls','since','one',
             'back','thanks','thank','take'])

In [885]:
# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','pretty','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

stop.extend(['say','make','go','take','come','see','know','get','got','give',
            'find','think','tell','become','show','leave','feel','put','bring',
            'begin','keep','hold','write','stand','hear','let','mean','set','meet',
            'run','sit','speak','lie','lead','read','grow','fall',
            'build','understand','draw','break','cut','rise','drive','wear',
            'choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

In [886]:
create_compounds = (lambda x: 
                 x.replace('debit card', 'debitcard')
                 .replace('rainbow card', 'debitcard')
                 .replace('bank card', 'debitcard')
                 .replace('revolut card', 'debitcard')
                 .replace('revcards', 'debitcard')
                 .replace('credit card', 'creditcard')
                 .replace('junior card', 'juniorcard')
                 .replace('revolut junior', 'juniorcard')
                 .replace('business account', 'budinessaccount')
                 .replace('savings account', 'savingsaccount')
                 .replace('bank account', 'bankaccount')
                 .replace('premium account', 'premiumaccount')
                 .replace('premium user', 'premiumaccount')
                 .replace('premium plan', 'premiumaccount')
                 .replace('metal account', 'metalaccount')
                 .replace('metal card', 'metalaccount')
                 .replace('metal customers', 'metalaccount')
                 .replace('metal user', 'metalaccount')
                 .replace('business bank', 'businessbank')
                 .replace('virtual card', 'virtualcard')
                 .replace('revolut business', 'revolutbusiness')
                 .replace('google pay', 'googlepay')
                 .replace('apple pay', 'applepay')
                 .replace('applepayment', 'applepay')
                 .replace('samsung pay', 'samsungpay')
                 .replace('cryptocurrency', 'crypto')
                 .replace('cryptos', 'crypto')
                 .replace('cryptoasset', 'crypto')
                 .replace('doge coin', 'crypto')
                 .replace(' doge ', ' crypto ')
                 .replace('customer service', 'customerservice')
                 .replace('supoort', 'support')
                 .replace('customer support', 'customerservice')
                 .replace(' cs ', ' customerservice ')
                 .replace('phone number', 'phonenumber')
                 .replace('social media', 'socialmedia')
                 .replace('app chat', 'appchat')
                 .replace('challenger bank', 'fintech')
                 .replace('challengerbank', 'fintech')
                 .replace('neobanking', 'fintech')
                 .replace('neobanks', 'fintech')
                 .replace('neobank', 'fintech')
                 .replace('transferthis', 'transfer')
                 .replace('application', 'app')
                 .replace('locked', 'lock')
                 .replace('unlock', 'lock')
                 .replace('block', 'lock')
                 .replace('dark mode', 'darkmode')
                 .replace('wealth management', 'wealthmanagement'))


df_analysis['tweet_compound'] = df_analysis.tweet.map(create_compounds)

In [887]:
counter = Counter()
lemma = nltk.stem.WordNetLemmatizer()

for tweet in df_analysis['tweet_compound']:
    tweet = lemma.lemmatize(tweet)
    tweet = TextBlob(tweet).words  # tokenize words
    tweet = [w for w in tweet if w not in stop]

    counter += Counter(tweet)

In [888]:
len(counter)

32774

In [889]:
for phrase, count in counter.most_common(10):
    print('%20s %i' % ("".join(phrase), count))

             account 29068
             revolut 17033
                 app 16213
               money 15544
                lock 13818
                help 12151
                chat 7750
                card 6317
              access 6215
             support 5496


### Topic modeling

In [890]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
    
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop,
                              min_df=2,
                              max_df = 0.9)

doc_word = vectorizer.fit_transform(df_analysis.tweet_compound)
doc_word.shape



(99296, 13172)

In [891]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(99296, 5)

In [892]:
topic_word = nmf_model.components_
topic_word.shape

(5, 13172)

In [893]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['account', 'lock', 'access', 'fund', 'reason', 'document'],
 ['app', 'access', 'new', 'phone', 'card', 'email'],
 ['revolut', 'bank', 'crypto', 'use', 'customer', 'fintech'],
 ['money', 'bank', 'lock', 'transfer', 'send', 'card'],
 ['help', 'chat', 'agent', 'live', 'waiting', 'support']]

- Component 0 (topic 1) seems to be about account queries
- Component 1 (topic 2) seems to be about app-related queries
- Component 2 (topic 3) seems to be about fintech payments / transactions
- Component 3 (topic 4) is about transfers and not being able to access accounts / money
- Component 4 (topic 5) is about general issues / requests for support

In [894]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [915]:
with option_context('display.max_colwidth', 600):
    display(df_analysis.sample(5))

Unnamed: 0,username,tweet,tweet_compound,topic
42283,realsemihboyuk,hi i have a authorisation transaction on my bank account that is indicate as terminate its an transport for london authorisation but i never use my revolut card on tfl card reader my question is when i can have my money back thanks for reply,hi i have a authorisation transaction on my bankaccount that is indicate as terminate its an transport for london authorisation but i never use my debitcard on tfl card reader my question is when i can have my money back thanks for reply,3
94372,alvin06142008,hello my account has been blocked because the system couldnt verify my verification code can i seek your assistance in getting someone to assist me in the app thanks,hello my account has been lock because the system couldnt verify my verification code can i seek your assistance in getting someone to assist me in the app thanks,1
83676,al_shone,yep the bank have no record of the refund but say that the bank has accepted it so what can i do now ive had months of issues between you two and payments i think you should have a chat with each other,yep the bank have no record of the refund but say that the bank has accepted it so what can i do now ive had months of issues between you two and payments i think you should have a chat with each other,4
90482,qstweet,what was the outcome im having yesterday too,what was the outcome im having yesterday too,4
80776,fuckoff_u_cunt,can someone chat with me in dm i need help with the app and no agent from revolut is answering i need help,can someone chat with me in dm i need help with the app and no agent from revolut is answering i need help,4


### Sentiment analysis

In [897]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for tweet in df_analysis.tweet_compound:
    sentiment.append(sid_obj.polarity_scores(tweet))

In [917]:
sentiment_df = pd.DataFrame(sentiment)
sentiment_df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.722,0.278,0.4019
1,0.329,0.671,0.0,-0.9136
2,0.138,0.862,0.0,-0.7269
3,0.0,0.952,0.048,0.2023
4,0.139,0.696,0.165,-0.1531


In [900]:
merged_df = pd.concat([df_analysis, sentiment_df], axis=1)

In [901]:
full_df = pd.concat([df,merged_df],axis=1)

### Analysis

In [903]:
## Add month feature to enable analysis over time
full_df['month'] = pd.DatetimeIndex(full_df['date']).month
full_df['day'] = pd.DatetimeIndex(full_df['date']).day

In [914]:
with option_context('display.max_colwidth', 600):
    display(full_df[(full_df['topic']==2)].sample(2))

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,username.1,tweet.1,tweet_compound,topic,neg,neu,pos,compound,month,day
3465,3465,1338853447324463106,1338511787805773826,2020-12-15 09:28:31 EST,2020-12-15,09:28:31,-500,957772878132498432,delonixkidal,love for my country best joke,...,delonixkidal,love for my country best joke,love for my country best joke,2,0.0,0.221,0.779,0.891,12,15
79448,79448,1243477119767805952,1243476481679986689,2020-03-27 05:57:02 EDT,2020-03-27,05:57:02,-500,765214431651463168,ignitedigitalhq,there have been many exciting launches in the tech world this week we thought you might like our roundup,...,ignitedigitalhq,there have been many exciting launches in the tech world this week we thought you might like our roundup,there have been many exciting launches in the tech world this week we thought you might like our roundup,2,0.0,0.749,0.251,0.6908,3,27


In [905]:
full_df.groupby(['topic','month'])['compound'].mean()

topic  month
0      1        0.033063
       2        0.021700
       3        0.021529
       4        0.043048
       5        0.041482
       6       -0.010150
       7        0.026898
       8        0.009280
       9        0.041911
       10       0.043748
       11       0.025930
       12       0.047196
1      1        0.068411
       2        0.092013
       3        0.103172
       4        0.057641
       5        0.082436
       6        0.055252
       7        0.088006
       8        0.094351
       9        0.086824
       10       0.104480
       11       0.076168
       12       0.082135
2      1        0.120690
       2        0.197029
       3        0.166473
       4        0.130331
       5        0.111652
       6        0.079333
       7        0.160844
       8        0.112680
       9        0.143523
       10       0.135904
       11       0.042663
       12       0.185003
3      1       -0.048408
       2       -0.013104
       3        0.021404
       4    

In [906]:
full_df.groupby('topic')['topic'].count()

topic
0    15334
1    22689
2    19313
3    13987
4    27973
Name: topic, dtype: int64

In [907]:
full_df.shape

(99296, 34)

In [908]:
## Export to csv for Tableau analysis
full_df.to_csv('datatableau.csv')

### TFI-DF

In [909]:
# function to display top n terms associated with each topic

def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [910]:
# tuning vectorizer params
tf_idf = TfidfVectorizer(stop_words=stop,
                         tokenizer=word_tokenize,
                         min_df= 2,
                         max_df= 0.9)

# document-term matrix
doc_word2 = tf_idf.fit_transform(df_analysis.tweet_compound)
print(doc_word2.shape)



(99296, 14605)


In [911]:
n = 5 # number of topics

# model selection, fit/trans, and hyperparameter tuning
nmf_model_2 = NMF(n_components =n)

# doc-topic matrix
doc_topic2 = nmf_model_2.fit_transform(doc_word2)

# creating ids for each topic
topic_ids2 = ["topic"+str(val) for val in range(n)]

# topic-term matrix
topic_word2 = pd.DataFrame(nmf_model_2.components_.round(n),
             index = topic_ids2,
             columns = tf_idf.get_feature_names())

# prints top x words in each topic
display_topics(nmf_model_2, 
               tf_idf.get_feature_names(), 
               5) # number of top words/topic


Topic  0
account, lock, access, months, reason

Topic  1
help, account, problem, revolut, trying

Topic  2
app, access, contact, support, new

Topic  3
waiting, chat, agent, live, reply

Topic  4
money, revolut, bank, transfer, send
