In [85]:
import pandas as pd
from pandas import option_context
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

## This is the main notebook for topic modeling and sentiment analysis (version 2)

In [2]:
df = pd.read_csv('train.csv')
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.shape

(97065, 24)

In [3]:
df_analysis = df[['username','tweet']].copy()
df_analysis.dropna(inplace=True)
df_analysis.reset_index(drop=True,inplace=True)

### Count words

In [118]:
stop = stopwords.words('english')
stop.extend(['hi', 'hey', 'hello','ha', 'followed','wa','dm','dont','cant','wont','get','still','like','need',
            'someone','people','im','ive','month','week','day','could','give','want','please','pls','since','one',
             'back','thanks','thank','take','doesnt','does', 'might', 'must','lock','access','sent','closed','use',
            'u','contact','new','email','time','message','tried','answer','waiting','issue','reason','going',
            'work','agent','bank','card','trying','even','every','problem','via','chat','support','log',
            'getting','anyone','reply','transfer','nothing'])

In [119]:
# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','pretty','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

stop.extend(['say','make','go','take','come','see','know','get','got','give',
            'find','think','tell','become','show','leave','feel','put','bring',
            'begin','keep','hold','write','stand','hear','let','mean','set','meet',
            'run','sit','speak','lie','lead','read','grow','fall',
            'build','understand','draw','break','cut','rise','drive','wear',
            'choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

#### Compounds

In [6]:
create_compounds = (lambda x: 
                 x.replace('debit card', 'debitcard')
                 .replace('rainbow card', 'debitcard')
                 .replace('bank card', 'debitcard')
                 .replace('revolut card', 'debitcard')
                 .replace('revcards', 'debitcard')
                 .replace('debitcardcrypto', 'debitcard crypto')
                 .replace('credit card', 'creditcard')
                 .replace('junior card', 'juniorcard')
                 .replace('revolut junior', 'juniorcard')
                 .replace('revolut youth', 'juniorcard')
                 .replace('junior accounts', 'juniorcard')
                 .replace('junior account', 'juniorcard')
                 .replace('business account', 'budinessaccount')
                 .replace('savings account', 'savingsaccount')
                 .replace('bank account', 'bankaccount')
                 .replace('premium account', 'premiumaccount')
                 .replace('revolut premium', 'premiumaccount')
                 .replace('premium user', 'premiumaccount')
                 .replace('premium plan', 'premiumaccount')
                 .replace('premium membership', 'premiumaccount')
                 .replace('premium member', 'premiumaccount')
                 .replace('premium paid', 'premiumaccount')
                 .replace('premium payment', 'premiumaccount')
                 .replace('mypremiumaccountworthnothing', 'my premiumaccount worth nothing')
                 .replace('premium service', 'premiumaccount service')
                 .replace('metal account', 'metalaccount')
                 .replace('metal plan', 'metalaccount')
                 .replace('metal customer', 'metalaccount')
                 .replace('metal card', 'metalaccount')
                 .replace('metal customers', 'metalaccount')
                 .replace('metal user', 'metalaccount')
                 .replace('business bank', 'businessbank')
                 .replace('virtual card', 'virtualcard')
                 .replace('revolut business', 'revolutbusiness')
                 .replace('google pay', 'googlepay')
                 .replace('apple pay', 'applepay')
                 .replace('apple wallet', 'applepay')
                 .replace('applepayment', 'applepay')
                 .replace('samsung pay', 'samsungpay')
                 .replace('cryptocurrency', 'crypto')
                 .replace('cryptorelated', 'crypto related')
                 .replace('cryptofriendly', 'crypto friendly')
                 .replace('criptos', 'crypto')
                 .replace('cripto', 'crypto')
                 .replace('cryptos', 'crypto')
                 .replace('cryptoasset', 'crypto')
                 .replace('doge coin', 'crypto')
                 .replace(' doge ', ' crypto ')
                 .replace('customer service', 'customerservice')
                 .replace('supoort', 'support')
                 .replace('customer support', 'customerservice')
                 .replace(' cs ', ' customerservice ')
                 .replace('phone number', 'phonenumber')
                 .replace('social media', 'socialmedia')
                 .replace('app chat', 'appchat')
                 .replace('challenger bank', 'fintech')
                 .replace('challengerbank', 'fintech')
                 .replace('neobanking', 'fintech')
                 .replace('neobanks', 'fintech')
                 .replace('neobank', 'fintech')
                 .replace('transferthis', 'transfer')
                 .replace('application', 'app')
                 .replace('locked', 'lock')
                 .replace('unlock', 'lock')
                 .replace('block', 'lock')
                 .replace('dark mode', 'darkmode')
                 .replace('dark theme', 'darkmode')
                 .replace('xfers', 'transfer')
                 .replace('xfer', 'transfer'))


df_analysis['tweet_compound'] = df_analysis.tweet.map(create_compounds)

#### Count

In [7]:
counter = Counter()
lemma = nltk.stem.WordNetLemmatizer()

for tweet in df_analysis['tweet_compound']:
    tweet = lemma.lemmatize(tweet)
    tweet = TextBlob(tweet).words  # tokenize words
    tweet = [w for w in tweet if w not in stop]

    counter += Counter(tweet)

In [8]:
len(counter)

32671

In [9]:
for phrase, count in counter.most_common(10):
    print('%20s %i' % ("".join(phrase), count))

             account 28816
             revolut 16940
                 app 16176
               money 15512
                help 12066
                chat 7708
             support 5474
                days 5365
               would 4011
              months 3512


### Topic modeling

In [120]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
    
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop,
                              min_df=2,
                              max_df = 0.9)

doc_word = vectorizer.fit_transform(df_analysis.tweet_compound)
doc_word.shape



(97065, 13088)

In [121]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(97065, 5)

In [122]:
topic_word = nmf_model.components_
topic_word.shape

(5, 13088)

In [123]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['account', 'fund', 'document', 'open', 'customerservice', 'information'],
 ['app', 'phone', 'update', 'id', 'working', 'login'],
 ['revolut', 'crypto', 'customer', 'child', 'fintech', 'payment'],
 ['money', 'send', 'pay', 'bankaccount', 'lost', 'transferred'],
 ['help', 'live', 'phone', 'hour', 'number', 'verify']]

- Component 0 (topic 1) seems to be about account queries
- Component 1 (topic 2) seems to be about app-related queries
- Component 2 (topic 3) seems to be about fintech innovations
- Component 3 (topic 4) is about transfers and not being able to access accounts / money
- Component 4 (topic 5) is about general requests for support

In [124]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [125]:
with option_context('display.max_colwidth', 600):
    display(df_analysis.sample(5))

Unnamed: 0,username,tweet,tweet_compound,topic
89581,s3ran0,i was requested to verify my funds i submitted all the necessary information and there is no reponse,i was requested to verify my funds i submitted all the necessary information and there is no reponse,0
92225,nizamaniasfand,kindly dm me i am facing issues since last days and support chat didnt reply,kindly dm me i am facing issues since last days and support chat didnt reply,1
16412,saraahowens,hi ive been trying to get through to an agent on the chat for over a day now really need some help with my account,hi ive been trying to get through to an agent on the chat for over a day now really need some help with my account,4
78237,oblon_fitz_ob,cashdeposits via very important for german market,cashdeposits via very important for german market,2
12671,phil08game,is everything ok with the app cant use my account for anything,is everything ok with the app cant use my account for anything,1


### Sentiment analysis

In [126]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for tweet in df_analysis.tweet:
    sentiment.append(sid_obj.polarity_scores(tweet))

In [127]:
sentiment_df = pd.DataFrame(sentiment)
sentiment_df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.722,0.278,0.4019
1,0.329,0.671,0.0,-0.9136
2,0.138,0.862,0.0,-0.7269
3,0.0,0.952,0.048,0.2023
4,0.139,0.696,0.165,-0.1531


In [128]:
merged_df = pd.concat([df_analysis, sentiment_df], axis=1)

In [129]:
full_df = pd.concat([df,merged_df],axis=1)

### Analysis - high level topics

In [130]:
## Add month feature to enable analysis over time
full_df['month'] = pd.DatetimeIndex(full_df['date']).month
full_df['day'] = pd.DatetimeIndex(full_df['date']).day

In [131]:
with option_context('display.max_colwidth', 600):
    display(full_df[(full_df['topic']==2)].sample(2))

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,username.1,tweet.1,tweet_compound,topic,neg,neu,pos,compound,month,day
17045,62557,1321476143804022793,1321476143804022793,2020-10-28 11:37:19 EDT,2020-10-28,11:37:19,-500,117465252,timpbreden,whats happening im a long term customer cant get in my bank or pay payees in my system your response is non existent,...,timpbreden,whats happening im a long term customer cant get in my bank or pay payees in my system your response is non existent,whats happening im a long term customer cant get in my bank or pay payees in my system your response is non existent,2,0.06,0.94,0.0,-0.1027,10,28
10841,49899,1329510904468287498,1329510904468287498,2020-11-19 14:44:35 EST,2020-11-19,14:44:35,-500,28559072,zippertheory,your customer service is absolute garbage stay far far away from this horrendous neobank,...,zippertheory,your customer service is absolute garbage stay far far away from this horrendous neobank,your customerservice is absolute garbage stay far far away from this horrendous fintech,2,0.257,0.743,0.0,-0.6705,11,19


In [132]:
full_df.groupby(['topic','month'])['compound'].mean()

topic  month
0      1       -0.013182
       2       -0.013563
       3       -0.011696
       4       -0.012413
       5        0.008540
       6       -0.046260
       7       -0.020787
       8       -0.024922
       9       -0.007432
       10       0.014977
       11       0.001018
       12       0.022280
1      1        0.056801
       2        0.063006
       3        0.079246
       4        0.026384
       5        0.055621
       6        0.034792
       7        0.058540
       8        0.072775
       9        0.040242
       10       0.070952
       11       0.041433
       12       0.056215
2      1        0.125854
       2        0.171484
       3        0.155860
       4        0.116767
       5        0.114207
       6        0.063835
       7        0.156966
       8        0.089978
       9        0.124518
       10       0.134403
       11       0.061899
       12       0.156189
3      1       -0.067235
       2       -0.057253
       3       -0.002995
       4    

In [133]:
full_df.groupby('topic')['topic'].count()

topic
0    15560
1    26169
2    25213
3    15423
4    14700
Name: topic, dtype: int64

In [134]:
full_df.shape

(97065, 34)

In [25]:
# full_df.to_csv('check.csv')

In [26]:
## Export to csv for Tableau analysis
#full_df.to_csv('datatableau.csv')

In [27]:
# charles94240716

### TFI-DF

In [28]:
# function to display top n terms associated with each topic

def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [29]:
# tuning vectorizer params
tf_idf = TfidfVectorizer(stop_words=stop,
                         tokenizer=word_tokenize,
                         min_df= 2,
                         max_df= 0.9)

# document-term matrix
doc_word2 = tf_idf.fit_transform(df_analysis.tweet_compound)
print(doc_word2.shape)



(97065, 14531)


In [30]:
n = 5 # number of topics

# model selection, fit/trans, and hyperparameter tuning
nmf_model_2 = NMF(n_components =n)

# doc-topic matrix
doc_topic2 = nmf_model_2.fit_transform(doc_word2)

# creating ids for each topic
topic_ids2 = ["topic"+str(val) for val in range(n)]

# topic-term matrix
topic_word2 = pd.DataFrame(nmf_model_2.components_.round(n),
             index = topic_ids2,
             columns = tf_idf.get_feature_names())

# prints top x words in each topic
display_topics(nmf_model_2, 
               tf_idf.get_feature_names(), 
               5) # number of top words/topic


Topic  0
account, months, weeks, days, funds

Topic  1
help, revolut, problem, log, anyone

Topic  2
app, log, phone, support, problem

Topic  3
money, revolut, send, transfer, months

Topic  4
chat, live, reply, days, support


### Fix split of data

In [86]:
df_intent = pd.read_csv('training_base.csv',
                       usecols = ['username','tweet','intent'])

def remove_space(text):
    text = text.strip()
    text = re.sub("\s\s+", " ", text)
    return text
    
df_intent['tweet'] = df_intent.tweet.map(remove_space)
merged_df['tweet'] = merged_df.tweet.map(remove_space)

In [87]:
merged_df

Unnamed: 0,username,tweet,tweet_compound,topic,neg,neu,pos,compound,key
0,richard64637000,i need help with account verification skaterbo...,i need help with account verification skaterbo...,4,0.000,0.722,0.278,0.4019,richard64637000i need help with account verifi...
1,darley_darren,your agents are beyond useless lied to me and ...,your agents are beyond useless lied to me and ...,4,0.329,0.671,0.000,-0.9136,darley_darrenyour agents are beyond useless li...
2,darley_darren,nd time revolut put me in a bad position and h...,nd time revolut put me in a bad position and h...,2,0.138,0.862,0.000,-0.7269,darley_darrennd time revolut put me in a bad p...
3,aoife_galvin,how can i talk to someone about currency excha...,how can i talk to someone about currency excha...,1,0.000,0.952,0.048,0.2023,aoife_galvinhow can i talk to someone about cu...
4,cian47477384,how do i change my number if ive no access to ...,how do i change my number if ive no access to ...,1,0.139,0.696,0.165,-0.1531,cian47477384how do i change my number if ive n...
...,...,...,...,...,...,...,...,...,...
97060,cristinayo12,omg months mine is locked for months and im tr...,omg months mine is lock for months and im tryi...,1,0.133,0.867,0.000,-0.3818,cristinayo12omg months mine is locked for mont...
97061,towerspro,my account has been hacked and i cant contact ...,my account has been hacked and i cant contact ...,0,0.137,0.863,0.000,-0.4019,towerspromy account has been hacked and i cant...
97062,zlymeda,good for you i was not even contacted via inap...,good for you i was not even contacted via inap...,1,0.085,0.855,0.060,-0.2144,zlymedagood for you i was not even contacted v...
97063,_saulmm,after sending them a bunch of documents they u...,after sending them a bunch of documents they l...,0,0.000,0.896,0.104,0.2732,_saulmmafter sending them a bunch of documents...


In [88]:
merged_df['key'] = merged_df['username'] + merged_df['tweet']
d = merged_df[~merged_df['key'].isin(df_intent['username'] + df_intent['tweet'])].drop(['key'], axis=1)

In [89]:
tr1, test1 = train_test_split(d,train_size=200,random_state=10)

In [90]:
test1

Unnamed: 0,username,tweet,tweet_compound,topic,neg,neu,pos,compound
76694,mushroomsouffle,hi my app is refusing to top up because it say...,hi my app is refusing to top up because it say...,1,0.107,0.746,0.147,0.1551
92999,oliverbeirne,well that dm was no help you have my money giv...,well that dm was no help you have my money giv...,4,0.101,0.805,0.094,-0.0408
28918,nucleargeo,can someone direct message me please concernin...,can someone direct message me please concernin...,0,0.000,0.777,0.223,0.3182
84763,channerz_maiden,with the collapse if we have booked flights is...,with the collapse if we have booked flights is...,3,0.167,0.833,0.000,-0.4939
32115,soso13651675,she did not asked she is just assuming unbelie...,she did not asked she is just assuming unbelie...,1,0.233,0.650,0.117,-0.4201
...,...,...,...,...,...,...,...,...
35886,herr_malcom,it would request for a different card is this ...,it would request for a different card is this ...,1,0.000,0.813,0.187,0.3182
32077,hsearson93,i had to log out of my app and now i cant log ...,i had to log out of my app and now i cant log ...,1,0.000,0.773,0.227,0.6124
12944,tarsnotteboom,same for me maybe we can hire a professional p...,same for me maybe we can hire a professional p...,3,0.289,0.711,0.000,-0.6908
19171,sneakersnow,i had revolut until you locked me out for mont...,i had revolut until you lock me out for months...,3,0.000,1.000,0.000,0.0000


In [92]:
tr1.to_csv('training_8.csv')

In [93]:
test1.to_csv('testing_base.csv')

In [135]:
tr2, test2 = train_test_split(test1,train_size=300,random_state=10)

In [139]:
tr2.to_csv('training_9.csv')