In [87]:
import pandas as pd
from pandas import option_context
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

## This is the main notebook for topic modeling and sentiment analysis (version 2)

In [88]:
df = pd.read_csv('train.csv')
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.shape

(97188, 24)

In [89]:
df_analysis = df[['username','tweet']].copy()
df_analysis.dropna(inplace=True)
df_analysis.reset_index(drop=True,inplace=True)

### Count words

In [90]:
stop = stopwords.words('english')
stop.extend(['hi', 'hey', 'hello','ha', 'followed','wa','dm','dont','cant','wont','get','still','like','need',
            'someone','people','im','ive','month','week','day','could','give','want','please','pls','since','one',
             'back','thanks','thank','take','doesnt','does', 'might', 'must','lock','access','sent','closed','use',
            'u','contact','new','email','time','message','tried','answer','waiting','issue','reason','going',
            'work','agent','bank','card','trying','even','every'])

In [91]:
# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','pretty','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

stop.extend(['say','make','go','take','come','see','know','get','got','give',
            'find','think','tell','become','show','leave','feel','put','bring',
            'begin','keep','hold','write','stand','hear','let','mean','set','meet',
            'run','sit','speak','lie','lead','read','grow','fall',
            'build','understand','draw','break','cut','rise','drive','wear',
            'choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

#### Compounds

In [92]:
create_compounds = (lambda x: 
                 x.replace('debit card', 'debitcard')
                 .replace('rainbow card', 'debitcard')
                 .replace('bank card', 'debitcard')
                 .replace('revolut card', 'debitcard')
                 .replace('revcards', 'debitcard')
                 .replace('debitcardcrypto', 'debitcard crypto')
                 .replace('credit card', 'creditcard')
                 .replace('junior card', 'juniorcard')
                 .replace('revolut junior', 'juniorcard')
                 .replace('revolut youth', 'juniorcard')
                 .replace('junior accounts', 'juniorcard')
                 .replace('junior account', 'juniorcard')
                 .replace('business account', 'budinessaccount')
                 .replace('savings account', 'savingsaccount')
                 .replace('bank account', 'bankaccount')
                 .replace('premium account', 'premiumaccount')
                 .replace('revolut premium', 'premiumaccount')
                 .replace('premium user', 'premiumaccount')
                 .replace('premium plan', 'premiumaccount')
                 .replace('premium membership', 'premiumaccount')
                 .replace('premium member', 'premiumaccount')
                 .replace('premium paid', 'premiumaccount')
                 .replace('premium payment', 'premiumaccount')
                 .replace('mypremiumaccountworthnothing', 'my premiumaccount worth nothing')
                 .replace('premium service', 'premiumaccount service')
                 .replace('metal account', 'metalaccount')
                 .replace('metal plan', 'metalaccount')
                 .replace('metal customer', 'metalaccount')
                 .replace('metal card', 'metalaccount')
                 .replace('metal customers', 'metalaccount')
                 .replace('metal user', 'metalaccount')
                 .replace('business bank', 'businessbank')
                 .replace('virtual card', 'virtualcard')
                 .replace('revolut business', 'revolutbusiness')
                 .replace('google pay', 'googlepay')
                 .replace('apple pay', 'applepay')
                 .replace('apple wallet', 'applepay')
                 .replace('applepayment', 'applepay')
                 .replace('samsung pay', 'samsungpay')
                 .replace('cryptocurrency', 'crypto')
                 .replace('cryptorelated', 'crypto related')
                 .replace('cryptofriendly', 'crypto friendly')
                 .replace('criptos', 'crypto')
                 .replace('cripto', 'crypto')
                 .replace('cryptos', 'crypto')
                 .replace('cryptoasset', 'crypto')
                 .replace('doge coin', 'crypto')
                 .replace(' doge ', ' crypto ')
                 .replace('customer service', 'customerservice')
                 .replace('supoort', 'support')
                 .replace('customer support', 'customerservice')
                 .replace(' cs ', ' customerservice ')
                 .replace('phone number', 'phonenumber')
                 .replace('social media', 'socialmedia')
                 .replace('app chat', 'appchat')
                 .replace('challenger bank', 'fintech')
                 .replace('challengerbank', 'fintech')
                 .replace('neobanking', 'fintech')
                 .replace('neobanks', 'fintech')
                 .replace('neobank', 'fintech')
                 .replace('transferthis', 'transfer')
                 .replace('application', 'app')
                 .replace('locked', 'lock')
                 .replace('unlock', 'lock')
                 .replace('block', 'lock')
                 .replace('dark mode', 'darkmode')
                 .replace('dark theme', 'darkmode')
                 .replace('xfers', 'transfer')
                 .replace('xfer', 'transfer'))


df_analysis['tweet_compound'] = df_analysis.tweet.map(create_compounds)

#### Count

In [93]:
counter = Counter()
lemma = nltk.stem.WordNetLemmatizer()

for tweet in df_analysis['tweet_compound']:
    tweet = lemma.lemmatize(tweet)
    tweet = TextBlob(tweet).words  # tokenize words
    tweet = [w for w in tweet if w not in stop]

    counter += Counter(tweet)

In [94]:
len(counter)

32679

In [95]:
for phrase, count in counter.most_common(10):
    print('%20s %i' % ("".join(phrase), count))

             account 28824
             revolut 16947
                 app 16179
               money 15512
                help 12070
                chat 7711
             support 5474
                days 5372
               would 4011
              months 3516


### Topic modeling

In [96]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
    
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop,
                              min_df=2,
                              max_df = 0.9)

doc_word = vectorizer.fit_transform(df_analysis.tweet_compound)
doc_word.shape



(97188, 13102)

In [97]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(97188, 5)

In [98]:
topic_word = nmf_model.components_
topic_word.shape

(5, 13102)

In [99]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['account', 'fund', 'chat', 'document', 'open', 'information'],
 ['app', 'support', 'phone', 'chat', 'log', 'problem'],
 ['revolut', 'crypto', 'customer', 'child', 'fintech', 'transfer'],
 ['money', 'transfer', 'send', 'pay', 'bankaccount', 'lost'],
 ['help', 'chat', 'live', 'support', 'hour', 'problem']]

- Component 0 (topic 1) seems to be about account queries
- Component 1 (topic 2) seems to be about app-related queries
- Component 2 (topic 3) seems to be about fintech innovations
- Component 3 (topic 4) is about transfers and not being able to access accounts / money
- Component 4 (topic 5) is about general requests for support

In [100]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [101]:
with option_context('display.max_colwidth', 600):
    display(df_analysis.sample(5))

Unnamed: 0,username,tweet,tweet_compound,topic
14997,elpublloret,hi how can i change my pin when i cant remember my old pin please,hi how can i change my pin when i cant remember my old pin please,1
23520,nunoamo33671947,anything i can do on my end to speed this up,anything i can do on my end to speed this up,2
14587,_rinakay,omg what that is awful sucks that happened to you surely that must be illegalyou can report it somewhere,omg what that is awful sucks that happened to you surely that must be illegalyou can report it somewhere,3
47101,iskeyrol,did they manage to fix your account,did they manage to fix your account,0
86973,auduccio_,ill contact them then thank you,ill contact them then thank you,3


### Sentiment analysis

In [102]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for tweet in df_analysis.tweet_compound:
    sentiment.append(sid_obj.polarity_scores(tweet))

In [103]:
sentiment_df = pd.DataFrame(sentiment)
sentiment_df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.722,0.278,0.4019
1,0.329,0.671,0.0,-0.9136
2,0.138,0.862,0.0,-0.7269
3,0.0,0.952,0.048,0.2023
4,0.139,0.696,0.165,-0.1531


In [104]:
merged_df = pd.concat([df_analysis, sentiment_df], axis=1)

In [105]:
full_df = pd.concat([df,merged_df],axis=1)

### Analysis - high level topics

In [106]:
## Add month feature to enable analysis over time
full_df['month'] = pd.DatetimeIndex(full_df['date']).month
full_df['day'] = pd.DatetimeIndex(full_df['date']).day

In [107]:
with option_context('display.max_colwidth', 600):
    display(full_df[(full_df['topic']==2)].sample(2))

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,username.1,tweet.1,tweet_compound,topic,neg,neu,pos,compound,month,day
88789,206952,1230603091352936449,1230603091352936449,2020-02-20 16:20:15 EST,2020-02-20,16:20:15,-500,240235447,pabloasensiouk,pure joy kuddos to revolut hijacked a monzo twitter thread and things got weird really fast fintech socialmedia,...,pabloasensiouk,pure joy kuddos to revolut hijacked a monzo twitter thread and things got weird really fast fintech socialmedia,pure joy kuddos to revolut hijacked a monzo twitter thread and things got weird really fast fintech socialmedia,2,0.079,0.744,0.177,0.4767,2,20
42653,113345,1286706949153849345,1286605753575497729,2020-07-24 12:56:57 EDT,2020-07-24,12:56:57,-500,1859656675,drashyagoel,it looked like a very commerciallybent decision to me addon products like crypto trading were moved closer to the home screen while your spending behaviour and savings were moved farther,...,drashyagoel,it looked like a very commerciallybent decision to me addon products like crypto trading were moved closer to the home screen while your spending behaviour and savings were moved farther,it looked like a very commerciallybent decision to me addon products like crypto trading were moved closer to the home screen while your spending behaviour and savings were moved farther,2,0.0,0.848,0.152,0.6124,7,24


In [108]:
full_df.groupby(['topic','month'])['compound'].mean()

topic  month
0      1        0.033938
       2        0.024787
       3        0.021817
       4        0.030276
       5        0.037097
       6       -0.016794
       7        0.014467
       8        0.011020
       9        0.032301
       10       0.043128
       11       0.023449
       12       0.039786
1      1        0.070382
       2        0.074837
       3        0.093229
       4        0.046080
       5        0.070670
       6        0.045055
       7        0.076895
       8        0.087954
       9        0.063721
       10       0.086634
       11       0.061015
       12       0.077712
2      1        0.117221
       2        0.185554
       3        0.162885
       4        0.131136
       5        0.118165
       6        0.082941
       7        0.170698
       8        0.104704
       9        0.143378
       10       0.135509
       11       0.061563
       12       0.164905
3      1       -0.043102
       2       -0.022411
       3        0.023103
       4    

In [109]:
full_df.groupby('topic')['topic'].count()

topic
0    14725
1    25697
2    22151
3    15667
4    18948
Name: topic, dtype: int64

In [110]:
full_df.shape

(97188, 34)

In [123]:
full_df.to_csv('check.csv')

In [111]:
## Export to csv for Tableau analysis
#full_df.to_csv('datatableau.csv')

In [None]:
charles94240716

### TFI-DF

In [112]:
# function to display top n terms associated with each topic

def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [113]:
# tuning vectorizer params
tf_idf = TfidfVectorizer(stop_words=stop,
                         tokenizer=word_tokenize,
                         min_df= 2,
                         max_df= 0.9)

# document-term matrix
doc_word2 = tf_idf.fit_transform(df_analysis.tweet_compound)
print(doc_word2.shape)



(97188, 14535)


In [114]:
n = 5 # number of topics

# model selection, fit/trans, and hyperparameter tuning
nmf_model_2 = NMF(n_components =n)

# doc-topic matrix
doc_topic2 = nmf_model_2.fit_transform(doc_word2)

# creating ids for each topic
topic_ids2 = ["topic"+str(val) for val in range(n)]

# topic-term matrix
topic_word2 = pd.DataFrame(nmf_model_2.components_.round(n),
             index = topic_ids2,
             columns = tf_idf.get_feature_names())

# prints top x words in each topic
display_topics(nmf_model_2, 
               tf_idf.get_feature_names(), 
               5) # number of top words/topic


Topic  0
account, months, weeks, days, funds

Topic  1
help, revolut, problem, log, anyone

Topic  2
app, log, phone, support, problem

Topic  3
money, revolut, send, transfer, months

Topic  4
chat, live, reply, days, support


### Fix split of data

In [177]:
sub_test6.to_csv('test6.csv')

In [178]:
df_intent = pd.read_csv('training_base.csv',
                       usecols = ['username','tweet','intent'])

def remove_space(text):
    text = text.strip()
    return text
    
df_intent['tweet'] = df_intent.tweet.map(remove_space)

In [181]:
merged_df['key'] = merged_df['username'] + merged_df['tweet']
d = merged_df[~merged_df['key'].isin(df_intent['username'] + df_intent['tweet'])].drop(['key'], axis=1)

In [185]:
tr1, test1 = train_test_split(d,train_size=200,random_state=10)

In [187]:
tr1.to_csv('training_7.csv')