In [141]:
import pandas as pd
from pandas import option_context
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

from collections import Counter

from textblob import TextBlob

## This is the main notebook for topic modeling and sentiment analysis (version 2)

In [142]:
df = pd.read_csv('train.csv')
df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.shape

(99296, 24)

In [143]:
df_analysis = df[['username','tweet']].copy()
df_analysis.dropna(inplace=True)
df_analysis.reset_index(drop=True,inplace=True)

### Count words

In [233]:
stop = stopwords.words('english')
stop.extend(['hi', 'hey', 'hello','ha', 'followed','wa','dm','dont','cant','wont','get','still','like','need',
            'someone','people','im','ive','month','week','day','could','give','want','please','pls','since','one',
             'back','thanks','thank','take','doesnt','does', 'might', 'must','lock','access','sent','closed','use',
            'u','contact','new','email','time','message','tried','answer','waiting','issue','reason','going'])

In [234]:
# # Extend for most common English adverbs

stop.extend(['up','so','out','just','now','how','then','more','also','here',
            'well','only','very','even','back','there','down','still','in',
            'as','to','when','never','really','most','on','why','about','over',
            'again','where','right','off','always','today','all','far','long',
            'away','yet','often','ever','however','almost','later','much',
            'once','least','ago','together','around','already','enough','both',
            'maybe','actually','probably','home','of course','perhaps','little',
            'else','sometimes','finally','less','better','early','especially',
            'either','quite','simply','nearly','soon','certainly','quickly',
            'no','recently','before','usually','thus','exactly','hard',
            'particularly','pretty','forward','ok','okay','clearly','indeed',
            'rather','that','tonight','close','suddenly','best','instead',
            'ahead','fast','alone','eventually','directly'])

# # Extend for most common irregular verbs (except pay,lose,send,buy,spend)

stop.extend(['say','make','go','take','come','see','know','get','got','give',
            'find','think','tell','become','show','leave','feel','put','bring',
            'begin','keep','hold','write','stand','hear','let','mean','set','meet',
            'run','sit','speak','lie','lead','read','grow','fall',
            'build','understand','draw','break','cut','rise','drive','wear',
            'choose'])

# ## Extend for prepositions
stop.extend(['without','among'])

#### Compounds

In [146]:
create_compounds = (lambda x: 
                 x.replace('debit card', 'debitcard')
                 .replace('rainbow card', 'debitcard')
                 .replace('bank card', 'debitcard')
                 .replace('revolut card', 'debitcard')
                 .replace('revcards', 'debitcard')
                 .replace('credit card', 'creditcard')
                 .replace('junior card', 'juniorcard')
                 .replace('revolut junior', 'juniorcard')
                 .replace('business account', 'budinessaccount')
                 .replace('savings account', 'savingsaccount')
                 .replace('bank account', 'bankaccount')
                 .replace('premium account', 'premiumaccount')
                 .replace('premium user', 'premiumaccount')
                 .replace('premium plan', 'premiumaccount')
                 .replace('metal account', 'metalaccount')
                 .replace('metal card', 'metalaccount')
                 .replace('metal customers', 'metalaccount')
                 .replace('metal user', 'metalaccount')
                 .replace('business bank', 'businessbank')
                 .replace('virtual card', 'virtualcard')
                 .replace('revolut business', 'revolutbusiness')
                 .replace('google pay', 'googlepay')
                 .replace('apple pay', 'applepay')
                 .replace('applepayment', 'applepay')
                 .replace('samsung pay', 'samsungpay')
                 .replace('cryptocurrency', 'crypto')
                 .replace('cryptos', 'crypto')
                 .replace('cryptoasset', 'crypto')
                 .replace('doge coin', 'crypto')
                 .replace(' doge ', ' crypto ')
                 .replace('customer service', 'customerservice')
                 .replace('supoort', 'support')
                 .replace('customer support', 'customerservice')
                 .replace(' cs ', ' customerservice ')
                 .replace('phone number', 'phonenumber')
                 .replace('social media', 'socialmedia')
                 .replace('app chat', 'appchat')
                 .replace('challenger bank', 'fintech')
                 .replace('challengerbank', 'fintech')
                 .replace('neobanking', 'fintech')
                 .replace('neobanks', 'fintech')
                 .replace('neobank', 'fintech')
                 .replace('transferthis', 'transfer')
                 .replace('application', 'app')
                 .replace('locked', 'lock')
                 .replace('unlock', 'lock')
                 .replace('block', 'lock')
                 .replace('dark mode', 'darkmode')
                 .replace('wealth management', 'wealthmanagement'))


df_analysis['tweet_compound'] = df_analysis.tweet.map(create_compounds)

#### Count

In [147]:
counter = Counter()
lemma = nltk.stem.WordNetLemmatizer()

for tweet in df_analysis['tweet_compound']:
    tweet = lemma.lemmatize(tweet)
    tweet = TextBlob(tweet).words  # tokenize words
    tweet = [w for w in tweet if w not in stop]

    counter += Counter(tweet)

In [148]:
len(counter)

32771

In [149]:
for phrase, count in counter.most_common(10):
    print('%20s %i' % ("".join(phrase), count))

             account 29068
             revolut 17033
                 app 16213
               money 15544
                lock 13818
                help 12151
                chat 7750
                card 6317
              access 6215
             support 5496


### Topic modeling

In [235]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
    
vectorizer = CountVectorizer(tokenizer = LemmaTokenizer(),
                              stop_words = stop,
                              min_df=2,
                              max_df = 0.9)

doc_word = vectorizer.fit_transform(df_analysis.tweet_compound)
doc_word.shape



(99296, 13152)

In [236]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(99296, 5)

In [237]:
topic_word = nmf_model.components_
topic_word.shape

(5, 13152)

In [238]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['account', 'fund', 'document', 'open', 'information', 'customerservice'],
 ['app', 'phone', 'card', 'support', 'log', 'work'],
 ['revolut', 'bank', 'crypto', 'customer', 'fintech', 'child'],
 ['money', 'bank', 'transfer', 'send', 'card', 'pay'],
 ['help', 'chat', 'agent', 'live', 'support', 'card']]

- Component 0 (topic 1) seems to be about account queries
- Component 1 (topic 2) seems to be about app-related queries
- Component 2 (topic 3) seems to be about fintech innovations
- Component 3 (topic 4) is about transfers and not being able to access accounts / money
- Component 4 (topic 5) is about general requests for support

In [154]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [155]:
with option_context('display.max_colwidth', 600):
    display(df_analysis.sample(5))

Unnamed: 0,username,tweet,tweet_compound,topic
91573,stefanpaetow,well he left uber perhaps for a reason,well he left uber perhaps for a reason,3
8444,stevenbarr321,cant deposit any money onto my card,cant deposit any money onto my card,3
31543,katzreilly,ya the top up ita just floating around in cyber space,ya the top up ita just floating around in cyber space,2
16044,heyimwalshy,thats perfect thank you,thats perfect thank you,2
96027,dsteingruberch,just backed a payments startup that powers popular fintech apps like and innovation fintech platforms via,just backed a payments startup that powers popular fintech apps like and innovation fintech platforms via,2


### Sentiment analysis

In [156]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for tweet in df_analysis.tweet_compound:
    sentiment.append(sid_obj.polarity_scores(tweet))

In [157]:
sentiment_df = pd.DataFrame(sentiment)
sentiment_df.head()

Unnamed: 0,neg,neu,pos,compound
0,0.0,0.722,0.278,0.4019
1,0.329,0.671,0.0,-0.9136
2,0.138,0.862,0.0,-0.7269
3,0.0,0.952,0.048,0.2023
4,0.139,0.696,0.165,-0.1531


In [158]:
merged_df = pd.concat([df_analysis, sentiment_df], axis=1)

In [159]:
full_df = pd.concat([df,merged_df],axis=1)

### Analysis - high level topics

In [160]:
## Add month feature to enable analysis over time
full_df['month'] = pd.DatetimeIndex(full_df['date']).month
full_df['day'] = pd.DatetimeIndex(full_df['date']).day

In [161]:
with option_context('display.max_colwidth', 600):
    display(full_df[(full_df['topic']==2)].sample(2))

Unnamed: 0.1,Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,tweet,...,username.1,tweet.1,tweet_compound,topic,neg,neu,pos,compound,month,day
68188,68188,1259840548543827972,1259840548543827972,2020-05-11 09:39:28 EDT,2020-05-11,09:39:28,-500,2817365839,jarokrolewski,really cool for wallet management,...,jarokrolewski,really cool for wallet management,really cool for wallet management,2,0.0,0.607,0.393,0.3804,5,11
92024,92024,1229375718183915520,1229375718183915520,2020-02-17 07:03:06 EST,2020-02-17,07:03:06,-500,1412497622,clausematch,digital bank has more than tripled its valuation to bn bn after closing the funding round heres how this fast growing fintech ensures compliance,...,clausematch,digital bank has more than tripled its valuation to bn bn after closing the funding round heres how this fast growing fintech ensures compliance,digital bank has more than tripled its valuation to bn bn after closing the funding round heres how this fast growing fintech ensures compliance,2,0.0,0.931,0.069,0.1779,2,17


In [162]:
full_df.groupby(['topic','month'])['compound'].mean()

topic  month
0      1        0.031812
       2        0.021700
       3        0.021529
       4        0.043448
       5        0.041336
       6       -0.010325
       7        0.026898
       8        0.009379
       9        0.041506
       10       0.043509
       11       0.026092
       12       0.047148
1      1        0.066609
       2        0.091436
       3        0.102382
       4        0.057057
       5        0.083351
       6        0.056466
       7        0.087909
       8        0.094616
       9        0.087912
       10       0.104791
       11       0.076863
       12       0.082567
2      1        0.119911
       2        0.197166
       3        0.167375
       4        0.130164
       5        0.110703
       6        0.078551
       7        0.161052
       8        0.112354
       9        0.143191
       10       0.135144
       11       0.043564
       12       0.184340
3      1       -0.048408
       2       -0.012772
       3        0.021191
       4    

In [163]:
full_df.groupby('topic')['topic'].count()

topic
0    15340
1    22547
2    19388
3    14005
4    28016
Name: topic, dtype: int64

In [164]:
full_df.shape

(99296, 34)

In [165]:
## Export to csv for Tableau analysis
#full_df.to_csv('datatableau.csv')

### TFI-DF

In [166]:
# function to display top n terms associated with each topic

def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [167]:
# tuning vectorizer params
tf_idf = TfidfVectorizer(stop_words=stop,
                         tokenizer=word_tokenize,
                         min_df= 2,
                         max_df= 0.9)

# document-term matrix
doc_word2 = tf_idf.fit_transform(df_analysis.tweet_compound)
print(doc_word2.shape)



(99296, 14602)


In [168]:
n = 5 # number of topics

# model selection, fit/trans, and hyperparameter tuning
nmf_model_2 = NMF(n_components =n)

# doc-topic matrix
doc_topic2 = nmf_model_2.fit_transform(doc_word2)

# creating ids for each topic
topic_ids2 = ["topic"+str(val) for val in range(n)]

# topic-term matrix
topic_word2 = pd.DataFrame(nmf_model_2.components_.round(n),
             index = topic_ids2,
             columns = tf_idf.get_feature_names())

# prints top x words in each topic
display_topics(nmf_model_2, 
               tf_idf.get_feature_names(), 
               5) # number of top words/topic


Topic  0
account, lock, access, months, reason

Topic  1
help, account, problem, revolut, trying

Topic  2
app, access, contact, support, new

Topic  3
waiting, chat, agent, live, reply

Topic  4
money, revolut, bank, transfer, send


In [169]:
for tweet in merged_df['tweet_compound']:
    tweet.strip()

In [170]:
tr, test = train_test_split(merged_df,train_size = 0.05,random_state=10)

In [171]:
#tr.to_csv('data_training.csv')