In [154]:
import pandas as pd
from pandas import option_context
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF

from nltk.corpus import stopwords
from nltk.tokenize import MWETokenizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

## This is the main notebook for topic modeling and sentiment analysis

In [155]:
df = pd.read_csv('train.csv')
df.shape

(87271, 24)

In [156]:
x_train, x_val = train_test_split(df,test_size=.20, random_state=10)

In [157]:
df_analysis = x_train[['username','tweet']].copy()
df_analysis.reset_index(drop=True,inplace=True)

In [158]:
import nltk
from nltk.tokenize import word_tokenize

df_analysis['tokens'] = df_analysis.tweet.apply(nltk.word_tokenize)

### Topic modeling

In [104]:
stop = stopwords.words('english')
stop.extend(['ive', 'im', 'itd', 'youre', 'hi', 'hello', 'hey', 'eg', 'l', 'h','w','v','u',
             'please','get','dont','still'])

In [190]:
vectorizer = CountVectorizer(stop_words = stop)
doc_word = vectorizer.fit_transform(df_analysis.tweet)
doc_word.shape

(69816, 25982)

In [171]:
nmf_model = NMF(3)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(69816, 3)

In [172]:
topic_word = nmf_model.components_
topic_word.shape

(3, 25982)

In [173]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['account', 'blocked', 'locked', 'help', 'money', 'access'],
 ['app', 'cant', 'help', 'chat', 'access', 'phone'],
 ['revolut', 'money', 'card', 'bank', 'back', 'use']]

- Component 1 (topic 1) seems to be about account queries
- Component 2 (topic 2) seems to be about the app

However, important to note a lot of compound words are losing their meaning in this approach. Need to incorporate in pipeline

In [109]:
doc_topic

array([[2.05675047e-02, 1.49616741e-01, 0.00000000e+00, 1.38512983e-02],
       [0.00000000e+00, 1.63341998e-06, 3.11545913e-05, 2.49204712e-05],
       [6.98011575e-02, 1.60536991e-01, 1.33782827e-02, 2.03226822e-03],
       ...,
       [1.41150707e-03, 8.04250739e-02, 0.00000000e+00, 2.77519412e-03],
       [4.37085802e-03, 6.00244306e-03, 8.49458754e-03, 8.18883259e-03],
       [2.23377548e-03, 9.93072444e-02, 4.46307083e-05, 7.71175894e-03]])

In [110]:
doc_topic.argmax(axis=1)

array([1, 2, 1, ..., 1, 2, 1])

In [111]:
df_analysis['topic'] = doc_topic.argmax(axis=1)

In [112]:
df_analysis

Unnamed: 0,username,tweet,topic
0,tomaszr16,hello im waiting since days for support can...,1
1,andyb1900,how dare you,2
2,albedairy2,please help my account wont verify and i can...,1
3,maniuhija,prentitye month is now where are the rainbow ...,2
4,darrenc1,is there an issue with the app few of us tryi...,1
...,...,...,...
69811,deaconu_claudiu,hello my account is not working and nobody is...,0
69812,wikitail,is there any customer service available we...,1
69813,mrgl1tch,once again i have been lied too and told s...,1
69814,burteditch,oh and also that you werent allowed to ask me...,2


### Sentiment analysis

In [113]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for tweet in df_analysis.tweet:
    sentiment.append(sid_obj.polarity_scores(tweet))

In [114]:
sentiment_df = pd.DataFrame(sentiment)

In [115]:
sentiment_df

Unnamed: 0,neg,neu,pos,compound
0,0.000,0.893,0.107,0.5859
1,0.000,1.000,0.000,0.0000
2,0.092,0.821,0.087,-0.2960
3,0.000,1.000,0.000,0.0000
4,0.000,1.000,0.000,0.0000
...,...,...,...,...
69811,0.000,0.820,0.180,0.2960
69812,0.099,0.901,0.000,-0.2960
69813,0.067,0.933,0.000,-0.2023
69814,0.223,0.777,0.000,-0.8519


In [116]:
merged_df = pd.concat([df_analysis, sentiment_df], axis=1)

In [117]:
merged_df

Unnamed: 0,username,tweet,topic,neg,neu,pos,compound
0,tomaszr16,hello im waiting since days for support can...,1,0.000,0.893,0.107,0.5859
1,andyb1900,how dare you,2,0.000,1.000,0.000,0.0000
2,albedairy2,please help my account wont verify and i can...,1,0.092,0.821,0.087,-0.2960
3,maniuhija,prentitye month is now where are the rainbow ...,2,0.000,1.000,0.000,0.0000
4,darrenc1,is there an issue with the app few of us tryi...,1,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...,...
69811,deaconu_claudiu,hello my account is not working and nobody is...,0,0.000,0.820,0.180,0.2960
69812,wikitail,is there any customer service available we...,1,0.099,0.901,0.000,-0.2960
69813,mrgl1tch,once again i have been lied too and told s...,1,0.067,0.933,0.000,-0.2023
69814,burteditch,oh and also that you werent allowed to ask me...,2,0.223,0.777,0.000,-0.8519


In [118]:
topic2_df = merged_df[merged_df['topic'] == 2]

In [125]:
with option_context('display.max_colwidth', 600):
    display(topic2_df)

Unnamed: 0,username,tweet,topic,neg,neu,pos,compound
1,andyb1900,how dare you,2,0.000,1.000,0.000,0.0000
3,maniuhija,prentitye month is now where are the rainbow cards,2,0.000,1.000,0.000,0.0000
6,corpusjessa,join pm club manager to get new trader bonus try okex swap trade days buy crypto trading fee card deposit trading tasks invite frds join group get doge,2,0.000,0.716,0.284,0.8176
10,andr3ea_popa,hello yesterday l had my phone hacked by a person that claim to be from revolut they broke into my app and used my hsbc debit and tui credit card to get money in the revolut account then they saentity they need to verify my entityentity i have been robbed of help,2,0.090,0.773,0.137,0.4215
17,nikos_katsanos,the last weeks cannot top up my revolut using a mastercard with d secure your inapp support is really unhelpful is there a bug with your app,2,0.052,0.782,0.166,0.5435
...,...,...,...,...,...,...,...
69797,nikolasrichter1,after opening revolut on my phone a big error appearswe couldnt verify your entityentity but when i try to click on of the options to verify my age another error comes in with you must be yo to use our service how am i supposed to verify it,2,0.105,0.895,0.000,-0.6597
69799,bucuri,wtf really charging instead of why why i guess ill just wait days to see if there will be a refund but this card is not reliable at all after all i will just stop using it,2,0.173,0.827,0.000,-0.7269
69804,jasonpereira,feel free to jump in and make a pitch,2,0.000,0.708,0.292,0.5106
69807,z53758152,unlock this guys revolut account his revolut tag u locked his account without any reason and his waiting now more then one month this is unacceptable so please unlock his account other wise we will close all our accounts on revolut,2,0.064,0.807,0.129,0.4751


### With MWE tokenizer

In [174]:
mwe = MWETokenizer([('debit','card'),('business','account'),
                    ('savings','account'), ('bank','account'),
                    ('doge','coin'), ('challenger','bank'),
                    ('business','bank'),('virtual','card')])

In [185]:
def mwe_tweet(tweet):
    mwe_tok = mwe.tokenize(word_tokenize(tweet))
    
    return mwe_tok

df_analysis['mwe_token'] = df_analysis.tweet.map(mwe_tweet) 

In [187]:
df_analysis

Unnamed: 0,username,tweet,tokens,mwe_token
0,tomaszr16,hello im waiting since days for support can...,"[hello, im, waiting, since, days, for, support...","[hello, im, waiting, since, days, for, support..."
1,andyb1900,how dare you,"[how, dare, you]","[how, dare, you]"
2,albedairy2,please help my account wont verify and i can...,"[please, help, my, account, wont, verify, and,...","[please, help, my, account, wont, verify, and,..."
3,maniuhija,prentitye month is now where are the rainbow ...,"[prentitye, month, is, now, where, are, the, r...","[prentitye, month, is, now, where, are, the, r..."
4,darrenc1,is there an issue with the app few of us tryi...,"[is, there, an, issue, with, the, app, few, of...","[is, there, an, issue, with, the, app, few, of..."
...,...,...,...,...
69811,deaconu_claudiu,hello my account is not working and nobody is...,"[hello, my, account, is, not, working, and, no...","[hello, my, account, is, not, working, and, no..."
69812,wikitail,is there any customer service available we...,"[is, there, any, customer, service, available,...","[is, there, any, customer, service, available,..."
69813,mrgl1tch,once again i have been lied too and told s...,"[once, again, i, have, been, lied, too, and, t...","[once, again, i, have, been, lied, too, and, t..."
69814,burteditch,oh and also that you werent allowed to ask me...,"[oh, and, also, that, you, werent, allowed, to...","[oh, and, also, that, you, werent, allowed, to..."


In [192]:
vectorizer = CountVectorizer(stop_words = stop,lowercase=False)
doc_word = vectorizer.fit_transform(df_analysis.mwe_token)
doc_word.shape

TypeError: expected string or bytes-like object

In [171]:
nmf_model = NMF(3)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(69816, 3)

In [172]:
topic_word = nmf_model.components_
topic_word.shape

(3, 25982)

In [173]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['account', 'blocked', 'locked', 'help', 'money', 'access'],
 ['app', 'cant', 'help', 'chat', 'access', 'phone'],
 ['revolut', 'money', 'card', 'bank', 'back', 'use']]

- Component 1 (topic 1) seems to be about account queries
- Component 2 (topic 2) seems to be about the app

In [109]:
doc_topic

array([[2.05675047e-02, 1.49616741e-01, 0.00000000e+00, 1.38512983e-02],
       [0.00000000e+00, 1.63341998e-06, 3.11545913e-05, 2.49204712e-05],
       [6.98011575e-02, 1.60536991e-01, 1.33782827e-02, 2.03226822e-03],
       ...,
       [1.41150707e-03, 8.04250739e-02, 0.00000000e+00, 2.77519412e-03],
       [4.37085802e-03, 6.00244306e-03, 8.49458754e-03, 8.18883259e-03],
       [2.23377548e-03, 9.93072444e-02, 4.46307083e-05, 7.71175894e-03]])

In [110]:
doc_topic.argmax(axis=1)

array([1, 2, 1, ..., 1, 2, 1])

In [111]:
df_analysis['topic'] = doc_topic.argmax(axis=1)