# Text Sentiment Analysis
In this notebook I look at customer reviews for the top banks in Kenya. Is it enough to use social media comments to make an informed decision about a given bank?
I will be using data from Twitter in this analysis.

In [16]:
# Get Access tokens
%reload_ext dotenv
%dotenv

import os
import tweepy
from tweepy import OAuthHandler
import pandas as pd
import numpy as np
import jsonpickle
import time
# get access
# keys are read from the environment; they are stored in a '.env' file
# that the %dotenv magic at the top of this cell loads
credential_names = ('CONSUMER_KEY', 'CONSUMER_SECRET', 'ACCESS_TOKEN', 'ACCESS_SECRET')

# os.getenv returns None for a variable that is not set; fail here with a
# clear message instead of building an auth handler from empty credentials
missing_credentials = [name for name in credential_names if not os.getenv(name)]
if missing_credentials:
    raise EnvironmentError(
        "Missing Twitter credentials in the environment/.env file: "
        + ", ".join(missing_credentials)
    )

consumer_key = os.getenv('CONSUMER_KEY')
consumer_secret= os.getenv('CONSUMER_SECRET')
access_token =  os.getenv('ACCESS_TOKEN')
access_secret =  os.getenv('ACCESS_SECRET')

# authorize access
oath = OAuthHandler(consumer_key,consumer_secret)
oath.set_access_token(access_token,access_secret)

# create an api
# enable wait when rate limit is reached
twitter_api = tweepy.API(oath,wait_on_rate_limit=True,wait_on_rate_limit_notify=True)


In [17]:
# preview the text of the first 10 statuses on the home timeline
timeline_cursor = tweepy.Cursor(twitter_api.home_timeline)
for status in timeline_cursor.items(10):
    print(status.text)


Dog food startup Sundays launches its air-dried kibble alternative https://t.co/p6FF6r0fK5 by @anthonyha
Just saw a comment about how Sarah Cooper has to jump to TC/Netflix to capitalize on her fame. It made me wonder ar… https://t.co/Roq8kOkZtw
19:57 @NMS_Kenya @MikeSonko  I once got involved in an accident in Mombasa road around airtel 7am but be…  via… https://t.co/8rXazoW3dZ
Ugandan MPs Revive Move to Extend Parliamentary Term to 7 Years https://t.co/gkisSRGfxv #Uganda https://t.co/tgcKw7CrBg
#DataLiteracy 2020 -- It is not a Math skill. It is a Life skill.
https://t.co/VJA9jCjhzY #RealBIEvent https://t.co/WyxECTk8jk
Biden Makes History By Naming Kamala Harris As Vice President Choice https://t.co/cWhdDqL0O2
RT @BiancaSparacino: What podcast episode do you need to hear the most this week? ✨
https://t.co/LXYCVmWxHA #RealBIEvent
Driving may not require AGI, but getting out of the car does. https://t.co/h4kxGFGDGH
Malawi Ex-Spy Chief to Be Slapped With More Criminal Charges - State: h

In [18]:
# test handler
def rate_limit_handler(cursor):
    """
    Iterate over a tweepy cursor iterator, pausing when the rate limit is hit.

    Args:
        cursor: an iterator exposing a ``next()`` method, e.g. the object
            returned by ``tweepy.Cursor(...).items()``.

    Yields:
        Each value returned by ``cursor.next()``, in order.
    """
    while True:
        try:
            # call next(): yielding the bound method itself would never
            # advance the cursor and would loop forever
            item = cursor.next()
        except StopIteration:
            # cursor exhausted: end the generator cleanly; a StopIteration
            # escaping a generator body is turned into a RuntimeError
            return
        except tweepy.RateLimitError:
            # wait 15 minutes, then retry the same fetch
            time.sleep(15*60)
            continue
        yield item



test_file = "test_tweets.txt"

# start_date is otherwise only assigned in a later cell; define it here too so
# this cell also runs on a fresh kernel (Restart & Run All)
start_date = "2020-01-01"

# small smoke test: fetch 10 tweets matching "Kenya" and store one JSON
# document per line
simplified_result = tweepy.Cursor(twitter_api.search,q="Kenya",since=start_date).items(10)
with open(test_file,'w') as f:
    for tweet in simplified_result:
        f.write(jsonpickle.encode(tweet._json,unpicklable=False)+'\n')
# no explicit close needed: the with-block closes the file


In [19]:

# define file to save tweets
filename = "tweets.txt"

# create list of top banks
banks_list = ["KCB Bank","Equity Bank","Cooperative Bank","Family Bank","Faulu Bank",
              "DTB Bank","Barclays Bank","Standard Chartered Bank","National Bank","NCBA Bank"]

# define start date
start_date = "2020-01-01"

# counter for search results
total_results = 0

# extract data for the selected banks
def search_and_extract_data_from_bank_tweets(search_list):
    """
    Search Twitter for every term in ``search_list`` and save the results.

    Each matching tweet is written to ``filename`` as one JSON document per
    line. The file is opened in write mode, so any previous content is
    replaced. Searches use the module-level ``twitter_api`` and ``start_date``.

    Args:
        search_list: iterable of search strings (here: bank names).

    Returns:
        int: the number of tweets written to the file.
    """
    # local counter; it is returned so the caller can report the total
    downloaded = 0

    # open a file to write data
    with open(filename,'w') as f:

        # search tweets for every bank in the list and append each result
        # to the file as a JSON line
        for query in search_list:
            for tweet in tweepy.Cursor(twitter_api.search,q=query,since=start_date).items():
                f.write(jsonpickle.encode(tweet._json,unpicklable=False)+'\n')
                downloaded += 1

    return downloaded

# apply function on banks list and keep the returned count
total_results = search_and_extract_data_from_bank_tweets(banks_list)

# print total tweets downloaded
print("Total Tweets downloaded:",total_results)

Rate limit reached. Sleeping for: 480


TweepError: Failed to send request: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

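##### Note
Running the code above fails with a connection error (`Connection reset by peer`) after the rate limit is reached, so the download does not complete.
We need to optimize the search function so that we get as many tweets as possible.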

In [20]:
# load the saved tweets back from disk
import json

# the file holds one JSON document per line; parse each line into a dict
with open(filename,'r') as tweets_file:
    json_data = [json.loads(raw_line) for raw_line in tweets_file]

# build a dataframe with one row per tweet and preview it
bank_comments_df = pd.DataFrame(json_data)
bank_comments_df.head()

Unnamed: 0,created_at,id,id_str,text,truncated,entities,metadata,source,in_reply_to_status_id,in_reply_to_status_id_str,...,retweet_count,favorite_count,favorited,retweeted,lang,possibly_sensitive,quoted_status_id,quoted_status_id_str,quoted_status,extended_entities
0,Wed Aug 12 16:53:24 +0000 2020,1293591425033084928,1293591425033084928,"RT @KCBGroup: In response to the pandemic, KCB...",False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,5,0,False,False,en,,,,,
1,Wed Aug 12 16:39:24 +0000 2020,1293587903398981632,1293587903398981632,RT @KCBGroup: When you bank with us it’s alway...,False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,38,0,False,False,en,,,,,
2,Wed Aug 12 16:03:41 +0000 2020,1293578914854576128,1293578914854576128,"RT @KCBGroup: In response to the pandemic, KCB...",False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,5,0,False,False,en,,,,,
3,Wed Aug 12 16:00:44 +0000 2020,1293578170533392385,1293578170533392385,"RT @KCBGroup: In response to the pandemic, KCB...",False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,5,0,False,False,en,,,,,
4,Wed Aug 12 16:00:27 +0000 2020,1293578102971531269,1293578102971531269,"RT @KCBGroup: In response to the pandemic, KCB...",False,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'en', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,,...,5,0,False,False,en,,,,,


In [21]:
# add bank names column
# (text to look for, label to assign). Order matters: when a tweet matches
# several entries, the LAST matching entry wins.
BANK_NAME_PATTERNS = [
    ('Equity Bank', 'Equity Bank'),
    ('KCB', 'KCB Bank'),
    ('Cooperative Bank', 'Cooperative Bank'),
    ('Family Bank', 'Family Bank'),
    ('Faulu Bank', 'Faulu Bank'),
    ('DTB Bank', 'DTB Bank'),
    ('Barclays Bank', 'Barclays Bank'),
    ('Standard Chartered Bank', 'Standard Chartered'),
    ('NCBA Bank', 'NCBA Bank'),
    ('National Bank', 'National Bank'),
]


def label_bank_names(texts, patterns=BANK_NAME_PATTERNS):
    """
    Label each text with the bank it mentions.

    Args:
        texts: pandas Series of tweet texts.
        patterns: sequence of ``(search_text, label)`` pairs, applied in
            order; a later match overrides an earlier one.

    Returns:
        numpy object array, aligned with ``texts``, holding the label of the
        last matching pattern or ``''`` when no pattern matches.
    """
    labels = np.full(len(texts), '', dtype=object)
    for search_text, label in patterns:
        # literal, case-insensitive match; na=False keeps missing text
        # from being counted as a match
        mentions = texts.str.contains(search_text, case=False, regex=False, na=False)
        labels = np.where(mentions, label, labels)
    return labels


bank_comments_df['bank_name'] = label_bank_names(bank_comments_df['text'])


In [22]:
# count tweets per bank label ('' = no bank matched) and show the result as a table
tweets_by_bank = bank_comments_df.groupby('bank_name')
tweet_counts = tweets_by_bank['id'].count()
tweet_counts.reset_index()

Unnamed: 0,bank_name,id
0,,2931
1,Barclays Bank,1
2,Cooperative Bank,191
3,Equity Bank,530
4,Family Bank,6
5,KCB Bank,264
6,National Bank,2
7,Standard Chartered,2


- Equity Bank has the highest number of tweets, followed by KCB Bank
- We have a lot of comments that are not tied to any of our top banks
- We should also optimize the search criteria to weed out some of these tweets

In [33]:
# Select the tweets of a single bank for a closer look.
# NOTE(review): the variable names say "equity", but the filter below selects
# the rows labelled 'KCB Bank' - confirm which bank is intended.
is_selected_bank = bank_comments_df['bank_name']=='KCB Bank'
equity_bank_comment_df = bank_comments_df[is_selected_bank]

# distinct tweet texts for the selected bank
equity_bank_comments = list(equity_bank_comment_df['text'].unique())

In [34]:
# Using spacy we can tokenize tweet comments
import spacy
from spacy.lang.en import English
import re

# get stop words
stop_words = spacy.lang.en.stop_words.STOP_WORDS
spacy_nlp = English()

# patterns for tokens that are a twitter username or a link
username_pattern = re.compile('^@[A-Za-z0-9_]{1,15}$')
url_pattern = re.compile(r'^https?:\/\/.*[\r\n]*')

# tokenize every tweet and keep only the tokens that are not stop words
# (one list of tokens per tweet)
results = [
    [token for token in spacy_nlp(comment) if not token.is_stop]
    for comment in equity_bank_comments
]

# flatten into a single list of lower-cased token texts,
# skipping usernames and links
cleaned_results = [
    token.text.lower()
    for tweet_tokens in results
    for token in tweet_tokens
    if not (username_pattern.match(token.text) or url_pattern.match(token.text))
]
print(cleaned_results[100:180])

['kcb', 'bank', '...', 'soo', 'tiresome', 'rt', ':', 'caroline', 'rabar', "okong'o", 'director', 'kcb', 'bank', '.', '\n', '20', 'years', 'experience', 'finance', ',', 'risk', ',', 'audi', '…', 'kcb', 'bank', 'caroline', 'rabar', "okong'o", 'director', 'kcb', 'bank', '.', '\n', '20', 'years', 'experience', 'finance', ',', 'risk', ',', '…', 'njeri', 'onyango', 'director', 'kcb', 'bank', '.', '\n', 'advocate', 'high', 'court', 'kenya', ',', '30', 'years', 'o', '…', 'awesome', ',', 'kcb', 'post', 'bank', 'look', 'industries', '.', 'equity', ',', 'kcb', 'co', '-', 'op', 'bank', 'paying', 'executives', 'need', '2nd', 'bank', ',', 'kcb']


In [35]:
# # lemmatize results
# for word in cleaned_results[210:220]:
#     print(word,word.lemma_)


In [40]:
# create word frequency
from collections import Counter
from string import punctuation

# characters that make a token pure punctuation (ASCII punctuation + ellipsis)
punctuation_chars = set(punctuation) | {'…'}
# whole tokens to ignore: the retweet marker
noise_tokens = {'rt'}


def is_noise_token(word):
    """
    Return True when ``word`` should be left out of the frequency table.

    A token is noise when it is the retweet marker, is empty, or consists
    only of punctuation and/or whitespace characters ('.', '...', '\\n', ' ').
    Tokens are compared as whole words, so single letters such as 'r' or 't'
    are not dropped just because they occur inside a noise string.
    """
    if word.lower() in noise_tokens:
        return True
    return all(ch in punctuation_chars or ch.isspace() for ch in word)


# count every remaining token; kept as a plain dict of word -> count
word_frequencies = dict(Counter(word for word in cleaned_results if not is_noise_token(word)))

# table of words ordered from most to least frequent
word_freq_df = pd.DataFrame(list(word_frequencies.items()),columns = ['word','freq'])
word_freq_df = word_freq_df.sort_values(by='freq',ascending=False).reset_index(drop=True)
word_freq_df.head(20)

Unnamed: 0,word,freq
0,kcb,93
1,bank,84
2,tanzania,20
3,kenya,18
4,limited,16
5,na,15
6,job,15
7,opportunity,13
8,banking,11
9,retail,9


### Looking at the word frequency, we do not see any major outliers

In [32]:
# create sentence tokenizer
# Register the sentencizer only when the pipeline does not have one yet, so
# the cell can be re-run. This explicit check replaces a bare try/except that
# silently hid every error, including genuine ones.
if 'sentencizer' not in spacy_nlp.pipe_names:
    sbd = spacy_nlp.create_pipe('sentencizer')
    # add to pipeline
    spacy_nlp.add_pipe(sbd)

# split every tweet into sentences and collect them in one flat list
sentence_list = []
for tweet in equity_bank_comments:
    tweet_doc = spacy_nlp(tweet)
    sentence_list.extend(tweet_doc.sents)

# print sentence list
print(sentence_list[10:20])

[Very timely @Victoria…, La BCDC achetée par Equity Bank en RDC., https://t.co/oF9Fn4F9jo, RT @BayaCiamala: Equity Bank devient la 2e banque de la RDC après la finalisation de l’acquisition de 66,5% de BCDC par Equity Group Holdin…, RT @Mollel297021: @MpondaSabinus Equity Bank, @MpondaSabinus Equity Bank, @nbstv Hello nbs please help people around equity bank they realize sewage  every time its raining please help, @nwscug @NWSCMD Please help with wandegeya people around equity bank who realize sewage every time its raining please help up, Equity bank ya Kenya imepokea umiliki wa benki ya Banque Commerciale du Congo (BCDC) ya DRC baada ya kununua asilim… https://t.co/1TOUAszLqA, @MpondaSabinus EQUITY BANK 😎]
