#Sentiment & Thematic Analysis

In [15]:
import pandas as pd
import re
from textblob import TextBlob
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load cleaned review data
df = pd.read_csv('../data/cleaned_reviews.csv')
df.head()


Unnamed: 0,review,rating,date,bank,source
0,. Reviewing content on Play is a great way to ...,5,2025-06-10,Commercial Bank of Ethiopia,Google Play
1,So bad now and hard to use,5,2025-06-09,Commercial Bank of Ethiopia,Google Play
2,"it is so amazing app. but, it is better to upd...",5,2025-06-09,Commercial Bank of Ethiopia,Google Play
3,v.good app,4,2025-06-09,Commercial Bank of Ethiopia,Google Play
4,very good app,1,2025-06-09,Commercial Bank of Ethiopia,Google Play


In [16]:
#Language Check (Amharic vs English)
def is_amharic(text):
    return bool(re.search(r'[\u1200-\u137F]', str(text)))

df['is_amharic'] = df['review'].apply(is_amharic)
print(df['is_amharic'].value_counts())

is_amharic
False    6484
True      348
Name: count, dtype: int64


In [20]:
# hybrid sentiment analysis
def hybrid_sentiment(text, rating=None):
    text = str(text)
    if is_amharic(text):
        text = text.replace('።', ' ').replace('፣', ' ')  # normalize punctuation
        if any(word in text for word in ['አልሰራም', 'ችግር', 'አይሰራም', 'አልተሳካም', 'ተቋርጧል']):
            return 'Negative'
        elif any(word in text for word in ['ጥሩ', 'ጎበዝ', 'አሪፍ', 'በጣም ጥሩ', 'አሪፍ ነው']):
            return 'Positive'
        else:
            return 'Neutral'
    # English handling    
    polarity = TextBlob(text).sentiment.polarity
    # Prioritize strong polarity over rating--challenge1
    if polarity > 0.4:
        return 'Positive'
    elif polarity < -0.4:
        return 'Negative'
    elif rating:
        try:
            rating = float(rating)
            if rating >= 4:
                return 'Positive'
            elif rating <= 2:
                return 'Negative'
        except:
            pass

    return 'Neutral'
#apply to dataFrame, sentiment label
df['sentiment'] = df.apply(lambda row: hybrid_sentiment(row['review'], row['rating']), axis=1)

df['sentiment_score'] = df.apply(
    lambda row: TextBlob(str(row['review'])).sentiment.polarity if not row['is_amharic'] else None,
    axis=1
)
#check
print(df[['review', 'rating', 'sentiment', 'sentiment_score']].head())


                                              review  rating sentiment  \
0  . Reviewing content on Play is a great way to ...       5  Positive   
1                         So bad now and hard to use       5  Negative   
2  it is so amazing app. but, it is better to upd...       5  Positive   
3                                         v.good app       4  Positive   
4                                      very good app       1  Positive   

   sentiment_score  
0         0.491667  
1        -0.495833  
2         0.300000  
3         0.000000  
4         0.910000  


In [21]:
#BERT Sentiment Classification
# Load pretrained BERT model for sentiment
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def get_bert_sentiment(text):
    text = str(text)[:512]  # BERT input limit
    result = sentiment_pipeline(text)[0]
    return result['label'], result['score']
#apply to all
bert_results = df['review'].apply(get_bert_sentiment)
df['bert_sentiment'], df['bert_score'] = zip(*bert_results)

print(df[['review', 'bert_sentiment', 'bert_score']].head())


Device set to use cpu


                                              review bert_sentiment  \
0  . Reviewing content on Play is a great way to ...       POSITIVE   
1                         So bad now and hard to use       NEGATIVE   
2  it is so amazing app. but, it is better to upd...       POSITIVE   
3                                         v.good app       POSITIVE   
4                                      very good app       POSITIVE   

   bert_score  
0    0.999597  
1    0.999806  
2    0.949643  
3    0.995270  
4    0.999868  


In [28]:
#Preprocessing for Thematic Analysis
def preprocess(text):
    text = str(text)  # Converts None score to empty string
    doc = nlp(text.lower())
    return " ".join([token.lemma_ for token in doc if not token.is_stop and token.is_alpha])

df['cleaned_review'] = df['review'].apply(preprocess)


In [30]:
#Keyword Extraction via TF-IDF
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=100)
tfidf_matrix = tfidf.fit_transform(df['cleaned_review'])
keywords = tfidf.get_feature_names_out()
print("Top TF-IDF Keywords:", keywords[:20])


Top TF-IDF Keywords: ['access' 'account' 'add' 'amazing' 'app' 'app good' 'app work'
 'application' 'ask' 'available' 'bad' 'bad app' 'balance' 'bank'
 'banking' 'banking app' 'birr' 'boa' 'branch' 'bug']


In [31]:
#Theme Assignment
themes = {
    "Transaction Issues": ["transaction", "failed", "deposit", "money", "transfer"],
    "Login/Access": ["login", "pin", "reset", "password"],
    "UI/UX": ["interface", "easy to use", "navigation", "layout"],
    "Performance": ["crash", "slow", "freeze", "lag"],
    "Feature Requests": ["add", "feature", "option", "support"]
}

def assign_theme(review):
    assigned = []
    for theme, kws in themes.items():
        if any(kw in review.lower() for kw in kws):
            assigned.append(theme)
    return assigned if assigned else ["Other"]

df['themes'] = df['cleaned_review'].apply(assign_theme)
print(df[['cleaned_review', 'themes']].head())



                                      cleaned_review   themes
0  review content play great way share helpful fe...  [Other]
1                                       bad hard use  [Other]
2  amazing app well update access internet fee op...  [Other]
3                                                app  [Other]
4                                           good app  [Other]


In [32]:
# Save final sentiment and theme labels
df[['review', 'sentiment', 'sentiment_score', 'bert_sentiment', 'bert_score', 'rating', 'bank', 'themes']].to_csv(
    "../outputs/sentiment_output.csv", index=False)

# Summary table by rating & bank
summary = df.groupby(['bank', 'rating'])[['sentiment_score', 'bert_score']].mean().reset_index()
summary.to_csv("../outputs/sentiment_summary.csv", index=False)
