# Sentiment & Thematic Analysis

In [1]:

import pandas as pd
from textblob import TextBlob

from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

# Load spaCy's small English pipeline ("en_core_web_sm"). The resulting `nlp`
# object is what preprocess() calls later in this notebook to tokenize reviews
# and read each token's lemma / stop-word / alphabetic flags.
nlp = spacy.load("en_core_web_sm")


In [2]:

# Read the cleaned reviews into a DataFrame, then preview the first five rows
df = pd.read_csv(filepath_or_buffer='../data/cleaned_reviews.csv')
df.head(n=5)

Unnamed: 0,review,rating,date,bank,source
0,really am happy to this app it is Siple to use...,5,2025-06-07,Commercial Bank of Ethiopia,Google Play
1,I liked this app. But the User interface is ve...,2,2025-06-07,Commercial Bank of Ethiopia,Google Play
2,"""Why don’t your ATMs support account-to-accoun...",4,2025-06-06,Commercial Bank of Ethiopia,Google Play
3,what is this app problem???,1,2025-06-05,Commercial Bank of Ethiopia,Google Play
4,the app is proactive and a good connections.,5,2025-06-05,Commercial Bank of Ethiopia,Google Play


In [6]:
# Standard-library regex module, used below to detect Amharic (Ethiopic) script
import re

# Function to detect Amharic text
def is_amharic(text):
    """Report whether `text` contains at least one Ethiopic-script character.

    The input is coerced with str(), so non-string values (None, numbers,
    NaN) are accepted and simply yield False.

    Returns:
        bool: True if any character falls in U+1200-U+137F, else False.
    """
    ethiopic_match = re.search(r'[\u1200-\u137F]', str(text))
    return ethiopic_match is not None

# Function to correct spelling (optional, slow)
def correct_spelling(text):
    """Return `text` with TextBlob's spelling correction applied (best effort).

    The input is coerced with str() first. If TextBlob raises while
    correcting, the coerced original text is returned unchanged so that a
    single bad row does not abort a DataFrame-wide apply().

    Args:
        text: Value to correct; any type accepted.

    Returns:
        str: The corrected text, or str(text) when correction fails.
    """
    original = str(text)
    try:
        return str(TextBlob(original).correct())
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running cell.
        return original

# Function to tag themes (expand as needed)
def tag_theme(text):
    """Assign a single coarse theme label to a review via keyword lookup.

    The text is coerced with str() and lower-cased, then checked against the
    rules below in order; the first rule with any keyword contained in the
    text wins. Matching is plain substring containment.

    Returns:
        str: 'Stability Issue', 'Performance', 'UI/UX', 'Navigation', or
        'Other' when no keyword is present.
    """
    lowered = str(text).lower()

    # Ordered (label, keywords) pairs; earlier entries take precedence.
    keyword_rules = (
        ('Stability Issue', ('crash', 'fail', 'ይቋረጣል')),
        ('Performance', ('fast', 'quick', 'በፍጥነት')),
        ('UI/UX', ('interface', 'design')),
        ('Navigation', ('ማንቀሳቀስ',)),
    )
    for label, needles in keyword_rules:
        if any(needle in lowered for needle in needles):
            return label
    return 'Other'

# Hybrid sentiment classification
def hybrid_sentiment(text, rating=None):
    """Classify a review as 'Positive', 'Negative' or 'Neutral'.

    Decision order:
      1. If is_amharic(text) is true, only a small Amharic keyword heuristic
         is used (negative keywords are checked before positive ones) and the
         rating is not consulted.
      2. Otherwise, a rating that converts to a float >= 4 gives 'Positive'
         and one <= 2 gives 'Negative'.
      3. Otherwise (rating missing, not convertible, NaN, or strictly between
         2 and 4) TextBlob polarity decides: > 0.1 'Positive', < -0.1
         'Negative', else 'Neutral'.

    Args:
        text: Review text; coerced with str().
        rating: Optional star rating (number or numeric string).

    Returns:
        str: One of 'Positive', 'Negative', 'Neutral'.
    """
    text = str(text)

    # Heuristic for Amharic: keyword containment only.
    if is_amharic(text):
        if any(word in text for word in ('አልሰራም', 'ችግር', 'አያምርም')):
            return 'Negative'
        if any(word in text for word in ('ጥሩ', 'በጣም ጥሩ')):
            return 'Positive'
        return 'Neutral'

    # Let a clear star rating decide before doing any text analysis.
    numeric_rating = None
    if rating is not None:
        try:
            numeric_rating = float(rating)
        except (TypeError, ValueError, OverflowError):
            # Unusable rating: fall back to text polarity below. Only
            # conversion errors are ignored, not every exception.
            numeric_rating = None

    if numeric_rating is not None:
        # NaN compares False on both sides and falls through to polarity.
        if numeric_rating >= 4:
            return 'Positive'
        if numeric_rating <= 2:
            return 'Negative'

    # Polarity is computed only when it is actually needed; TextBlob analysis
    # is wasted work for rows the rating has already classified.
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:
        return 'Positive'
    if polarity < -0.1:
        return 'Negative'
    return 'Neutral'

# Build the derived columns used by the rest of the notebook.
# Optional (slow): uncomment the next line to spell-correct the reviews first.
# df['review_cleaned'] = df['review'].apply(correct_spelling)
reviews = df['review']
df['is_amharic'] = reviews.apply(is_amharic)
df['sentiment_score'] = reviews.apply(lambda x: TextBlob(str(x)).sentiment.polarity)
df['theme'] = reviews.apply(tag_theme)

# Rating-aware label; row.get('rating') yields None if the column is absent
df['sentiment'] = df.apply(
    lambda row: hybrid_sentiment(row['review'], row.get('rating')),
    axis=1,
)

# Summary
print(df[['review', 'sentiment', 'sentiment_score', 'theme']].head(10))

print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())

print("\nTheme Distribution:")
print(df['theme'].value_counts())

print("\nSentiment Score Description:")
print(df['sentiment_score'].describe())


# Sample reviews: the first five rows of each sentiment class
for heading, label in (
    ("\n🔴 Negative Samples", 'Negative'),
    ("\n🟢 Positive Samples", 'Positive'),
    ("\n🟡 Neutral Samples", 'Neutral'),
):
    print(heading)
    print(df.loc[df['sentiment'] == label, ['review', 'sentiment_score']].head())


                                              review sentiment  \
0  really am happy to this app it is Siple to use...  Positive   
1  I liked this app. But the User interface is ve...  Negative   
2  "Why don’t your ATMs support account-to-accoun...  Positive   
3                        what is this app problem???  Negative   
4       the app is proactive and a good connections.  Positive   
5    I cannot send to cbebirr app. through this app.   Neutral   
6                                               good  Positive   
7                                     not functional  Negative   
8  everytime you uninstall the app you have to re...  Negative   
9  አካውንት የምናስገባበት ቦታ ስም ጽፈን ነው ከዚህ በፊት የላክንባቸውን አ...   Neutral   

   sentiment_score  theme  
0         0.800000  Other  
1         0.066667  UI/UX  
2        -0.125000  Other  
3         0.000000  Other  
4         0.700000  Other  
5         0.000000  Other  
6         0.700000  Other  
7         0.000000  Other  
8         0.234167  O

In [12]:
# Sanity check: number of rows whose review text is missing (NaN/None)
missing_reviews = df['review'].isna()
print(missing_reviews.sum())


0


In [8]:
from transformers import pipeline

# Build a Hugging Face "sentiment-analysis" pipeline around the checkpoint
# named below; get_bert_sentiment() calls this object once per review.
# NOTE(review): the checkpoint name says it is an English model, yet the
# dataset also contains Amharic reviews (see the `is_amharic` column) -
# confirm whether scores for those rows should be used.
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def get_bert_sentiment(text):
    """Classify one review with the notebook-level `sentiment_pipeline`.

    Args:
        text: Review text; any value is coerced with str().

    Returns:
        tuple: (label, score) taken from the pipeline's first result.
    """
    # Cheap pre-trim to 512 characters to bound the tokenization work.
    snippet = str(text)[:512]
    # A 512-character string can still exceed the model's token limit (each
    # punctuation mark is its own token, and special tokens are added), so ask
    # the tokenizer to truncate instead of letting the model raise.
    result = sentiment_pipeline(snippet, truncation=True)[0]
    return result['label'], result['score']

# Score every review, then split the (label, score) pairs into two columns
bert_results = df['review'].apply(get_bert_sentiment)
bert_labels, bert_scores = zip(*bert_results)
df['bert_sentiment'] = bert_labels
df['bert_score'] = bert_scores


Device set to use cpu


In [None]:

# `bert_score` is the pipeline's score for whichever label it predicted, so its
# plain mean cannot tell a confidently-positive group from a confidently-
# negative one. Add a signed variant (+score for 'POSITIVE', -score otherwise)
# that can be read alongside TextBlob's polarity. The two original columns are
# kept unchanged so existing consumers of the CSV are unaffected.
bert_signed_score = df['bert_score'].where(df['bert_sentiment'] == 'POSITIVE', -df['bert_score'])

sentiment_summary = (
    df.assign(bert_signed_score=bert_signed_score)
    .groupby(['bank', 'rating'])[['sentiment_score', 'bert_score', 'bert_signed_score']]
    .mean()
    .reset_index()
)
sentiment_summary.to_csv('../outputs/sentiment_summary.csv', index=False)


In [None]:

def preprocess(text):
    """Lower-case, lemmatize and filter a review using the notebook's `nlp`.

    Tokens flagged as stop words or as non-alphabetic are dropped; the lemmas
    of the remaining tokens are joined with single spaces.

    Args:
        text: Review text. Non-string values are coerced with str(); None and
            float NaN (pandas' missing-value marker) yield an empty string so
            that the literal words "none"/"nan" never enter the vocabulary.

    Returns:
        str: Space-separated lemmas, possibly empty.
    """
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    doc = nlp(str(text).lower())
    return " ".join(token.lemma_ for token in doc if not token.is_stop and token.is_alpha)

# Lemmatized, stop-word-free version of each review (input to TF-IDF below)
cleaned_reviews = df['review'].apply(preprocess)
df['cleaned_review'] = cleaned_reviews


In [None]:

# Fit TF-IDF over unigrams and bigrams of the cleaned reviews, keeping at most
# 100 features and applying the vectorizer's built-in English stop-word list
tfidf = TfidfVectorizer(
    max_features=100,
    ngram_range=(1, 2),
    stop_words='english',
)
tfidf_matrix = tfidf.fit_transform(df['cleaned_review'])

# Terms retained by the vectorizer; display the first 20
keywords = tfidf.get_feature_names_out()
keywords[:20]


In [None]:

# Theme name -> keywords/phrases whose presence (as a substring) assigns it.
themes = {
    "Transaction Issues": ["transaction", "failed", "deposit", "money", "transfer"],
    "Login/Access": ["login", "pin", "reset", "password"],
    "UI/UX": ["interface", "easy to use", "navigation", "layout"],
    "Performance": ["crash", "slow", "freeze", "lag"],
    "Feature Requests": ["add", "feature", "option", "support"]
}

def assign_theme(review):
    """Return every theme whose keyword list matches `review`.

    Matching is case-insensitive substring containment against the `themes`
    mapping, so a review can receive several themes; they are returned in the
    mapping's order.

    Args:
        review: Review text; non-string values are coerced with str() rather
            than raising on `.lower()`.

    Returns:
        list[str]: Matching theme names, or ["Other"] when none match.
    """
    # Lower-case once instead of once per keyword.
    lowered = str(review).lower()
    # The loop variable is named `theme_keywords` so it is not confused with
    # the notebook-level `keywords` array produced by the TF-IDF cell.
    assigned = [
        theme
        for theme, theme_keywords in themes.items()
        if any(keyword in lowered for keyword in theme_keywords)
    ]
    return assigned if assigned else ["Other"]

# Match theme keywords against BOTH the raw review and its lemmatized form.
# `cleaned_review` has stop words removed and tokens replaced by lemmas (see
# preprocess), so a phrase containing a stop word ("easy to use") or an
# inflected keyword ("failed") would presumably never be found there, while
# the raw text alone would miss inflections that only the lemma exposes.
# The newline separator prevents a multi-word keyword from matching across the
# boundary between the two texts.
theme_text = pd.Series(
    [f"{raw}\n{cleaned}" for raw, cleaned in zip(df['review'], df['cleaned_review'])],
    index=df.index,
    dtype=object,
)
df['themes'] = theme_text.apply(assign_theme)


In [None]:

# Export the per-review results. The expression is wrapped in parentheses so
# the chained .to_csv() call may sit on its own line; without them the
# indented continuation line is a syntax error (unexpected indent).
(
    df[['review', 'sentiment', 'sentiment_score', 'bert_sentiment', 'bert_score', 'rating', 'bank', 'themes']]
    .to_csv('../outputs/sentiment_output.csv', index=False)
)
