Step 1: Import Required Packages

In [25]:
from google_play_scraper import reviews, Sort
from datetime import datetime
import pandas as pd
import re
from textblob import TextBlob
from langdetect import detect
from bertopic import BERTopic
import os
from transformers import pipeline
import time
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

🏦 Step 2: Get App Package Names

First, find the correct package name (app ID) for each bank's app on the Google Play Store. It is the value of the `id` parameter in the app's store URL.

📥 Step 3: Scrape Reviews for Each Bank

In [None]:

# Bank display name -> Google Play package name (app ID) of its mobile banking app
banks = {
    "Commercial Bank of Ethiopia": "com.combanketh.mobilebanking",
    "Bank of Abyssinia": "com.boa.boaMobileBanking",
    "Dashen Bank": "com.dashen.dashensuperapp",
}


# Function to scrape and extract relevant review data
def scrape_reviews(app_id, bank_name, num_reviews=400, batch_size=400):
    """Fetch up to ``num_reviews`` of the newest Google Play reviews for one app.

    Reviews are requested in English for the ``et`` store, newest first, in
    batches, until enough have been collected or the listing runs out.

    Parameters
    ----------
    app_id : str
        Google Play package name passed to ``reviews``.
    bank_name : str
        Label stored in the ``bank`` column of every returned row.
    num_reviews : int, default 400
        Maximum number of reviews to return.
    batch_size : int, default 400
        Upper bound on the number of reviews requested per call.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``review``, ``rating``, ``date``
        (formatted ``YYYY-MM-DD``), ``bank`` and ``source``. The columns are
        present even when no review was fetched.
    """
    columns = ['review', 'rating', 'date', 'bank', 'source']
    all_reviews = []
    token = None

    while len(all_reviews) < num_reviews:
        remaining = num_reviews - len(all_reviews)
        batch, token = reviews(
            app_id,
            lang='en',
            country='et',
            sort=Sort.NEWEST,
            # Ask only for what is still missing, capped at one batch.
            count=min(batch_size, remaining),
            continuation_token=token
        )
        all_reviews.extend(batch)

        # An empty batch means there is nothing more to fetch. Checking the
        # token alone is not enough to terminate the loop.
        # NOTE(review): google_play_scraper appears to return a wrapper token
        # object that stays truthy after the last page - confirm against the
        # installed version.
        if not batch or not token:
            break

        # Pause between successive requests only; no sleep after the last one.
        if len(all_reviews) < num_reviews:
            time.sleep(1)

    # Extract and format required fields
    data = [
        {
            'review': r['content'],
            'rating': r['score'],
            'date': r['at'].strftime('%Y-%m-%d'),
            'bank': bank_name,
            'source': 'Google Play'
        }
        for r in all_reviews[:num_reviews]
    ]

    # Passing the columns keeps the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)

# Scrape reviews for all banks (one DataFrame per bank)
all_dfs = [scrape_reviews(app_id, bank) for bank, app_id in banks.items()]

# Combine, then drop duplicates and handle missing values on the COMBINED frame.
# The cleaning must target final_df: it is the frame written to the CSV below.
final_df = (
    pd.concat(all_dfs, ignore_index=True)
    .drop_duplicates(subset=["review", "bank"])
    .dropna(subset=["review", "rating", "date"])
    .reset_index(drop=True)
)

# Save to CSV
os.makedirs("data", exist_ok=True)
final_df.to_csv("data/bank_reviews_cleaned.csv", index=False)

print("✅ Reviews saved to 'data/bank_reviews_cleaned.csv'")



✅ Reviews saved to 'data/bank_reviews_cleaned.csv'


Sentiment Analysis Script

In [19]:
# Download the VADER lexicon if it is not already installed.
# When the package is already present, nltk reports it as up-to-date.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\USED\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [20]:
# Load the cleaned reviews CSV written by the scraping step
# (columns: review, rating, date, bank, source).
df = pd.read_csv("data/bank_reviews_cleaned.csv")

In [22]:
# Initialize VADER: a single analyzer instance, created once and reused
# by classify_sentiment for every review.
sid = SentimentIntensityAnalyzer()

# Function to classify sentiment
def classify_sentiment(text):
    """Label ``text`` as 'Positive', 'Negative' or 'Neutral'.

    The input is converted with ``str()`` and scored by the module-level
    analyzer ``sid``; the label is chosen from the ``compound`` score:

    * ``compound >= 0.05``  -> 'Positive'
    * ``compound <= -0.05`` -> 'Negative'
    * anything in between   -> 'Neutral'
    """
    compound = sid.polarity_scores(str(text))['compound']

    # Guard clauses: check the two outer bands first, fall through to neutral.
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'