In [1]:
from google_play_scraper import reviews_all
import pandas as pd

apps = {
    "Commercial Bank of Ethiopia": "com.combanketh.mobilebanking",
	"Bank of Abyssinia": "com.boa.boaMobileBanking",
    "Dashen Bank": "com.dashen.dashensuperapp"
	}
	
all_reviews = []

for bank, app_id in apps.items():
	reviews = reviews_all(app_id, lang='en', count=1500)
	for r in reviews:
	        all_reviews.append({
            'review': r['content'],
	            'rating': r['score'],
            'date': r['at'].date(),
            'bank': bank,
            'source': 'Google Play'
     })
df = pd.DataFrame(all_reviews)
df.to_csv("../data/raw_reviews.csv", index=False)

print(df['bank'].value_counts())
total_rows = df.shape[0]
print(f"Total reviews collected: {len(df)}")


bank
Commercial Bank of Ethiopia    7509
Bank of Abyssinia              1045
Dashen Bank                     454
Name: count, dtype: int64
Total reviews collected: 9008


In [2]:
df = pd.read_csv("../data/raw_reviews.csv")

print(df['review'].isna().sum())

original_len = len(df)

# Drop rows where review is null or empty string
df['review'] = df['review'].astype(str).str.strip()
df = df[df['review'].notnull() & (df['review'] != '')]

removed_pct = (1 - len(df)/original_len) * 100
print(f"% Missing Data: {removed_pct:.2f}%")

# Drop duplicates
df.drop_duplicates(subset=['bank', 'review'], inplace=True)

# Normalize date
df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')

# Save clean file
df.to_csv("../data/cleaned_reviews.csv", index=False)

# Summary
print(f"Cleaned review count: {len(df)}")
print("\nMissing values:\n", df.isnull().sum())
print("\nBank count:\n", df['bank'].value_counts())

missing_ratio = (1 - len(df) / 8991) * 100
print(f"% Missing Data Removed: {missing_ratio:.2f}%")
print(df.head())

7
% Missing Data: 0.00%
Cleaned review count: 6832

Missing values:
 review    0
rating    0
date      0
bank      0
source    0
dtype: int64

Bank count:
 bank
Commercial Bank of Ethiopia    5531
Bank of Abyssinia               890
Dashen Bank                     411
Name: count, dtype: int64
% Missing Data Removed: 24.01%
                                              review  rating        date  \
0  . Reviewing content on Play is a great way to ...       5  2025-06-10   
1                         So bad now and hard to use       5  2025-06-09   
2  it is so amazing app. but, it is better to upd...       5  2025-06-09   
3                                         v.good app       4  2025-06-09   
4                                      very good app       1  2025-06-09   

                          bank       source  
0  Commercial Bank of Ethiopia  Google Play  
1  Commercial Bank of Ethiopia  Google Play  
2  Commercial Bank of Ethiopia  Google Play  
3  Commercial Bank of Ethiopia  G