# Task 1: Data Collection

### Web Scraping

In [8]:
import random
from pathlib import Path
from time import sleep

import pandas as pd
from google_play_scraper import Sort, reviews_all

# Banks to scrape: display name -> app id handed to the Google Play scraper.
# Iteration order of this mapping is the order in which the banks are scraped.
BANKS = {
    "Commercial Bank of Ethiopia": "com.combanketh.mobilebanking",
    "Bank of Abyssinia": "com.boa.boaMobileBanking",
    "Dashen Bank": "com.dashen.dashensuperapp",
}

def scrape_all_reviews(app_id, bank_name):
    """Scrape every available Google Play review for one app.

    Waits a random 1-3 seconds, calls ``reviews_all`` for ``app_id``
    (``lang='en'``, ``country='et'``, newest first) and reshapes the result
    into a tidy DataFrame.

    Args:
        app_id: App id passed straight to ``reviews_all``.
        bank_name: Label written to the ``bank`` column and used in the
            error message.

    Returns:
        A DataFrame with the columns ``review``, ``rating``, ``date``,
        ``bank`` and ``source``. It is empty (same columns) when the app has
        no reviews or when scraping fails; failures are printed, not raised,
        so one bad app does not abort a multi-bank run.
    """
    output_columns = ['review', 'rating', 'date', 'bank', 'source']

    try:
        sleep(random.uniform(1, 3))  # Random delay before hitting the store

        # Get ALL available reviews
        reviews = reviews_all(
            app_id,
            lang='en',
            country='et',
            sort=Sort.NEWEST,
            sleep_milliseconds=2000  # Slower = more reliable
        )

        if not reviews:
            # Zero reviews is a valid outcome, not a failure. Without this
            # guard the column selection below raises KeyError on the
            # column-less frame and is reported as a scraping error.
            return pd.DataFrame(columns=output_columns)

        # Keep only the fields of interest and rename them explicitly, so the
        # mapping does not depend on positional column order.
        df = pd.DataFrame(reviews)[['content', 'score', 'at']].rename(
            columns={'content': 'review', 'score': 'rating', 'at': 'date'}
        )
        df['bank'] = bank_name
        df['source'] = 'Google Play'  # Static: every row comes from Google Play

        return df

    except Exception as e:
        # Deliberate best-effort boundary: report the problem and hand back an
        # empty frame with the expected schema so the caller can still concat.
        print(f"⚠️ Error scraping {bank_name}: {e}")
        return pd.DataFrame(columns=output_columns)

# Scrape all banks: one DataFrame per bank, collected in BANKS order.
all_reviews = []
for bank_name, app_id in BANKS.items():
    print(f"Scraping {bank_name}...")
    bank_reviews = scrape_all_reviews(app_id, bank_name)
    print(f"→ Collected {len(bank_reviews)} reviews")
    all_reviews.append(bank_reviews)
    sleep(5)  # Pause between banks

# Combine the per-bank frames into one table with a fresh 0..n-1 index.
final_df = pd.concat(all_reviews, ignore_index=True)

# Create the output folder first: to_csv does not create missing directories,
# and failing here would throw away the whole (slow) scrape.
output_path = Path("Dataset/ethiopian_banks_mobileApp_reviews.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)

# Report the path that was actually written (the old message named a
# different file than the one saved).
print(f"\n Saved {len(final_df)} total reviews to '{output_path}'")

Scraping Commercial Bank of Ethiopia...
→ Collected 7496 reviews
Scraping Bank of Abyssinia...
→ Collected 1044 reviews
Scraping Dashen Bank...
→ Collected 448 reviews

 Saved 8988 total reviews to 'ethiopian_bank_reviews.csv'
