In [1]:
import sys
import os

# Project root is the parent of the current working directory.
# NOTE(review): this assumes the notebook is launched from the 'notebooks'
# folder, so that its parent is the project root -- confirm.
project_root = os.path.dirname(os.getcwd())

# Path to the 'scripts' folder inside the project root
scripts_path = os.path.join(project_root, "scripts")

# Make both locations importable. The membership check keeps this cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
for extra_path in (project_root, scripts_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

import pandas as pd

In [2]:
from config import APP_IDS, BANK_NAMES, SCRAPING_CONFIG,DATA_PATHS
from scraper import scrape_bank_reviews
from preprocessing import preprocess_data
from utils import save_dataframe 

#### Call the scraping function and store the result in a DataFrame
* Use google-play-scraper to collect reviews, ratings, dates, and app names for three banks.
* Target a minimum of 400 reviews per bank (1,200 total).


##### Using the **scrape_bank_reviews** function, load the reviews for the three banks, targeting a minimum of 400 reviews for each

In [3]:
   
# Number of reviews requested per bank (the target for this task is 400 each,
# 1,200 in total). Named here so the target is easy to find and tune.
REVIEWS_PER_BANK = 400

# Scrape every app listed in APP_IDS; the result is kept as one DataFrame.
all_data_df = scrape_bank_reviews(APP_IDS, REVIEWS_PER_BANK)

Total Scraping Progress:   0%|          | 0/3 [00:00<?, ?it/s]


--- Starting scrape for CBE (ID: com.combanketh.mobilebanking) ---
Successfully scraped 400 reviews for CBE.


Total Scraping Progress:  33%|███▎      | 1/3 [00:06<00:13,  6.68s/it]


--- Starting scrape for BoAMobile (ID: com.boa.boaMobileBanking) ---
Successfully scraped 400 reviews for BoAMobile.


Total Scraping Progress:  67%|██████▋   | 2/3 [00:13<00:06,  6.61s/it]


--- Starting scrape for DashenBank (ID: com.dashen.dashensuperapp) ---
Successfully scraped 400 reviews for DashenBank.


Total Scraping Progress: 100%|██████████| 3/3 [00:19<00:00,  6.61s/it]


#### After loading the reviews, use `.head()` to check the data

In [4]:
# Preview the first five scraped rows (5 is also the pandas default for head()).
# NOTE(review): the rendered preview includes reviewer user names -- consider
# whether that output should be cleared or redacted before sharing.
all_data_df.head(n=5)

Unnamed: 0,bank_name,app_id,review_id,user_name,rating,content,date,reply_content,replied_at,thumbs_up
0,CBE,com.combanketh.mobilebanking,cb37b096-e071-4f0f-a8fd-067b7d71706d,Kamil Tesfaye,5,CBE ይለያል።,2025-11-29 17:22:32,,,0
1,CBE,com.combanketh.mobilebanking,70f504ff-daed-40d9-9c89-cc49a95ef659,Abde Semed,5,it's special for me,2025-11-29 15:54:14,,,0
2,CBE,com.combanketh.mobilebanking,28f229b5-0026-41b9-a1eb-b76e74736f63,TOMIZ Creativity,2,Make it user friendly.,2025-11-29 08:17:45,,,0
3,CBE,com.combanketh.mobilebanking,68d8daea-db47-4e23-a692-755173dea983,Tesfaye Abdi,3,maaliif daddafee install gaafata,2025-11-28 13:36:32,,,0
4,CBE,com.combanketh.mobilebanking,ee0dbb0e-4eb0-47b5-9874-c37877493f99,Betelhem Kebede,5,good app,2025-11-28 11:33:17,,,0


In [5]:
# Summarise column dtypes and non-null counts to spot missing data.
all_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   bank_name      1200 non-null   object
 1   app_id         1200 non-null   object
 2   review_id      1200 non-null   object
 3   user_name      1200 non-null   object
 4   rating         1200 non-null   int64 
 5   content        1200 non-null   object
 6   date           1200 non-null   object
 7   reply_content  2 non-null      object
 8   replied_at     2 non-null      object
 9   thumbs_up      1200 non-null   int64 
dtypes: int64(2), object(8)
memory usage: 93.9+ KB


#### Preprocessing:
* Remove duplicates, handle missing data.
* Normalize dates (e.g., to YYYY-MM-DD).
* Save as CSV with columns: review, rating, date, bank, source, in 'data/processed'.


In [6]:
# Run the project preprocessing step on the raw scraped reviews.
cleaned_data_df = preprocess_data(all_data_df)

# Fail fast if preprocessing returned nothing, so an empty dataset is never
# written to disk by a later cell.
# NOTE(review): the recorded log for this cell reads "Total rows cleaned: 0";
# confirm whether that counts rows removed or rows remaining.
assert len(cleaned_data_df) > 0, "preprocess_data returned no rows"


--- Starting Data Preprocessing ---
Removed duplicates. Rows remaining: 1200
Removed rows with missing rating. Rows remaining: 1200
Dates normalized to YYYY-MM-DD.
Preprocessing complete. Total rows cleaned: 0


#### Save the cleaned reviews to '../data/processed'

In [7]:
# Build the output path with os.path.join instead of string concatenation so a
# trailing separator in DATA_PATHS['processed'] cannot produce a doubled slash.
path_to_cleaned_data = os.path.join(DATA_PATHS['processed'], 'cleaned_data.csv')

# Write the cleaned reviews to that path via the project helper.
save_dataframe(cleaned_data_df, path_to_cleaned_data)

DataFrame saved successfully to: ../data/processed/cleaned_data.csv
