In [7]:
import csv
from datetime import datetime
from google_play_scraper import Sort, reviews
import os
import warnings
import sys
import pandas as pd
from tqdm import tqdm
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Set working directory.
# NOTE(review): this is an absolute, machine-specific Windows path; os.chdir
# raises if the directory does not exist, so the cell only runs on a machine
# with this exact layout.
os.chdir(r'D:\10academy\week2_update\Bank-reviews-analysis')
# Add the (new) working directory to the module search path.
# Presumably so project-local modules can be imported — confirm against later cells.
sys.path.append(os.getcwd())

# Suppress warnings.
# NOTE(review): with no category/module filter this silences every warning
# raised after this point, not just a specific noisy one.
warnings.filterwarnings("ignore")

# Define data directory (relative, so it resolves against the working
# directory set above). Raw and cleaned CSV files are written here.
DATA_DIR = 'notebooks/data'
os.makedirs(DATA_DIR, exist_ok=True) # Ensure data directory exists; no error if it already does

# --- Preprocessing Functions (Temporary in Notebook) ---
def drop_duplicates_and_nan(df):
    """
    Removes rows with missing key fields, then removes duplicate reviews.

    Rows lacking 'review_text', 'rating' or 'date' are dropped first, and only
    then are rows with a repeated 'review_text' removed (the first occurrence
    is kept). Doing it in this order guarantees that an incomplete row is
    never the copy retained for a duplicated review, which would otherwise
    cause the review to disappear entirely, and that rows with a missing
    'review_text' are reported as missing data rather than as duplicates.

    Args:
        df: DataFrame containing 'review_text', 'rating' and 'date' columns.
            Other columns are carried through untouched.

    Returns:
        A new DataFrame with the filtered rows; original index labels are
        preserved. An empty input is returned as-is.
    """
    if df.empty:
        return df

    original_rows = len(df)

    # Drop only if key columns are missing; other columns may contain NaN.
    complete = df.dropna(subset=['review_text', 'rating', 'date'])
    nan_removed = original_rows - len(complete)

    # De-duplicate among complete rows only, keeping the first occurrence.
    deduplicated = complete.drop_duplicates(subset=['review_text'])
    duplicates_removed = len(complete) - len(deduplicated)

    print(f"  - Removed {duplicates_removed} duplicates.")
    print(f"  - Removed {nan_removed} rows with missing data.")
    return deduplicated

def normalize_dates(df):
    """
    Rewrites the 'date' column as 'YYYY-MM-DD' strings.

    The column is parsed with pandas and formatted back to text, and the
    result is assigned onto the DataFrame that was passed in. A frame without
    a 'date' column, or whose 'date' column has no rows, is returned untouched.

    Args:
        df: DataFrame that may contain a 'date' column.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Guard clause: nothing to normalize when the column is absent or empty.
    has_dates = 'date' in df.columns and not df['date'].empty
    if not has_dates:
        return df

    parsed = pd.to_datetime(df['date'])
    df['date'] = parsed.dt.strftime('%Y-%m-%d')
    return df

def full_preprocessing(df):
    """
    Runs the tabular cleaning steps in sequence and returns the result.

    The frame is first passed through drop_duplicates_and_nan, and the output
    of that step is then passed through normalize_dates.

    Args:
        df: Raw reviews DataFrame.

    Returns:
        The DataFrame produced by the final step.
    """
    filtered = drop_duplicates_and_nan(df)
    return normalize_dates(filtered)

# --- Text Preprocessing Functions (Temporary in Notebook) ---
# Load the small English spaCy pipeline once at module level; preprocess_text
# reads it through the global name `nlp`.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    # spacy.load failed with OSError — handled here as "model not installed".
    # Download the model and retry the load once. If the download or the
    # second load fails, the exception propagates and `nlp` stays undefined.
    print("Downloading spaCy model 'en_core_web_sm'. This will only happen once.")
    from spacy.cli import download
    download('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """
    Reduces a review to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline `nlp`. A token is
    kept only if it is purely alphabetic and its lower-cased text is not in
    spaCy's English STOP_WORDS; kept tokens contribute their lemma.

    Args:
        text: The raw review text.

    Returns:
        The kept lemmas joined by single spaces, or "" when `text` is not a
        string or no token survives the filters.
    """
    if not isinstance(text, str):
        return ""

    lemmas = []
    for tok in nlp(text):
        # Skip anything containing digits, punctuation, emoji, etc.
        if not tok.is_alpha:
            continue
        # Skip stop words (compared case-insensitively on the surface form).
        if tok.text.lower() in STOP_WORDS:
            continue
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

# --- Scraping Function ---
def scrape_play_store_reviews(app_id, bank_name, count=1000):
    """
    Fetches the newest Google Play reviews for one app and saves them as CSV.

    Reviews are requested with lang='en', country='us', newest first. The
    result is reshaped to the columns 'review_text', 'rating', 'date',
    'bank_name' and 'source', then written to a timestamped file under
    DATA_DIR.

    Args:
        app_id: Play Store package id passed to the scraper.
        bank_name: Label stored in the 'bank_name' column and used in the
            output filename and log messages.
        count: Number of reviews to request.

    Returns:
        (filename, DataFrame) on success. If the scraper raises, or returns
        no rows, the result is (None, <empty DataFrame>) and nothing is saved.
    """
    print(f"🔄 Scraping {count} reviews for {bank_name}...")
    try:
        fetched, _ = reviews(app_id, lang='en', country='us', sort=Sort.NEWEST, count=count)
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on with the next app.
        print(f"Error scraping reviews for {bank_name}: {e}")
        return None, pd.DataFrame()

    frame = pd.DataFrame(fetched)
    if frame.empty:
        print(f"No reviews found for {bank_name}.")
        return None, frame

    # Tag each row with where it came from.
    frame['bank_name'] = bank_name
    frame['source'] = 'Google Play'

    # Map the scraper's field names onto the project's column names; the
    # timestamp field is only mapped when it is actually present.
    renames = {'content': 'review_text', 'score': 'rating'}
    if 'at' in frame.columns:
        renames['at'] = 'date'
    frame = frame.rename(columns=renames)

    if 'date' not in frame.columns:
        frame['date'] = None
        print(f"⚠️ Warning: 'date' column could not be created for {bank_name} due to missing 'at' data.")
    else:
        # Unparseable values become NaT instead of raising.
        frame['date'] = pd.to_datetime(frame['date'], errors='coerce')

    wanted = ['review_text', 'rating', 'date', 'bank_name', 'source']
    frame = frame[wanted]

    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = os.path.join(DATA_DIR, f'{bank_name}_reviews_raw_{stamp}.csv')
    frame.to_csv(filename, index=False, encoding='utf-8')

    print(f"✅ Saved {len(frame)} raw reviews to {filename}")
    return filename, frame
    
# --- Processing and Saving Function (With Fix) ---
def process_and_save(raw_df, bank_name):
    """
    Cleans a raw reviews DataFrame, adds processed text, and saves it as CSV.

    The frame is passed through full_preprocessing, a 'processed_review'
    column is built by applying preprocess_text to 'review_text', and the
    result is written to DATA_DIR as 'cleaned_<bank_name>_reviews.csv'.

    The 'processed_review' column is always created, even when preprocessing
    filters out every row, so the saved file and the returned frame have the
    same columns as the frame returned on the skip path.

    Args:
        raw_df: Raw reviews DataFrame, or None.
        bank_name: Label used in the output filename and log messages.

    Returns:
        (cleaned_filename, cleaned DataFrame). When raw_df is None or empty,
        nothing is written and the result is (None, <empty DataFrame with the
        expected columns>).
    """
    output_columns = ['review_text', 'rating', 'date', 'bank_name', 'source', 'processed_review']

    if raw_df is None or raw_df.empty:
        print(f"⚠️ Skipping preprocessing for {bank_name} due to empty or malformed data.")
        # Return an empty DataFrame with the correct columns to prevent downstream errors
        return None, pd.DataFrame(columns=output_columns)

    print(f"🔄 Preprocessing reviews for {bank_name}...")

    # Work on an explicit copy so adding a column below can never write into
    # a view of another DataFrame.
    cleaned_df = full_preprocessing(raw_df).copy()

    # Applied unconditionally: on a frame with no rows this simply yields an
    # empty column, keeping the output schema stable.
    cleaned_df['processed_review'] = cleaned_df['review_text'].apply(preprocess_text)

    cleaned_filename = os.path.join(DATA_DIR, f'cleaned_{bank_name}_reviews.csv')
    # Explicit UTF-8, matching how the raw CSV is written (reviews contain
    # non-ASCII text such as Amharic and emoji).
    cleaned_df.to_csv(cleaned_filename, index=False, encoding='utf-8')

    print(f"✅ Cleaned and preprocessed data saved for {bank_name}: {len(cleaned_df)} records")
    return cleaned_filename, cleaned_df

# --- Display Function ---
def display_sample_data(df, title):
    """
    Prints a titled preview of a DataFrame.

    Output is a header line naming `title`, the result of df.head(), and
    trailing blank lines as a visual separator.

    Args:
        df: DataFrame to preview.
        title: Text inserted into the header line.
    """
    header = f"\nSample data from {title}:"
    print(header)
    print(df.head())
    # Two newline characters plus print's own line ending.
    print('\n\n')

# --- Main Execution Block (With Fix) ---
# Driver: scrape, clean and save reviews for each bank, then write one
# combined CSV. Names assigned here (banks, all_processed_data, final_df)
# are module-level, so in a notebook they remain available after the cell runs.
if __name__ == "__main__":
    # Display label -> Google Play package id.
    banks = {
        "CBE": "com.combanketh.mobilebanking",
        "BOA": "com.boa.boaMobileBanking",
        "Dashen": "com.dashen.dashensuperapp",
        "Awash": "com.sc.awashpay", # NOTE(review): original comment says this was reverted to the correct ID — confirm
        "Zemen": "com.ZemenBank.MobileApp"
    }
    
    # Cleaned per-bank frames, collected for the final concatenation.
    all_processed_data = []

    for bank_name, app_id in tqdm(banks.items(), desc="Processing Banks"):
        # raw_df is always a DataFrame (possibly empty), even when scraping fails.
        raw_filename, raw_df = scrape_play_store_reviews(app_id, bank_name, count=1000)
        display_sample_data(raw_df, f"Raw Data for {bank_name}")
        
        cleaned_filename, cleaned_df = process_and_save(raw_df, bank_name)
        
        # Only banks that produced at least one cleaned row are shown and combined.
        if cleaned_df is not None and not cleaned_df.empty:
            display_sample_data(cleaned_df, f"Cleaned Data for {bank_name}")
            all_processed_data.append(cleaned_df)
    
    if all_processed_data:
        # ignore_index=True renumbers rows 0..n-1 across all banks.
        final_df = pd.concat(all_processed_data, ignore_index=True)
        final_df.to_csv(os.path.join(DATA_DIR, 'all_processed_reviews.csv'), index=False)
        print(f"✅ All processed data combined into a single file: {len(final_df)} records")
    else:
        # Nothing was scraped or everything was filtered out; final_df is not defined in this case.
        print("❌ No data was collected. Final output file not created.")

Processing Banks:   0%|                                                                                                      | 0/5 [00:00<?, ?it/s]

🔄 Scraping 1000 reviews for CBE...
✅ Saved 1000 raw reviews to notebooks/data\CBE_reviews_raw_20250819_221627.csv

Sample data from Raw Data for CBE:
                                         review_text  rating  \
0           this good app but screenshot must enable       4   
1                              the fast ❤❤❤❤❤❤❤❤❤❤🇪🇹       5   
2  በፊት ብር ትራንስፈር የተደረገባቸዉን አካዉንቶች remove አርጓል የሚያ...       1   
3                                               Good       4   
4                                           Best app       5   

                 date bank_name       source  
0 2025-08-18 20:04:39       CBE  Google Play  
1 2025-08-18 18:42:07       CBE  Google Play  
2 2025-08-18 18:36:31       CBE  Google Play  
3 2025-08-18 16:44:11       CBE  Google Play  
4 2025-08-18 16:23:58       CBE  Google Play  



🔄 Preprocessing reviews for CBE...
  - Removed 246 duplicates.
  - Removed 0 rows with missing data.


Processing Banks:  20%|██████████████████▊                                                                           | 1/5 [00:10<00:40, 10.13s/it]

✅ Cleaned and preprocessed data saved for CBE: 754 records

Sample data from Cleaned Data for CBE:
                                         review_text  rating        date  \
0           this good app but screenshot must enable       4  2025-08-18   
1                              the fast ❤❤❤❤❤❤❤❤❤❤🇪🇹       5  2025-08-18   
2  በፊት ብር ትራንስፈር የተደረገባቸዉን አካዉንቶች remove አርጓል የሚያ...       1  2025-08-18   
3                                               Good       4  2025-08-18   
4                                           Best app       5  2025-08-18   

  bank_name       source                                   processed_review  
0       CBE  Google Play                         good app screenshot enable  
1       CBE  Google Play                                               fast  
2       CBE  Google Play  በፊት ብር ትራንስፈር የተደረገባቸዉን አካዉንቶች remove አርጓል የሚያ...  
3       CBE  Google Play                                               good  
4       CBE  Google Play                              

Processing Banks:  40%|█████████████████████████████████████▌                                                        | 2/5 [00:21<00:32, 10.68s/it]

✅ Cleaned and preprocessed data saved for BOA: 855 records

Sample data from Cleaned Data for BOA:
                                         review_text  rating        date  \
0                               unlimited experience       5  2025-08-18   
1  as bank, u should at least hire someone good a...       4  2025-08-17   
2          very nice mobile banking app my favourite       5  2025-08-16   
3  it almost never boots up!!! and even if it did...       1  2025-08-15   
4                                          best apps       5  2025-08-15   

  bank_name       source                                   processed_review  
0       BOA  Google Play                               unlimited experience  
1       BOA  Google Play  bank u hire good job bc joke nowadays log tran...  
2       BOA  Google Play                  nice mobile banking app favourite  
3       BOA  Google Play                             boot crash immediately  
4       BOA  Google Play                              

Processing Banks:  60%|████████████████████████████████████████████████████████▍                                     | 3/5 [00:29<00:19,  9.73s/it]

✅ Cleaned and preprocessed data saved for Dashen: 489 records

Sample data from Cleaned Data for Dashen:
                                         review_text  rating        date  \
0                       easy to use and the best one       5  2025-08-17   
1  it is totally work of failure. I personally pr...       1  2025-08-16   
2                                      very good app       5  2025-08-16   
3  The worst banking app in Ethiopia. It forces y...       1  2025-08-16   
4                                                አሪፍ       5  2025-08-15   

  bank_name       source                                   processed_review  
0    Dashen  Google Play                                      easy use good  
1    Dashen  Google Play  totally work failure personally prefer dashen ...  
2    Dashen  Google Play                                           good app  
3    Dashen  Google Play    bad banking app Ethiopia force change PIN month  
4    Dashen  Google Play                        

Processing Banks:  80%|███████████████████████████████████████████████████████████████████████████▏                  | 4/5 [00:39<00:09,  9.83s/it]

✅ Cleaned and preprocessed data saved for Awash: 669 records

Sample data from Cleaned Data for Awash:
                                         review_text  rating        date  \
0  the worst banking experience, the app keeps cr...       1  2025-08-18   
1  Awash Bank Birr Pro — Safe and reliable - Stro...       5  2025-08-17   
2                                      bedt app ever       5  2025-08-16   
3                                           good app       5  2025-08-16   
4                                      Excellent App       5  2025-08-15   

  bank_name       source                                   processed_review  
0     Awash  Google Play  bad banking experience app keep crush guy well...  
1     Awash  Google Play  Awash Bank Birr Pro safe reliable strong encry...  
2     Awash  Google Play                                           bedt app  
3     Awash  Google Play                                           good app  
4     Awash  Google Play                          

Processing Banks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:41<00:00,  8.33s/it]

✅ Cleaned and preprocessed data saved for Zemen: 126 records

Sample data from Cleaned Data for Zemen:
                                         review_text  rating        date  \
0  I don’t think it was the bank app. It was bori...       1  2025-06-04   
1  It used to be a great app. But these days it d...       2  2025-05-17   
2                                          ohh tired       1  2025-04-30   
3  We are Thankful for your quick fix on the bug ...       4  2025-04-05   
4                                   nice application       4  2025-04-04   

  bank_name       source                                   processed_review  
0     Zemen  Google Play  think bank app boring bad bore use strong word...  
1     Zemen  Google Play  great app day right balance account issue mont...  
2     Zemen  Google Play                                          ohh tired  
3     Zemen  Google Play  thankful quick fix bug fix app frequent crashi...  
4     Zemen  Google Play                          


