In [None]:


# Cell 1: Imports and Setup
from google_play_scraper import app, Sort, reviews_all
import pandas as pd
import time
from datetime import datetime

# Bank apps to scrape (package names from Google Play Store URLs).
# Maps the bank's display name -> Google Play package name. The scraping loop
# iterates over these pairs; the display name ends up in the `bank` column of
# the scraped reviews and the package name is what gets passed to the scraper.
BANK_APPS = {
    "Commercial Bank of Ethiopia": "com.combanketh.mobilebanking",
    "Bank of Abyssinia": "com.boa.boaMobileBanking",
    "Dashen Bank": "com.cr2.amolelight"
}



In [None]:
# Cell 2: Review Scraper Function
def scrape_reviews(package_name, bank_name):
    """
    Scrape Google Play reviews for a specific bank app.

    Args:
        package_name: Google Play package name, passed to ``app`` and
            ``reviews_all``.
        bank_name: Human-readable bank name; used in the log output and stored
            in the ``bank`` column of the result.

    Returns:
        pandas.DataFrame with the columns ``review``, ``rating``, ``date``,
        ``bank`` and ``votes`` (one row per review). When the scrape fails or
        yields no reviews the frame is empty but still carries these columns,
        so callers can concatenate it without special-casing.
    """
    print(f"Scraping reviews for {bank_name}...")

    # The app lookup is informational only; a failure here must not abort the scrape.
    try:
        app_info = app(package_name)
        print(f"App found: {app_info['title']} ({app_info['score']} stars)")
    except Exception as e:
        print(f"Error getting app info: {e}")

    # A single reviews_all call is all that ever ran: the previous 5-iteration
    # "batch" loop never updated its continuation token and therefore always
    # exited after the first pass.
    try:
        fetched = reviews_all(
            package_name,
            lang='en',
            country='et',
            sort=Sort.NEWEST,
        )
        print(f"Collected {len(fetched)} reviews (total: {len(fetched)})")
    except Exception as e:
        print(f"Error scraping reviews: {e}")
        fetched = []

    output_columns = ['review', 'rating', 'date', 'bank', 'votes']

    # Without this guard an empty result would fail later with an opaque
    # KeyError on the missing 'at' column.
    if not fetched:
        return pd.DataFrame(columns=output_columns)

    df = pd.DataFrame(fetched)

    # Add bank name
    df['bank'] = bank_name

    # Convert timestamp to date
    df['date'] = pd.to_datetime(df['at']).dt.date

    # Select relevant columns and give them analysis-friendly names
    df = df[['content', 'score', 'date', 'bank', 'thumbsUpCount']]
    df.columns = output_columns

    return df


In [None]:


# Scrape every configured bank. A failure for one bank is reported and skipped
# so that the remaining banks are still collected.
all_reviews = []

for bank_name, package_name in BANK_APPS.items():
    try:
        bank_reviews = scrape_reviews(package_name, bank_name)
    except Exception as e:
        print(f"Failed to scrape {bank_name}: {e}")
    else:
        # Only reached when the scrape returned without raising.
        all_reviews.append(bank_reviews)



In [None]:
# pd.concat raises a bare "No objects to concatenate" on an empty list, which
# hides the real cause (every scrape failed). Fail early with a clear message.
if not all_reviews:
    raise ValueError("No reviews were scraped for any bank; nothing to save")

combined_df = pd.concat(all_reviews, ignore_index=True)

# Save to CSV
# NOTE(review): `timestamp` is not part of the filename, so every run overwrites
# the previous export. It is kept in case later cells reference it - confirm
# and either use it in the filename or remove it.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = "../data/All_banks_reviews.csv"
combined_df.to_csv(filename, index=False)
print(f"Saved {len(combined_df)} reviews to {filename}")

In [None]:
# Preview the first rows of the combined frame as a quick sanity check.
combined_df.head()

In [9]:
# Cell 1: Imports and setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from transformers import pipeline
import sqlalchemy
import oracledb
from datetime import datetime
import warnings

# Initialize NLTK
# 'punkt_tab' is the tokenizer data that word_tokenize loads on NLTK >= 3.9,
# while older releases load 'punkt'; requesting both keeps the notebook
# runnable on either version.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Suppress warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\surap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\surap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\surap\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
class ReviewAnalyzer:
    def __init__(self, csv_file):
        """Load the reviews CSV and set up the NLTK helpers used for preprocessing.

        Args:
            csv_file: Location of the reviews CSV; handed straight to
                ``pd.read_csv``.
        """
        self.df = pd.read_csv(csv_file)
        self.lemmatizer = WordNetLemmatizer()
        # Held as a set so the stop-word membership checks during token
        # filtering stay cheap.
        english_stopwords = stopwords.words('english')
        self.stop_words = set(english_stopwords)
        
    def clean_text(self, text):
        """Clean and preprocess text.

        Keeps only ASCII letters and whitespace, lowercases the result,
        collapses every run of whitespace into a single space and trims both
        ends.

        Args:
            text: Raw review text. Any non-string value (for example a NaN
                float) yields an empty string.

        Returns:
            The normalized string.
        """
        if not isinstance(text, str):
            return ""

        # Remove special characters and numbers.
        # No flags are passed: the character class already lists both letter
        # cases, and the 4th positional argument of re.sub is `count`, not
        # `flags` - passing re.I|re.A there silently capped the number of
        # removals at 258.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        # Remove extra whitespace (spaces, tabs and newlines alike)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    
    def preprocess_text(self, text):
        """Tokenize and lemmatize text"""
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in self.stop_words]
        tokens = [word for word in tokens if len(word) > 2]
        return ' '.join(tokens)