In [1]:
# Cell 1: Imports and setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from transformers import pipeline
import sqlalchemy
import oracledb
from datetime import datetime
import warnings
import os

# Fetch the NLTK resources needed by the tokenizer, the stop-word list
# and the WordNet lemmatizer imported above (same order as before).
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\surap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\surap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\surap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\surap\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Cell 2: ReviewAnalyzer class definition
class ReviewAnalyzer:
    """Sentiment and theme analysis for app reviews loaded from a CSV file.

    The CSV is read into ``self.df``. The methods read the ``review``,
    ``bank`` and ``rating`` columns and add ``clean_review``,
    ``processed_review``, ``sentiment`` / ``sentiment_label`` /
    ``sentiment_score`` and ``themes`` columns to that frame.
    """

    # Theme name -> trigger keywords. A review is tagged with a theme when
    # any keyword occurs as a *substring* of its processed text, so stems
    # such as 'authentic' match 'authentication'.
    # NOTE(review): short keywords ('pin', 'add', 'fast') also match inside
    # unrelated words ('shopping', 'address', 'breakfast') - confirm that
    # this is acceptable for the analysis.
    THEME_KEYWORDS = {
        'Stability Issues': ['crash', 'bug', 'error', 'freeze', 'close'],
        'Authentication Problems': ['login', 'password', 'otp', 'pin', 'authentic'],
        'Transaction Issues': ['transfer', 'transaction', 'payment', 'money', 'send'],
        'Performance': ['slow', 'fast', 'speed', 'load', 'lag'],
        'UI/UX': ['interface', 'design', 'screen', 'button', 'layout'],
        'Features': ['feature', 'function', 'add', 'need', 'want'],
        'Customer Support': ['support', 'help', 'service', 'contact', 'response']
    }

    def __init__(self, csv_file):
        """Load the reviews CSV and prepare the NLTK helpers.

        Args:
            csv_file: Path (or anything ``pd.read_csv`` accepts) of the
                reviews file.
        """
        self.df = pd.read_csv(csv_file)
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def clean_text(self, text):
        """Return ``text`` lower-cased with only ASCII letters and single spaces.

        Args:
            text: Raw review text. Non-string values (e.g. NaN) are accepted.

        Returns:
            The cleaned string, or ``""`` when ``text`` is not a string.
        """
        if not isinstance(text, str):
            return ""

        # Drop everything except ASCII letters and whitespace. No flags are
        # needed because the character class is explicit; note that the 4th
        # positional argument of re.sub is `count`, so passing flags there
        # would silently cap the number of replacements.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = text.lower()
        # Collapse any run of whitespace (spaces, tabs, newlines) to one space
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def preprocess_text(self, text):
        """Tokenize, lemmatize and filter ``text`` for thematic analysis.

        Stop words and tokens of two characters or fewer are removed after
        lemmatization.

        Returns:
            The remaining tokens joined by single spaces.
        """
        lemmas = (self.lemmatizer.lemmatize(token) for token in word_tokenize(text))
        kept = [word for word in lemmas
                if word not in self.stop_words and len(word) > 2]
        return ' '.join(kept)

    @staticmethod
    def _polarity_to_label(polarity):
        """Bucket polarity scores in [-1, 1] into negative/neutral/positive."""
        return pd.cut(
            polarity,
            bins=[-1, -0.1, 0.1, 1],
            labels=['negative', 'neutral', 'positive'],
            # Without this the lowest edge is open and a polarity of exactly
            # -1.0 would be left unlabelled (NaN).
            include_lowest=True
        )

    def analyze_sentiment(self, method='textblob', sample_size=200):
        """Add sentiment columns to ``self.df`` (requires ``clean_review``).

        Args:
            method: ``'textblob'`` scores every review with TextBlob and adds
                ``sentiment`` (polarity) and ``sentiment_label``. Any other
                value uses the DistilBERT SST-2 transformers pipeline on a
                random sample and adds ``sentiment_label`` and
                ``sentiment_score``; rows outside the sample get NaN.
            sample_size: Maximum number of reviews scored by the transformers
                pipeline. Ignored for ``'textblob'``.
        """
        if method == 'textblob':
            self.df['sentiment'] = self.df['clean_review'].apply(
                lambda x: TextBlob(x).sentiment.polarity
            )
            self.df['sentiment_label'] = self._polarity_to_label(self.df['sentiment'])
        else:
            # Use transformers for more accurate sentiment analysis
            sentiment_pipeline = pipeline(
                "sentiment-analysis",
                model="distilbert-base-uncased-finetuned-sst-2-english"
            )

            def get_sentiment(text):
                if len(text) == 0:
                    return 'neutral', 0
                # Only the first 512 characters are sent to the model
                result = sentiment_pipeline(text[:512])[0]
                # Lower-case so labels share the spelling used for 'neutral'
                # above and by the TextBlob branch
                return result['label'].lower(), result['score']

            # Score only a reproducible sample to keep the run time bounded
            sample = self.df.sample(min(sample_size, len(self.df)), random_state=42)
            scored = pd.DataFrame(
                [get_sentiment(text) for text in sample['clean_review']],
                columns=['sentiment_label', 'sentiment_score'],
                index=sample.index
            )

            # Drop columns left by an earlier run first; otherwise the merge
            # would create suffixed *_x / *_y columns instead of replacing them.
            self.df = self.df.drop(columns=list(scored.columns), errors='ignore').merge(
                scored,
                how='left',
                left_index=True,
                right_index=True
            )

    def extract_themes(self):
        """Add a ``themes`` column built from ``processed_review``.

        Each review gets a comma-separated list of every theme in
        ``THEME_KEYWORDS`` with at least one keyword contained in the text,
        or ``'Other'`` when nothing matches.
        """
        def assign_theme(text):
            matched = [
                theme
                for theme, keywords in self.THEME_KEYWORDS.items()
                if any(keyword in text for keyword in keywords)
            ]
            return ', '.join(matched) if matched else 'Other'

        self.df['themes'] = self.df['processed_review'].apply(assign_theme)

    def analyze(self, sentiment_method='textblob'):
        """Run the full pipeline: clean, preprocess, sentiment, themes.

        Args:
            sentiment_method: Forwarded to ``analyze_sentiment`` as ``method``.

        Returns:
            ``self.df`` with the added analysis columns.
        """
        # Clean text
        self.df['clean_review'] = self.df['review'].apply(self.clean_text)

        # Preprocess for thematic analysis
        self.df['processed_review'] = self.df['clean_review'].apply(self.preprocess_text)

        # Sentiment analysis
        self.analyze_sentiment(method=sentiment_method)

        # Thematic analysis
        self.extract_themes()

        return self.df

    def visualize(self, bank_name):
        """Save a 2x2 summary figure for one bank to ``../output``.

        The panels show rating distribution, sentiment distribution, the five
        most frequent themes and a word cloud. ``analyze()`` must have been
        run first. The PNG is written to ``../output`` relative to the current
        working directory as ``<bank_name>_analysis.png`` (spaces replaced by
        underscores). Nothing is drawn when no row matches ``bank_name``.
        """
        bank_df = self.df[self.df['bank'] == bank_name]
        if bank_df.empty:
            # Covers unknown names and NaN (NaN never compares equal)
            print(f"No reviews found for bank {bank_name!r}; skipping visualization.")
            return

        fig = plt.figure(figsize=(15, 12))
        try:
            plt.suptitle(f'{bank_name} Mobile App Review Analysis', y=1.02)

            # 1. Rating distribution
            plt.subplot(2, 2, 1)
            sns.countplot(data=bank_df, x='rating', palette='viridis')
            plt.title('Rating Distribution')
            plt.xlabel('Star Rating')
            plt.ylabel('Count')

            # 2. Sentiment distribution
            plt.subplot(2, 2, 2)
            sentiment_counts = bank_df['sentiment_label'].value_counts()
            sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette='coolwarm')
            plt.title('Sentiment Distribution')
            plt.ylabel('Count')

            # 3. Top themes (a review with several themes counts once per theme)
            plt.subplot(2, 2, 3)
            themes = bank_df['themes'].str.split(', ').explode()
            top_themes = themes.value_counts().head(5)
            sns.barplot(y=top_themes.index, x=top_themes.values, palette='magma')
            plt.title('Top 5 Complaint Themes')
            plt.xlabel('Count')

            # 4. Word cloud
            plt.subplot(2, 2, 4)
            text = ' '.join(bank_df['processed_review'])
            try:
                wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
            except ValueError:
                # WordCloud raises ValueError when the text yields no words
                plt.text(0.5, 0.5, 'No words to display', ha='center', va='center')
            else:
                plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis('off')
            plt.title('Word Cloud of Reviews')

            plt.tight_layout()

            # Create output directory if it doesn't exist
            output_dir = os.path.abspath(os.path.join(os.getcwd(), '../output'))
            os.makedirs(output_dir, exist_ok=True)

            # Prepare output path for the visualization
            output_path = os.path.join(output_dir, str(bank_name).replace(" ", "_") + "_analysis.png")

            # Save the figure
            fig.savefig(output_path, dpi=300, bbox_inches='tight')
        finally:
            # Always release the figure, even if a plotting step failed
            plt.close(fig)

#  # 4. Word cloud
#         plt.subplot(2, 2, 4)
#         text = ' '.join(bank_df['processed_review'])
#         wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
#         plt.imshow(wordcloud, interpolation='bilinear')
#         plt.axis('off')
#         plt.title('Word Cloud of Reviews')
        
#         plt.tight_layout()
#         plt.savefig(f'{bank_name.replace(" ", "_")}_analysis.png', dpi=300, bbox_inches='tight')
#         plt.close()

In [3]:
# Define the CSV file path
csv_file_path = "../data/All_banks_reviews.csv"

# Instantiate the ReviewAnalyzer
analyzer = ReviewAnalyzer(csv_file_path)

try:
    # Run full analysis pipeline
    df_analyzed = analyzer.analyze()
except Exception as e:
    print(f"Error during analysis: {e}")
else:
    # Check if any banks are available
    if 'bank' in df_analyzed.columns and not df_analyzed['bank'].isnull().all():
        # Generate visualizations for each bank. dropna() skips rows with a
        # missing bank name: NaN never equals itself, so it cannot be used
        # to select rows and is not a usable file name.
        for bank in df_analyzed['bank'].dropna().unique():
            # Handle failures per bank so one bad figure does not stop the rest
            try:
                analyzer.visualize(bank)
            except Exception as e:
                print(f"Error during visualization of {bank}: {e}")
    else:
        print("No bank data available for visualization.")