### Thematic Analysis

In [None]:
# === Path Configuration ===
from pathlib import Path
import sys
import re
import pandas as pd

# Set root path
ROOT = Path.cwd().parent if "notebooks" in str(Path.cwd()) else Path.cwd()
sys.path.append(str(ROOT))

# Paths
DATA_PATH = ROOT / "notebooks" / "Dataset" / "reviews_with_sentiment.csv"
OUTPUT_PATH = ROOT / "notebooks" / "Dataset" / "reviews_with_themees.csv"

# === Load Data ===
try:
    df = pd.read_csv(DATA_PATH)
    print(f"Loaded {len(df)} reviews from:\n{DATA_PATH}")
except FileNotFoundError:
    print(f"File not found at:\n{DATA_PATH}")
    print(f"Current directory: {Path.cwd()}")
    print("Contents of notebooks/Dataset/:")
    print(list((ROOT / "notebooks" / "Dataset").glob("*")))
    raise

# === Imports for NLP and Keyword Extraction ===
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from utils.theme_rules import THEME_RULES  # Your manually defined theme rules

# Load spaCy model
nlp = spacy.load("en_core_web_sm")
tqdm.pandas()

# === Text Preprocessing with POS Filtering ===
def preprocess_spacy(text):
    """
    Lemmatize text and remove stopwords, punctuation, and all POS 
    except NOUN and VERB for more meaningful keyword extraction.
    """
    doc = nlp(str(text).lower())
    tokens = [
        token.lemma_ 
        for token in doc 
        if token.is_alpha and not token.is_stop and token.pos_ in {"NOUN", "VERB"}
    ]
    return ' '.join(tokens)

df['cleaned_review'] = df['review'].progress_apply(preprocess_spacy)

# === Keyword Extraction with TF-IDF ===
def extract_keywords_tfidf(corpus, ngram_range=(1, 2), top_n=5):
    vec = TfidfVectorizer(ngram_range=ngram_range, stop_words='english', max_features=5000)
    tfidf_matrix = vec.fit_transform(corpus)
    feature_names = vec.get_feature_names_out()

    keywords_per_doc = []
    for row in tfidf_matrix:
        scores = zip(feature_names, row.toarray()[0])
        sorted_keywords = sorted(scores, key=lambda x: x[1], reverse=True)
        top_keywords = [word for word, score in sorted_keywords[:top_n] if score > 0]
        keywords_per_doc.append(top_keywords)
    return keywords_per_doc

df['keywords'] = extract_keywords_tfidf(df['cleaned_review'].tolist())

# === Rule-Based Theme Detection ===
def detect_themes(text):
    """Assign themes based on presence of keywords in THEME_RULES"""
    matched = []
    text = f" {text.lower()} "
    for theme, rules in THEME_RULES.items():
        if any(f" {kw.lower()} " in text for kw in rules["keywords"]):
            matched.append(theme)
    return matched if matched else ["Other"]

df["themes"] = df["review"].progress_apply(detect_themes)

# === Save Results ===
final_cols = ["review_id", "review", "sentiment_label", "sentiment_score", "keywords", "themes"]
existing_cols = [col for col in final_cols if col in df.columns]
df[existing_cols].to_csv(OUTPUT_PATH, index=False)


✅ Loaded 6817 reviews from:
c:\Users\user\Desktop\fintech-reviews-analysis\notebooks\Dataset\reviews_with_sentiment.csv


100%|██████████| 6817/6817 [01:18<00:00, 87.40it/s] 
100%|██████████| 6817/6817 [00:01<00:00, 5040.35it/s]


✅ Saved 6817 analyzed reviews to:
c:\Users\user\Desktop\fintech-reviews-analysis\notebooks\Dataset\reviews_with_themees.csv

Theme Distribution:
themes
Other                      4875
Transactions                805
Feature Requests            644
App Performance             523
Activation/Registration     279
Account Access              255
Transaction History         186
Screenshot/Restrictions     180
Customer Service            148
Interface Issues            131
Notifications                85
Security Concerns            68
International Use            28
Language Support             17
Name: count, dtype: int64

Sample 'Other' Reviews:
['Fine'
 'Why is the app asking me to remove developer options? Why does it matter what I do with my phone , this is unacceptable! Fix this'
 'very good']


In [3]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 1.2 MB/s eta 0:00:11
     --- ------------------------------------ 1.0/12.8 MB 1.2 MB/s eta 0:00:10
     ---- ----------------------------------- 1.3/12.8 MB 1.2 MB/s eta 0:00:10
     ---- ----------------------------------- 1.6/12.8 MB 1.3 MB/s eta 0:00:09
     ----- ---------------------------------- 1.8/12.8 MB 1.3 MB/s eta 0:00:09
     ------ --------------------------------- 2.1/12.8 MB 1.2 MB/s et


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
pip install spacy






[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
