In [19]:
# Run this cell in Colab

# --- Install Libraries ---
!pip install numpy~=1.26.0 # Pin NumPy first
!pip install pandas nltk scikit-learn transformers torch tensorflow sentencepiece
# No VADER needed now
!pip install rich~=13.0 # Downgrade rich

# --- Check final versions ---
print("--- Checking Key Package Versions ---")
!pip list | grep -E 'numpy|tensorflow|nltk|rich' # Removed vader/spacy
print("--- Check Complete ---")

# --- IMPORTANT: RESTART RUNTIME AFTER THIS CELL ---
print("\n *** IMPORTANT: Dependencies installed/updated. Please RESTART RUNTIME now (Runtime -> Restart runtime) before running subsequent cells! *** \n")

# --- Code below will run AFTER restart ---

import pandas as pd
import numpy as np
import nltk # Still needed for tokenization in summarizer fallback
import re
import string
# import collections # Not needed if Pro/Con is removed
import logging
import time

# --- NLTK Download (Minimal: punkt for tokenization, stopwords) ---
print("Downloading NLTK resources ('punkt', 'stopwords')...")
NLTK_READY = False # Flag for basic NLTK
try:
    nltk.download('punkt', quiet=True) # Needed for sent_tokenize fallback in summarizer
    nltk.download('stopwords', quiet=True) # Might be used elsewhere, keep for now
    print("NLTK resources downloaded successfully.")
    stop_words = set(stopwords.words('english')) # Keep stopwords just in case
    NLTK_READY = True
except Exception as e:
    print(f"Error downloading NLTK data: {e}")
    stop_words = set()
    # NLTK_READY remains False
# --- End NLTK Download ---

# --- VADER Setup (REMOVED) ---
# No VADER needed

# --- Core Library Imports ---
from transformers import pipeline
import torch
import telegram
from telegram.ext import Application, CommandHandler, MessageHandler, filters
# --- End Core Library Imports ---

# Setup logging for the bot
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

print("Core libraries imported.")
print(f"Using python-telegram-bot version: {telegram.__version__}")
print(f"NLTK ready (for basic tokenization): {NLTK_READY}")

--- Checking Key Package Versions ---
nltk                                  3.9.1
numpy                                 1.26.4
rich                                  13.9.4
tensorflow                            2.18.0
tensorflow-datasets                   4.9.8
tensorflow_decision_forests           1.11.0
tensorflow-hub                        0.16.1
tensorflow-io-gcs-filesystem          0.37.1
tensorflow-metadata                   1.17.0
tensorflow-probability                0.25.0
tensorflow-text                       2.18.1
--- Check Complete ---

 *** IMPORTANT: Dependencies installed/updated. Please RESTART RUNTIME now (Runtime -> Restart runtime) before running subsequent cells! *** 

Downloading NLTK resources ('punkt', 'stopwords')...
NLTK resources downloaded successfully.
Core libraries imported.
Using python-telegram-bot version: 20.3
NLTK ready (for basic tokenization): True


In [20]:
# %% [code]
# Block 1: Load and Prepare Data

print("\n--- Block 1: Loading Data ---")
# --- Configuration ---
DATA_FILE = 'Dataset-SA.csv' # Make sure this file is uploaded to Colab session storage
PRODUCT_NAME_COLUMN = 'product_name'
REVIEW_TEXT_COLUMN = 'Review'
RATING_COLUMN = 'Rate'
ORIGINAL_SENTIMENT_COLUMN = 'Sentiment'
SUMMARY_COLUMN = 'Summary' # <-- ADDED: Column name for original summaries

# --- Load Data ---
df = None
try:
    df = pd.read_csv(DATA_FILE)
    print(f"Dataset loaded successfully. Shape: {df.shape}")
    print("Columns found:", df.columns.tolist())

    # Verify required columns exist (including the new SUMMARY_COLUMN)
    required_columns = [PRODUCT_NAME_COLUMN, REVIEW_TEXT_COLUMN, RATING_COLUMN, ORIGINAL_SENTIMENT_COLUMN, SUMMARY_COLUMN]
    if not all(col in df.columns for col in required_columns):
        missing = [col for col in required_columns if col not in df.columns]
        raise ValueError(f"Missing required columns: {missing}. Found: {df.columns.tolist()}")

    # Basic Data Cleaning & Type Conversion
    df.dropna(subset=[REVIEW_TEXT_COLUMN], inplace=True) # Drop rows with no review text
    df[REVIEW_TEXT_COLUMN] = df[REVIEW_TEXT_COLUMN].astype(str)
    df[PRODUCT_NAME_COLUMN] = df[PRODUCT_NAME_COLUMN].astype(str)
    # Convert Rating, coerce errors to NaN
    df[RATING_COLUMN] = pd.to_numeric(df[RATING_COLUMN], errors='coerce')
    # Clean Sentiment column
    if ORIGINAL_SENTIMENT_COLUMN in df.columns:
         df[ORIGINAL_SENTIMENT_COLUMN] = df[ORIGINAL_SENTIMENT_COLUMN].astype(str).str.lower().str.strip()
    # Clean Summary column - Ensure string, fill NaN with empty string, strip whitespace
    if SUMMARY_COLUMN in df.columns:
        df[SUMMARY_COLUMN] = df[SUMMARY_COLUMN].astype(str).fillna('').str.strip()

    print("Data loaded and basic preparation done.")
    print("\nSample Data (including Summary):")
    print(df[required_columns].head())

except FileNotFoundError:
    print(f"ERROR: File '{DATA_FILE}' not found. Please upload it to your Colab environment.")
    df = None # Ensure df is None if loading fails
except ValueError as e:
    print(f"ERROR: {e}")
    df = None
except Exception as e:
    print(f"An unexpected error occurred loading the data: {e}")
    df = None

print("--- Block 1: Complete ---")


--- Block 1: Loading Data ---
Dataset loaded successfully. Shape: (205052, 6)
Columns found: ['product_name', 'product_price', 'Rate', 'Review', 'Summary', 'Sentiment']
Data loaded and basic preparation done.

Sample Data (including Summary):
                                        product_name           Review  Rate  \
0  Candes 12 L Room/Personal Air Cooler??????(Whi...           super!   5.0   
1  Candes 12 L Room/Personal Air Cooler??????(Whi...          awesome   5.0   
2  Candes 12 L Room/Personal Air Cooler??????(Whi...             fair   3.0   
3  Candes 12 L Room/Personal Air Cooler??????(Whi...  useless product   1.0   
4  Candes 12 L Room/Personal Air Cooler??????(Whi...             fair   3.0   

  Sentiment                                            Summary  
0  positive  great cooler excellent air flow and for this p...  
1  positive              best budget 2 fit cooler nice cooling  
2  positive  the quality is good but the power of air is de...  
3  negative          

In [21]:
# %% [code]
# STEP 2 CODE BLOCK (COMPLETE - RELAXED HEURISTICS V2)

import pandas as pd # Make sure pandas is imported
import re          # Make sure re is imported
import logging     # Make sure logging is imported

logger = logging.getLogger(__name__) # Ensure logger is defined

# --- Fake Review Detection (RELAXED Heuristic Approach V2) ---

def predict_fake_heuristic(review_text, rating, original_sentiment):
    """
    RELAXED heuristic rules using text, rating, and original sentiment.
    Aims to filter only the most obvious spam/junk.
    Returns True if suspected fake, False otherwise.
    """
    text = str(review_text).strip() # Ensure text is string and stripped
    # Normalize original sentiment for comparison (handle potential NaN/None)
    sentiment = str(original_sentiment).lower().strip() if pd.notna(original_sentiment) else None
    rate = rating if pd.notna(rating) else None

    # --- Rule Tuning - Be VERY lenient ---

    # Rule 1: Very short reviews - **Significantly Relaxed**
    # Only flag reviews with virtually no content.
    if len(text.split()) < 2: # Was < 4, now < 2 (e.g., flags "Ok", ".", but allows "Good product")
        logger.info(f"Flagged as potentially fake (Rule 1: Too short - {len(text.split())} words): '{text[:50]}...'")
        return True

    # Rule 2: Excessive capitalization - Keep (Relatively reliable signal)
    # Check only if text is reasonably long
    if len(text) > 30 and sum(1 for c in text if c.isupper()) / len(text) > 0.6: # Threshold slightly increased
         logger.info(f"Flagged as potentially fake (Rule 2: Excessive Caps): '{text[:50]}...'")
         return True

    # Rule 3: Excessive punctuation - Keep, maybe relax threshold slightly? Let's keep 5+
    if len(re.findall(r'(!|\?){5,}', text)) > 0: # 5+ consecutive ! or ?
        logger.info(f"Flagged as potentially fake (Rule 3: Excessive Punctuation): '{text[:50]}...'")
        return True

    # Rule 4: Repetitive characters - Keep (Good signal)
    if len(re.findall(r'(.)\1{4,}', text)) > 0: # 5+ consecutive identical characters
        logger.info(f"Flagged as potentially fake (Rule 4: Repetitive Chars): '{text[:50]}...'")
        return True

    # Rule 5/6: Mismatch - Keep (Clear contradictions are strong signals)
    if rate is not None and sentiment is not None:
        # Rule 5: Low rating (1 star) but explicitly 'positive' sentiment
        if rate == 1 and sentiment == 'positive':
            logger.info(f"Flagged as potentially fake (Rule 5: Rating/Sentiment Mismatch - Rate 1, Sent 'positive'): '{text[:50]}...'")
            return True
        # Rule 6: High rating (5 stars) but explicitly 'negative' sentiment
        if rate == 5 and sentiment == 'negative':
            logger.info(f"Flagged as potentially fake (Rule 6: Rating/Sentiment Mismatch - Rate 5, Sent 'negative'): '{text[:50]}...'")
            return True

    # Rule 7: Generic phrases - Keep commented out

    # Rule 8: All Caps Review - Keep (Good signal)
    if text.isupper() and len(text) > 20:
        logger.info(f"Flagged as potentially fake (Rule 8: All Caps): '{text[:50]}...'")
        return True

    # --- If none of the above rules triggered ---
    return False # Assume genuine


# --- Assign the chosen function ---
is_likely_fake = predict_fake_heuristic
print("Using RELAXED V2 Heuristics for fake review detection.")

# --- Test the heuristic function (Examples - Adjusted Expectations) ---
print("\n--- RELAXED V2 Heuristic Tests ---")
test_data = [
    # Text, Rating, Original Sentiment, Expected Fake (True/False) with RELAXED V2 rules
    ("amazing product works perfectly !!!!!", 5, 'Positive', False), # Should pass (punctuation ok)
    ("Bad.", 1, 'Negative', False), # Should PASS now (>= 1 word)
    ("Ok", 3, 'Neutral', True), # Should FAIL now (< 2 words)
    (".", 1, 'Negative', True), # Should FAIL (< 2 words)
    ("Good", 4, 'Positive', True), # Should FAIL (< 2 words)
    ("Good product", 4, 'Positive', False), # Should PASS (>= 2 words)
    ("Absolutely terrible product, do not buy.", 5, 'Negative', True), # Should still fail (Rule 6: Rate 5/Negative)
    ("GOOD PRODUCT", 4, 'Positive', False), # Should pass (Not all caps > 20 chars)
    ("THIS IS A LONGER REVIEW ALL IN CAPS", 5, 'Positive', True), # Should fail (Rule 8: All Caps > 20 chars)
    ("it was okay", 3, 'Neutral', False), # Should pass
    ("This is fantastic, I love it!", 1, 'Positive', True), # Should fail (Rule 5: Rate 1/Positive)
    ("vvvvvvveeeeerrrryyyyy goooood", 5, 'Positive', True), # Should fail (Rule 4: Repetitive chars >= 5)
    ("Great!", 5, 'Positive', True), # Should FAIL (< 2 words)
    ("Didn't work.", 1, 'Negative', False) # Should pass (>= 2 words)
]

# Check if DataFrame 'df' exists before running detailed tests (optional here)
run_detailed_tests = True # Assume we want to see test output

if run_detailed_tests:
    print("Running tests with sample data...")
    correct_count = 0
    for text, rating, sentiment, expected in test_data:
        is_fake = is_likely_fake(text, rating, sentiment)
        status = "Correct" if is_fake == expected else f"INCORRECT (Expected {expected})"
        if is_fake == expected: correct_count += 1
        print(f"  Test: ('{text}', Rate: {rating}, Sent: {sentiment}) -> Predicted Fake: {is_fake} - {status}")
    print(f"Accuracy on test cases: {correct_count / len(test_data):.1%}")
else:
    print("Skipping detailed heuristic tests.")

print("--- End Heuristic Tests ---")

Using RELAXED V2 Heuristics for fake review detection.

--- RELAXED V2 Heuristic Tests ---
Running tests with sample data...
  Test: ('amazing product works perfectly !!!!!', Rate: 5, Sent: Positive) -> Predicted Fake: True - INCORRECT (Expected False)
  Test: ('Bad.', Rate: 1, Sent: Negative) -> Predicted Fake: True - INCORRECT (Expected False)
  Test: ('Ok', Rate: 3, Sent: Neutral) -> Predicted Fake: True - Correct
  Test: ('.', Rate: 1, Sent: Negative) -> Predicted Fake: True - Correct
  Test: ('Good', Rate: 4, Sent: Positive) -> Predicted Fake: True - Correct
  Test: ('Good product', Rate: 4, Sent: Positive) -> Predicted Fake: False - Correct
  Test: ('Absolutely terrible product, do not buy.', Rate: 5, Sent: Negative) -> Predicted Fake: True - Correct
  Test: ('GOOD PRODUCT', Rate: 4, Sent: Positive) -> Predicted Fake: False - Correct
  Test: ('THIS IS A LONGER REVIEW ALL IN CAPS', Rate: 5, Sent: Positive) -> Predicted Fake: True - Correct
  Test: ('it was okay', Rate: 3, Sent: Ne

In [22]:
# --- Sentiment Analysis Setup (Hugging Face) ---

sentiment_pipeline = None
try:
    # Using a model that outputs Positive/Negative/Neutral directly
    # Consider device='cuda' if GPU is available in Colab runtime, otherwise defaults to CPU
    device = 0 if torch.cuda.is_available() else -1
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment",
        device=device # Use GPU if available
        )
    print(f"Sentiment analysis pipeline loaded (RoBERTa) on device: {'GPU' if device == 0 else 'CPU'}.")

    def get_sentiment(review_text):
        """Analyzes sentiment using the loaded pipeline."""
        if not sentiment_pipeline:
             return {"label": "Neutral", "score": 0.0}
        try:
            # Model has max sequence length, truncation is handled by pipeline by default usually
            # but explicit truncation can prevent unexpected errors with very long reviews.
            max_length = 512
            truncated_text = review_text[:max_length*4] # Allow slightly longer input for truncation handling

            if not truncated_text.strip(): # Handle empty strings
                return {"label": "Neutral", "score": 0.0}

            with torch.no_grad(): # Disable gradient calculation for inference
                 result = sentiment_pipeline(truncated_text, truncation=True)[0] # Ensure truncation

            # Map labels (LABEL_0: Negative, LABEL_1: Neutral, LABEL_2: Positive)
            label_map = {"LABEL_0": "Negative", "LABEL_1": "Neutral", "LABEL_2": "Positive"}
            sentiment = label_map.get(result['label'], "Neutral")
            score = result['score']
            return {"label": sentiment, "score": score}
        except Exception as e:
            logger.error(f"Error during sentiment analysis for text: '{review_text[:50]}...': {e}")
            return {"label": "Neutral", "score": 0.0} # Default on error

except Exception as e:
    print(f"ERROR loading sentiment pipeline: {e}")
    print("Sentiment analysis may not be available or will be slow (CPU fallback).")
    # Define a dummy function if pipeline fails
    def get_sentiment(review_text):
        logger.warning("Sentiment pipeline not loaded, returning Neutral.")
        return {"label": "Neutral", "score": 0.0}

# --- Test Sentiment Analysis ---
print("\n--- Sentiment Analysis Tests ---")
if sentiment_pipeline:
    test_review_pos = "This product is amazing, I really love it!"
    sentiment_result_pos = get_sentiment(test_review_pos)
    print(f"Test Positive ('{test_review_pos}'): {sentiment_result_pos}")

    test_review_neg = "Worst purchase ever, completely broke after one day."
    sentiment_result_neg = get_sentiment(test_review_neg)
    print(f"Test Negative ('{test_review_neg}'): {sentiment_result_neg}")

    test_review_neu = "The product is black."
    sentiment_result_neu = get_sentiment(test_review_neu)
    print(f"Test Neutral ('{test_review_neu}'): {sentiment_result_neu}")
else:
    print("Skipping sentiment analysis test as pipeline failed to load.")
print("--- End Sentiment Analysis Tests ---")

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentiment analysis pipeline loaded (RoBERTa) on device: CPU.

--- Sentiment Analysis Tests ---
Test Positive ('This product is amazing, I really love it!'): {'label': 'Positive', 'score': 0.9926257729530334}
Test Negative ('Worst purchase ever, completely broke after one day.'): {'label': 'Negative', 'score': 0.9672914743423462}
Test Neutral ('The product is black.'): {'label': 'Neutral', 'score': 0.5029102563858032}
--- End Sentiment Analysis Tests ---


In [23]:
# %% [code]
# STEP 4 CODE BLOCK (Ensure this runs successfully before Block 6)

# --- Summarization Setup (Hugging Face) ---
summarizer = None
try:
    # Ensure torch is imported from Block 0!
    # Consider device='cuda' if GPU is available
    device = 0 if torch.cuda.is_available() else -1
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6",
        device=device # Use GPU if available
        )
    print(f"Summarization pipeline loaded (DistilBART) on device: {'GPU' if device == 0 else 'CPU'}.")

    def generate_summary(text_corpus, max_len=130, min_len=25): # Adjusted lengths
        """Generates a summary from a large block of text."""
        if not summarizer:
            logger.warning("Summarizer pipeline accessed but not loaded.")
            # Fallback: Return first few sentences if summarizer fails
            sentences = re.split(r'(?<=[.!?])\s+', text_corpus) # Need import re
            return " ".join(sentences[:3]) + "..." if sentences else "Summarization unavailable."
        try:
            # Summarization models have input length limits (e.g., 1024 tokens for BART)
            # Pipelines often handle this, but being explicit can help.
            # Let's estimate max chars based on typical token length
            max_input_chars = 1024 * 4 # Heuristic limit (adjust based on model/memory)

            # Ensure input is string and not empty
            text_to_summarize = str(text_corpus).strip()
            if not text_to_summarize:
                 return "No genuine reviews available to summarize."

            truncated_corpus = text_to_summarize[:max_input_chars]

            with torch.no_grad(): # Disable gradient calculation
                # Use truncation=True in the call
                summary_list = summarizer(truncated_corpus, max_length=max_len, min_length=min_len, do_sample=False, truncation=True)

            # Check if the result is valid
            if summary_list and isinstance(summary_list, list) and 'summary_text' in summary_list[0]:
                 return summary_list[0]['summary_text']
            else:
                 logger.error(f"Unexpected summarizer output format: {summary_list}")
                 return "Could not generate summary (unexpected output)."

        except Exception as e:
            logger.error(f"Error during summarization: {e}")
            # Fallback: Return first few sentences if summarizer fails
            sentences = re.split(r'(?<=[.!?])\s+', str(text_corpus)) # Need import re
            return " ".join(sentences[:3]) + "..." if sentences else "Could not generate summary (error)."

except NameError as ne:
     print(f"ERROR loading summarization pipeline: {ne}")
     print("This usually means 'import torch' or 'import re' in Block 0 was not executed in this session.")
     print("Please re-run Block 0, then Blocks 1-3, and this block (4) again.")
     # Define dummy function
     def generate_summary(text_corpus, max_len=130, min_len=25):
        logger.warning("Summarizer pipeline not loaded (NameError), using fallback.")
        try:
            sentences = re.split(r'(?<=[.!?])\s+', str(text_corpus)) # Need import re
            return " ".join(sentences[:3]) + "..." if sentences else "Summarization unavailable."
        except NameError: # If re wasn't imported
             return "Summarization unavailable (re module missing)."
except Exception as e:
    print(f"ERROR loading summarization pipeline: {e}")
    print("Summarization may not be available or will use basic fallback.")
    # Define a dummy function if pipeline fails for other reasons
    def generate_summary(text_corpus, max_len=130, min_len=25):
         logger.warning(f"Summarizer pipeline not loaded (Error: {e}), using fallback.")
         try:
            sentences = re.split(r'(?<=[.!?])\s+', str(text_corpus)) # Need import re
            return " ".join(sentences[:3]) + "..." if sentences else "Summarization unavailable."
         except NameError: # If re wasn't imported
             return "Summarization unavailable (re module missing)."
         except Exception as fallback_e:
             logger.error(f"Error in summarizer fallback: {fallback_e}")
             return "Summarization unavailable (fallback error)."


# --- Test Summarization ---
print("\n--- Summarization Test ---")
long_text_example = """
Review 1: This is the first review. It was okay, not great but not terrible. The setup was easy. Buttons feel a bit flimsy. Decent for the price point.
Review 2: This second review is much more positive. I absolutely love this product! It exceeded all my expectations and works flawlessly. Highly recommended to everyone needing this. Setup was instant.
Review 3: A third opinion suggests mediocrity. It does the job, but feels a bit cheap, like it might break soon. I wouldn't buy it again at full price. Maybe wait for a sale if you really need one. Performance is average.
Review 4: Finally, a very negative experience. Broke within a week of light use. Customer service was unhelpful and slow to respond. Avoid this product at all costs. It's flimsy and poorly made. Complete waste of money.
"""
# Use the defined generate_summary function (real or dummy)
summary_result = generate_summary(long_text_example)
print(f"Input Text Length: {len(long_text_example)} chars")
print(f"Generated Summary: {summary_result}")
print("--- End Summarization Test ---")

Device set to use cpu


Summarization pipeline loaded (DistilBART) on device: CPU.

--- Summarization Test ---
Input Text Length: 797 chars
Generated Summary:  Review 1: This is the first review . It was okay, not great but not terrible . Decent for the price point . Buttons feel a bit flimsy .
--- End Summarization Test ---


In [24]:
# --- Verdict Calculation Logic ---

def calculate_overall_verdict(sentiments):
    """
    Calculates an overall verdict based on a list of *newly generated* sentiment labels.
    Args:
        sentiments (list): A list of sentiment labels ('Positive', 'Negative', 'Neutral').
    Returns:
        str: The overall verdict.
    """
    if not sentiments:
        return "Not Enough Data"

    total = len(sentiments)
    pos_count = sentiments.count("Positive")
    neg_count = sentiments.count("Negative")
    neu_count = sentiments.count("Neutral")

    # --- Define Verdict Logic (Adjust thresholds as needed) ---
    pos_ratio = pos_count / total
    neg_ratio = neg_count / total

    if pos_ratio >= 0.65 and neg_ratio < 0.15:
        return "Overwhelmingly Positive 👍"
    elif pos_ratio > neg_ratio + 0.15 and pos_ratio >= 0.40: # Clearly more positive than negative
        return "Generally Positive 🙂"
    elif neg_ratio >= 0.65 and pos_ratio < 0.15:
        return "Overwhelmingly Negative 👎"
    elif neg_ratio > pos_ratio + 0.15 and neg_ratio >= 0.40: # Clearly more negative than positive
        return "Generally Negative 🙁"
    elif abs(pos_ratio - neg_ratio) < 0.20 and neu_count < 0.5: # Relatively balanced Pos/Neg
        return "Mixed Reviews 🤔"
    else: # Default to Neutral if high neutral count or roughly equal pos/neg
        return "Neutral / Balanced 😐"

# --- Test Verdict Logic ---
print("\n--- Verdict Logic Tests ---")
test_cases = [
    (["Positive", "Positive", "Negative", "Neutral", "Positive", "Positive"], "Generally Positive 🙂"),
    (["Negative", "Negative", "Negative", "Neutral", "Positive"], "Generally Negative 🙁"),
    (["Positive", "Negative", "Positive", "Negative", "Neutral", "Neutral", "Neutral"], "Neutral / Balanced 😐"),
    (["Positive", "Positive", "Positive", "Positive", "Positive", "Neutral"], "Overwhelmingly Positive 👍"),
    (["Negative", "Negative", "Negative", "Negative", "Negative", "Neutral"], "Overwhelmingly Negative 👎"),
    (["Positive", "Negative"], "Mixed Reviews 🤔"),
    ([], "Not Enough Data")
]
for sentiments, expected in test_cases:
    verdict = calculate_overall_verdict(sentiments)
    status = "Correct" if verdict == expected else "INCORRECT"
    print(f"Test: {sentiments} -> Verdict: '{verdict}' (Expected: '{expected}') - {status}")
print("--- End Verdict Logic Tests ---")


--- Verdict Logic Tests ---
Test: ['Positive', 'Positive', 'Negative', 'Neutral', 'Positive', 'Positive'] -> Verdict: 'Generally Positive 🙂' (Expected: 'Generally Positive 🙂') - Correct
Test: ['Negative', 'Negative', 'Negative', 'Neutral', 'Positive'] -> Verdict: 'Generally Negative 🙁' (Expected: 'Generally Negative 🙁') - Correct
Test: ['Positive', 'Negative', 'Positive', 'Negative', 'Neutral', 'Neutral', 'Neutral'] -> Verdict: 'Neutral / Balanced 😐' (Expected: 'Neutral / Balanced 😐') - Correct
Test: ['Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Neutral'] -> Verdict: 'Overwhelmingly Positive 👍' (Expected: 'Overwhelmingly Positive 👍') - Correct
Test: ['Negative', 'Negative', 'Negative', 'Negative', 'Negative', 'Neutral'] -> Verdict: 'Overwhelmingly Negative 👎' (Expected: 'Overwhelmingly Negative 👎') - Correct
Test: ['Positive', 'Negative'] -> Verdict: 'Mixed Reviews 🤔' (Expected: 'Mixed Reviews 🤔') - Correct
Test: [] -> Verdict: 'Not Enough Data' (Expected: 'Not Enough 

In [25]:
# %% [code]
# STEP 6 CODE BLOCK (REMOVED Pro/Con Extraction & Output)

import asyncio
import re
import traceback
import numpy as np
import pandas as pd
# import collections # Not needed now
import time
import nltk # Keep for summarizer fallback

# --- Helper for MarkdownV2 Escaping ---
def escape_markdown_v2(text):
    """Escapes characters for Telegram MarkdownV2 parse mode."""
    escape_chars = r'_*[]()~`>#+-=|{}.!'
    text = str(text).replace('\\', '\\\\')
    return re.sub(f'([{re.escape(escape_chars)}])', r'\\\1', text)

# --- Telegram Bot Configuration ---
BOT_TOKEN = "7999899637:AAFCTDaI0eu8RhCmZZjpkfTBu45NxmZPi1E" # Use your actual token

# --- Pro/Con Extraction Function (REMOVED) ---
# No extract_pros_cons function needed anymore

# --- Core Function to Process Product Name (REMOVED Pro/Con Steps) ---
async def analyze_product(product_name_query):
    """
    Finds reviews, filters (relaxed), gets overall sentiment (HF),
    summarizes (HF), and formats result (NO Pros/Cons).
    """
    global df # Access global dataframe
    global PRODUCT_NAME_COLUMN, REVIEW_TEXT_COLUMN, SUMMARY_COLUMN, RATING_COLUMN, ORIGINAL_SENTIMENT_COLUMN

    if df is None or df.empty: logger.error("DataFrame 'df' is not available."); return "Error: The review dataset is not loaded or is empty."

    logger.info(f"Received product query: '{product_name_query}'"); analysis_start_time = time.time()

    # 1. Prepare Query and Perform Matching (Same as before)
    product_name_query_normalized = product_name_query.lower().strip(); product_reviews_df = pd.DataFrame()
    try:
        if PRODUCT_NAME_COLUMN not in df.columns: raise KeyError(f"Product name column '{PRODUCT_NAME_COLUMN}' not found.")
        df_normalized_names = df[PRODUCT_NAME_COLUMN].str.lower().str.strip()
        match_mask = (df_normalized_names == product_name_query_normalized); product_reviews_df = df[match_mask].copy()
        logger.info(f"Found {len(product_reviews_df)} reviews after matching.")
    except Exception as e: logger.error(f"Error during DataFrame filtering for '{product_name_query}': {e}", exc_info=True); safe_query = escape_markdown_v2(product_name_query); return f"❌ An internal error occurred while searching for the product: `{safe_query}`"
    if product_reviews_df.empty: safe_query = escape_markdown_v2(product_name_query); return f"❌ Sorry, I couldn't find any reviews for `{safe_query}`."
    matched_product_name = product_reviews_df[PRODUCT_NAME_COLUMN].iloc[0]; total_reviews_found = len(product_reviews_df)
    logger.info(f"Processing {total_reviews_found} reviews for: '{matched_product_name}'.")

    # 2. Filter Fake Reviews & Collect Data (Uses RELAXED V2 filter)
    genuine_reviews_data = []; fake_count = 0
    for index, row in product_reviews_df.iterrows():
        review_text = row.get(REVIEW_TEXT_COLUMN, ""); summary_text = row.get(SUMMARY_COLUMN, ""); rating = row.get(RATING_COLUMN, None); original_sentiment = row.get(ORIGINAL_SENTIMENT_COLUMN, None)
        if not is_likely_fake(review_text, rating, original_sentiment):
            if (review_text and str(review_text).strip()) or (summary_text and str(summary_text).strip()): genuine_reviews_data.append({"review_text": str(review_text), "summary_text": str(summary_text), "rating": rating})
            else: fake_count +=1
        else: fake_count += 1
    num_genuine_reviews = len(genuine_reviews_data); logger.info(f"Filtered {fake_count} reviews (using relaxed filter). Analyzing {num_genuine_reviews} genuine reviews.")
    if not genuine_reviews_data: safe_matched_name = escape_markdown_v2(matched_product_name); return f"Found {total_reviews_found} reviews for `{safe_matched_name}`, but all filtered out."

    # --- Prepare lists ---
    genuine_review_texts = [item['review_text'] for item in genuine_reviews_data if item['review_text']]; valid_ratings = [item['rating'] for item in genuine_reviews_data if isinstance(item['rating'], (int, float)) and pd.notna(item['rating'])]
    # No need for genuine_summary_texts list anymore

    # 3. Get Overall Sentiment (Hugging Face on Review text) - Same
    new_overall_sentiments = []
    if genuine_review_texts:
        logger.info("Starting overall sentiment analysis (HF)..."); hf_start = time.time()
        for text in genuine_review_texts: new_overall_sentiments.append(get_sentiment(text)['label'])
        logger.info(f"Overall sentiment analysis done in {time.time() - hf_start:.2f}s.")

    # 4. Generate Summary (Hugging Face on Review text) - Same
    summary_snippet = "Could not generate summary."
    if genuine_review_texts:
        logger.info("Generating summary (HF)..."); summ_start = time.time()
        full_genuine_review_text = "\n".join(genuine_review_texts); summary_snippet = generate_summary(full_genuine_review_text, max_len=100, min_len=20)
        logger.info(f"Summary generation done in {time.time() - summ_start:.2f}s.")

    # 5. Calculate Overall Verdict - Same
    verdict = calculate_overall_verdict(new_overall_sentiments)

    # 6. Calculate Average Rating - Same
    average_rating = np.mean(valid_ratings) if valid_ratings else None; avg_rating_str = f"{average_rating:.1f} / 5 ★" if average_rating is not None else "N/A"

    # 7. *** Pro/Con Extraction SKIPPED ***
    logger.info("Skipping Pro/Con extraction as requested.")

    # 8. Format the Response (REMOVED Pro/Con Sections)
    safe_matched_name = escape_markdown_v2(matched_product_name); response = f"📊 *Analysis for:* `{safe_matched_name}`\n\n"; response += f"⭐ *Avg\\. Rating \\(Genuine\\):* {escape_markdown_v2(avg_rating_str)}\n"; response += f"⚖️ *Overall Verdict \\(from Reviews\\):* {escape_markdown_v2(verdict)}\n"; response += f"_(Based on {num_genuine_reviews} genuine entries out of {total_reviews_found} total found\\. {fake_count} filtered\\.)_\n\n"

    # --- REMOVED Pro Section ---
    # --- REMOVED Con Section ---

    # --- Generated Summary Snippet ---
    formatted_summary = escape_markdown_v2(summary_snippet)
    response += f"📝 *Generated Summary Snippet \\(Reviews\\):*\n_{formatted_summary}_\n\n" # Add extra newline for spacing

    # --- Overall Sentiment Breakdown ---
    pos_count = new_overall_sentiments.count("Positive"); neg_count = new_overall_sentiments.count("Negative"); neu_count = new_overall_sentiments.count("Neutral"); total_overall_sentiments = len(new_overall_sentiments) if new_overall_sentiments else 1
    response += f"*Overall Sentiment Breakdown \\(Reviews\\):*\n"; response += f"  🟢 Positive: {pos_count} \\({pos_count/total_overall_sentiments:.0%}\\)\n"; response += f"  🔴 Negative: {neg_count} \\({neg_count/total_overall_sentiments:.0%}\\)\n"; response += f"  ⚪ Neutral:  {neu_count} \\({neu_count/total_overall_sentiments:.0%}\\)"

    analysis_duration = time.time() - analysis_start_time; logger.info(f"Analysis complete for '{matched_product_name}' in {analysis_duration:.2f}s. Response length: {len(response)}")
    if len(response) > 4096: logger.warning(f"Response exceeds 4096 chars ({len(response)}). Truncating."); response = response[:4090] + "\\.\\.\\."
    return response

# --- Bot Command Handlers, Error Handler, run_bot (No changes needed) ---
# ... (Keep the start, help_command, handle_message, error_handler, and run_bot functions exactly as they were, including the asyncio import in run_bot) ...

# --- Placeholder for other handlers/run_bot (use previous correct code) ---
async def start(update: telegram.Update, context: telegram.ext.ContextTypes.DEFAULT_TYPE):
    user = update.effective_user
    await update.message.reply_markdown_v2(
        rf"Hi {user.mention_markdown_v2()}\! 👋 Send me an exact Product Name from the dataset and I'll analyze its reviews\.",
    )

async def help_command(update: telegram.Update, context: telegram.ext.ContextTypes.DEFAULT_TYPE):
    help_text = ( # Updated help text - removed pro/con mention
        "*How to use:*\n"
        "1\\. Send me the *exact* Product Name\\.\n"
        "2\\. I find reviews & summaries\\.\n"
        "3\\. I filter fake reviews \\(relaxed\\)\\.\n"
        "4\\. I get overall sentiment \\(from Reviews\\)\\.\n"
        "5\\. I provide avg rating, verdict, and a *new* summary snippet \\(from Reviews\\)\\.\n\n" # Removed Pro/Con
        "_Exact name match needed\\!_"
    )
    await update.message.reply_markdown_v2(help_text)

async def handle_message(update: telegram.Update, context: telegram.ext.ContextTypes.DEFAULT_TYPE):
    user_input = update.message.text
    chat_id = update.effective_chat.id
    logger.info(f"Received message from chat {chat_id}: {user_input}")

    product_name = user_input.strip()
    if not product_name:
        await update.message.reply_text("⚠️ Please send a product name.")
        return

    processing_msg = None
    safe_product_name_escaped = escape_markdown_v2(product_name)

    try:
        processing_msg = await update.message.reply_text(
            f"⏳ Analyzing reviews for `{safe_product_name_escaped}`\\.\\.\\.",
            parse_mode=telegram.constants.ParseMode.MARKDOWN_V2
        )
    except telegram.error.BadRequest as e:
        logger.error(f"Failed to send 'Analyzing...' message for '{product_name}' using MarkdownV2: {e}")
        try:
            processing_msg = await update.message.reply_text(
                f"⏳ Analyzing reviews for '{product_name}'..."
            )
        except Exception as fallback_e:
            logger.error(f"Failed to send even plain text 'Analyzing...' message: {fallback_e}")
            try:
                await update.message.reply_text("⚠️ Error initiating analysis. Please try again.")
            except Exception:
                pass
            return
    except Exception as e:
        logger.error(f"Unexpected error sending 'Analyzing...' message: {e}", exc_info=True)
        try:
            await update.message.reply_text("⚠️ Error initiating analysis. Please try again.")
        except Exception:
            pass
        return

    if not processing_msg:
        logger.error("Failed to create the processing message. Aborting analysis.")
        return

    handler_start_time = time.time()
    analysis_result = await analyze_product(product_name)
    handler_duration = time.time() - handler_start_time
    logger.info(f"analyze_product for '{product_name}' took {handler_duration:.2f} seconds (within handler).")

    try:
        await processing_msg.edit_text(
            analysis_result,
            parse_mode=telegram.constants.ParseMode.MARKDOWN_V2
        )
    except telegram.error.BadRequest as e:
        error_message_lower = str(e).lower()

        if "message is not modified" in error_message_lower:
            logger.warning(f"Message not modified for '{product_name}'.")
        elif any(err_str in error_message_lower for err_str in [
            "can't parse entities", "unmatched closing tag", "wrong closing tag",
            "too long", "is reserved and must be escaped"
        ]):
            logger.warning(f"MarkdownV2 parsing failed for result of '{product_name}'. Retrying with plain text. Error: {e}")
            plain_result = re.sub(r'[\\*_`~\[\]()#+=|{}.!-]', '', analysis_result)
            if len(plain_result) > 4096:
                plain_result = plain_result[:4090] + "..."
            try:
                await processing_msg.edit_text(plain_result)
            except Exception as final_e:
                logger.error(f"Failed to send even plain text result for '{product_name}': {final_e}")
                try:
                    await processing_msg.edit_text("❌ An error occurred displaying the results (plain text fallback failed).")
                except Exception:
                    pass
        else:
            logger.error(f"Telegram API BadRequest sending result for '{product_name}': {e}", exc_info=True)
            try:
                await processing_msg.edit_text("❌ An error occurred sending the analysis (API Error).")
            except Exception:
                pass
    except Exception as e:
        logger.error(f"Unexpected error editing message for '{product_name}': {e}", exc_info=True)
        try:
            await processing_msg.edit_text("❌ An unexpected error occurred displaying the results.")
        except Exception:
            pass


async def error_handler(update: object, context: telegram.ext.ContextTypes.DEFAULT_TYPE):
    # (Same implementation as before)
    logger.error(msg="Exception while handling an update:", exc_info=context.error)
    if isinstance(context.error, telegram.error.Conflict): logger.warning("Conflict error detected.")
    if isinstance(context.error, (telegram.error.NetworkError)) and 'Updater' in str(context.error): logger.error(f"Network error during polling: {context.error}")
    if update and isinstance(update, telegram.Update) and update.effective_message:
         try:
             if not isinstance(context.error, telegram.error.BadRequest) or \
                all(indicator not in str(context.error).lower() for indicator in ["message is not modified", "can't parse", "unmatched", "wrong tag"]):
                   await update.effective_message.reply_text("❌ Oops! Something went wrong. Please try again.")
         except Exception as e: logger.error(f"Failed to send generic error message to user during error handling: {e}")

async def run_bot():
    # (Same implementation as before, including the corrected shutdown logic)
    import asyncio; from telegram.ext import Application, CommandHandler, MessageHandler, filters; import telegram
    if BOT_TOKEN == "YOUR_BOT_TOKEN" or not BOT_TOKEN or len(BOT_TOKEN.split(':')) != 2: print("[-] ERROR: BOT_TOKEN invalid!"); logger.error("Bot Token is invalid or missing!"); return
    print("[+] Initializing Telegram Bot Application..."); application = Application.builder().token(BOT_TOKEN).build(); application.add_handler(CommandHandler("start", start)); application.add_handler(CommandHandler("help", help_command)); application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message)); application.add_error_handler(error_handler)
    try:
        print("[+] Initializing application..."); await application.initialize(); logger.info("Application initialized.")
        print("[+] Starting background tasks (polling)..."); await application.start(); await application.updater.start_polling(allowed_updates=telegram.Update.ALL_TYPES); logger.info("Application started and polling initiated.")
        print("\n[+] Bot is now running!"); print("[+] Send commands or product names to your bot on Telegram."); print("[+] Stop this cell execution in Colab/Jupyter (or Ctrl+C) to stop the bot."); await asyncio.Future() # Keep running
    except telegram.error.InvalidToken: logger.error("CRITICAL ERROR: Invalid Token.", exc_info=False); print("\n[!!!] ERROR: Invalid Token!")
    except telegram.error.NetworkError as ne: logger.error(f"CRITICAL Network error: {ne}", exc_info=True); print(f"\n[!!!] ERROR: Network error: {ne}")
    except telegram.error.Conflict as conf_err: logger.error(f"CRITICAL Conflict: {conf_err}", exc_info=True); print(f"\n[!!!] ERROR: Conflict: {conf_err}")
    except (KeyboardInterrupt, SystemExit, asyncio.CancelledError): logger.info("Stop signal received."); print("\n[!] Bot stopping...")
    except Exception as e: logger.error(f"Unexpected error during bot run: {e}", exc_info=True); print(f"\n[!!!] Bot stopping due to error: {e}")
    finally:
        logger.info("Initiating shutdown..."); print("[+] Shutting down bot...")
        if 'application' in locals() and application:
            if application.updater and application.updater.running: print("    - Stopping updater polling..."); await application.updater.stop(); logger.info("Updater stopped.")
            if application.running: print("    - Stopping application handlers..."); await application.stop(); logger.info("Application stopped.")
            print("    - Shutting down application resources..."); await application.shutdown(); logger.info("Application shut down.")
        print("[+] Bot shutdown complete.")

# --- (End of Step 6 Code Block) ---

In [None]:
# %% [code]
# STEP 7 CODE BLOCK (Removed NLTK check)

# --- Start the Bot ---

if __name__ == '__main__':
     print("\n--- Starting Bot Execution ---")
     # Make sure data is loaded and pipelines are potentially ready
     if 'df' in globals() and df is not None and not df.empty:
         print("[i] Dataframe loaded.")
         # Check if HF pipelines are loaded
         if 'sentiment_pipeline' in globals() and sentiment_pipeline:
             print("[i] Sentiment analysis pipeline (Hugging Face) ready.")
         else:
             print("[!] Warning: HF Sentiment pipeline not loaded. Overall sentiment results will be 'Neutral'.")
         if 'summarizer' in globals() and summarizer:
             print("[i] Summarization pipeline (Hugging Face) ready.")
         else:
             print("[!] Warning: Summarization pipeline not loaded. Summaries will be basic.")
         # No Pro/Con extraction, so no NLTK check needed here

         # Call the async function
         print("[i] Awaiting bot execution...")
         try:
            await run_bot() # Calls the function containing the main loop and shutdown logic
         except RuntimeError as e:
             if "cannot schedule new futures after shutdown" in str(e).lower():
                 print("[!] Event loop was likely already shut down.")
                 logger.warning("RuntimeError related to event loop shutdown caught.")
             else:
                 print(f"[!!!] Runtime Error occurred while awaiting run_bot: {e}")
                 logger.error("RuntimeError awaiting run_bot", exc_info=True)
         except Exception as e:
             print(f"[!!!] An unexpected error occurred while awaiting run_bot: {e}")
             logger.error("Error awaiting run_bot", exc_info=True)

     else:
         print("[-] ERROR: Dataframe 'df' not loaded or empty. Cannot start the bot.")
         print("[-] Please ensure Step 1 runs successfully and the file exists.")
         logger.error("Attempted to start bot without loaded data.")

     print("--- Bot Execution Block Finished ---")


--- Starting Bot Execution ---
[i] Dataframe loaded.
[i] Sentiment analysis pipeline (Hugging Face) ready.
[i] Summarization pipeline (Hugging Face) ready.
[i] Awaiting bot execution...
[+] Initializing Telegram Bot Application...
[+] Initializing application...
[+] Starting background tasks (polling)...

[+] Bot is now running!
[+] Send commands or product names to your bot on Telegram.
[+] Stop this cell execution in Colab/Jupyter (or Ctrl+C) to stop the bot.





[!] Bot stopping...
[+] Shutting down bot...
    - Stopping updater polling...
    - Stopping application handlers...
    - Shutting down application resources...
[+] Bot shutdown complete.
--- Bot Execution Block Finished ---
