In [2]:
"""
SCREENER.IN PROJECT - CLASS-2 SIMPLIFIED: Profit & Loss Data Extractor
=====================================================================
CLEAN FORMAT: Matches CLASS-1 structure for easy reading
Features: Annual P&L data, Selenium automation, RAW PDF filtering, Timer system
Author: Your Assistant | Status: CLASS-2 Simplified Implementation
"""

import pandas as pd
import time
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import os
from datetime import datetime

class CLASS2_PL_Simple:
    def __init__(self, csv_path="E:\\JN\\TestSymbol.csv"):
        self.csv_path = csv_path
        self.base_url = "https://www.screener.in/company/"
        self.driver = None
        self.start_time = None
        self.processed_count = 0
        self.success_count = 0
        self.error_count = 0
        
        # Initialize results storage (simple format like CLASS-1)
        self.consolidated_data = []
        self.standalone_data = []
        
        print("🎯 CLASS-2 SIMPLIFIED: Clean P&L Extractor Initialized")
        print("=" * 60)
    
    def setup_driver(self):
        """Start the headless Chrome driver and store it on ``self.driver``.

        Returns:
            True when the driver was created, False when creation failed
            (the error is printed).
        """
        browser_options = Options()
        for flag in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
        ):
            browser_options.add_argument(flag)

        try:
            self.driver = webdriver.Chrome(options=browser_options)
            print("✅ Chrome driver initialized successfully")
        except Exception as exc:
            print(f"❌ Driver setup failed: {exc}")
            return False
        return True
    
    def start_timer(self):
        """Start execution timer"""
        self.start_time = time.time()
        print(f"⏰ Timer started: {datetime.now().strftime('%H:%M:%S')}")
    
    def get_elapsed_time(self):
        """Get elapsed time in formatted string"""
        if self.start_time:
            elapsed = time.time() - self.start_time
            minutes, seconds = divmod(int(elapsed), 60)
            return f"{minutes:02d}:{seconds:02d}"
        return "00:00"
    
    def print_progress(self, symbol, status):
        """Print progress with timer"""
        elapsed = self.get_elapsed_time()
        print(f"[{elapsed}] {self.processed_count}/{len(self.symbols)} - {symbol}: {status}")
    
    def is_raw_pdf_link(self, text):
        """Check if text indicates a RAW PDF link (to be skipped)"""
        raw_indicators = [
            'download pdf', 'pdf report', 'annual report pdf',
            'view pdf', 'raw pdf', 'original pdf', 'full pdf'
        ]
        return any(indicator in text.lower() for indicator in raw_indicators)
    
    def click_expandable_rows(self, section):
        """Click the expand ("+") controls inside ``section`` to reveal nested rows.

        The section is scanned up to five times so that controls revealed by
        an earlier expansion are picked up as well. Each control is clicked at
        most once: clicking it again on a later pass (or twice in one pass
        because several selectors matched it) would toggle the row back.

        Args:
            section: Selenium element to search for expand controls.

        Returns:
            Number of distinct controls that were clicked.
        """
        expanded_count = 0
        max_attempts = 5

        # Candidate CSS selectors for expand controls
        plus_selectors = [
            "button[data-toggle]",
            ".btn[data-toggle]",
            "button.btn-sm",
            ".expandable",
            "button[title*='expand']",
            "button[onclick*='expand']"
        ]

        # Controls clicked so far. A list with `in` is used (element equality)
        # rather than a set; the number of controls is small.
        already_clicked = []

        for attempt in range(max_attempts):
            try:
                candidates = []
                for selector in plus_selectors:
                    try:
                        found = section.find_elements(By.CSS_SELECTOR, selector)
                    except Exception:
                        # Best effort: a selector that fails is simply skipped
                        continue
                    for element in found:
                        # Selectors overlap; keep each element only once
                        if element not in candidates:
                            candidates.append(element)

                # Also look for buttons whose text contains '+'
                try:
                    for btn in section.find_elements(By.TAG_NAME, "button"):
                        if '+' in btn.text and btn not in candidates:
                            candidates.append(btn)
                except Exception:
                    pass

                pending = [el for el in candidates if el not in already_clicked]
                if not pending:
                    break

                for plus_sign in pending:
                    try:
                        if plus_sign.is_displayed() and plus_sign.is_enabled():
                            self.driver.execute_script("arguments[0].click();", plus_sign)
                            already_clicked.append(plus_sign)
                            expanded_count += 1
                            time.sleep(0.3)  # Small delay
                    except Exception:
                        # e.g. the element went stale after a DOM update
                        continue

                # Wait for content to load
                time.sleep(1)

            except Exception as e:
                if attempt == 0:  # Only print first error
                    print(f"   ⚠️ Expand error: {str(e)[:40]}...")
                break

        return expanded_count
    
    def extract_pl_simple(self, symbol, url_type):
        """Scrape the profit & loss table of one company page into flat records.

        Args:
            symbol: Company symbol appended to ``self.base_url``.
            url_type: ``"consolidated"`` loads ``<symbol>/consolidated/``; any
                other value loads the plain ``<symbol>/`` page.

        Returns:
            A ``(records, message)`` tuple. ``records`` is a list with one dict
            per usable table row (keys ``Symbol``, ``URL_Type``, ``Section``,
            ``Row_Label``, ``Extracted_At`` plus one key per non-empty header
            column), or ``None`` when the section, table or header could not
            be read or any other error occurred. ``message`` describes the
            outcome.
        """
        try:
            # Construct URL
            if url_type == "consolidated":
                url = f"{self.base_url}{symbol}/consolidated/"
            else:
                url = f"{self.base_url}{symbol}/"

            self.driver.get(url)

            # Wait for page to load
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Find the profit-loss section
            try:
                pl_section = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.ID, "profit-loss"))
                )
            except TimeoutException:
                return None, "P&L section not found"

            # Expand all + signs in the section
            expanded_count = self.click_expandable_rows(pl_section)
            if expanded_count > 0:
                print(f"   📂 Expanded {expanded_count} rows")
                time.sleep(2)  # Wait for all expansions to complete

            # Find the main P&L table
            try:
                table = pl_section.find_element(By.CSS_SELECTOR, "table, .data-table, .table-responsive table")
            except NoSuchElementException:
                return None, "P&L table not found in section"

            # Get all rows
            rows = table.find_elements(By.TAG_NAME, "tr")
            if len(rows) < 2:
                return None, "Insufficient P&L data rows"

            # Extract header row (years)
            header_cells = rows[0].find_elements(By.TAG_NAME, "th")
            if not header_cells:
                header_cells = rows[0].find_elements(By.TAG_NAME, "td")

            if not header_cells:
                return None, "No header found"

            # Keep each usable header together with its column position.
            # Dropping empty header cells without remembering where they were
            # (e.g. the blank cell above the row-label column) shifts every
            # value one column to the left of its header.
            columns = []
            for index, cell in enumerate(header_cells):
                header_text = cell.text.strip()
                if header_text and not self.is_raw_pdf_link(header_text):
                    columns.append((index, header_text))

            if not columns:
                return None, "No header found"

            headers = [text for _, text in columns]
            print(f"   ✅ Found {len(headers)} year columns: {headers[:3]}...{headers[-2:]}")

            # Extract data rows in SIMPLE format (like CLASS-1)
            extracted_data = []

            for i, row in enumerate(rows[1:], 1):  # Skip header row
                try:
                    cells = row.find_elements(By.TAG_NAME, "td")
                    if not cells:
                        continue

                    # Raw-PDF link cells are blanked, not removed, so the
                    # remaining cells stay under their own header.
                    cell_texts = []
                    for cell in cells:
                        cell_text = cell.text.strip()
                        cell_texts.append("" if self.is_raw_pdf_link(cell_text) else cell_text)

                    row_label = cell_texts[0]  # First column is the metric name
                    if not row_label or len(cell_texts) < len(columns):
                        continue

                    # Create simple record (like CLASS-1)
                    record = {
                        'Symbol': symbol,
                        'URL_Type': url_type,
                        'Section': 'Profit_Loss',
                        'Row_Label': row_label,
                        'Extracted_At': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    }

                    # Add each header column as a separate key, reading the
                    # cell at the header's own position.
                    for index, header in columns:
                        if index < len(cell_texts):
                            # 'Mar 2014' -> 'Mar-2014'
                            record[header.replace(' ', '-')] = cell_texts[index]

                    extracted_data.append(record)

                except Exception as e:
                    print(f"   ⚠️ Row {i} error: {str(e)[:30]}")
                    continue

            return extracted_data, f"Extracted {len(extracted_data)} P&L rows"

        except Exception as e:
            return None, f"Extraction error: {str(e)[:50]}"
    
    def process_symbol(self, symbol):
        """Process both consolidated and standalone P&L data for a symbol"""
        print(f"\n🔍 Processing: {symbol}")
        
        # Process consolidated data
        self.print_progress(symbol, "Extracting consolidated P&L...")
        cons_data, cons_msg = self.extract_pl_simple(symbol, "consolidated")
        
        if cons_data:
            self.consolidated_data.extend(cons_data)
            print(f"   ✅ Consolidated: {cons_msg}")
        else:
            print(f"   ❌ Consolidated: {cons_msg}")
        
        time.sleep(1)  # Delay between requests
        
        # Process standalone data
        self.print_progress(symbol, "Extracting standalone P&L...")
        stand_data, stand_msg = self.extract_pl_simple(symbol, "standalone")
        
        if stand_data:
            self.standalone_data.extend(stand_data)
            print(f"   ✅ Standalone: {stand_msg}")
        else:
            print(f"   ❌ Standalone: {stand_msg}")
        
        # Update counters
        if cons_data or stand_data:
            self.success_count += 1
        else:
            self.error_count += 1
    
    def save_results(self):
        """Save extracted P&L data to CSV files (simple format like CLASS-1)"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        # Save consolidated data
        if self.consolidated_data:
            cons_df = pd.DataFrame(self.consolidated_data)
            cons_filename = f"CLASS2_PL_Simple_Consolidated_{timestamp}.csv"
            cons_df.to_csv(cons_filename, index=False)
            print(f"💾 Consolidated P&L saved: {cons_filename} ({len(cons_df)} rows)")
        
        # Save standalone data
        if self.standalone_data:
            stand_df = pd.DataFrame(self.standalone_data)
            stand_filename = f"CLASS2_PL_Simple_Standalone_{timestamp}.csv"
            stand_df.to_csv(stand_filename, index=False)
            print(f"💾 Standalone P&L saved: {stand_filename} ({len(stand_df)} rows)")
        
        return len(self.consolidated_data) + len(self.standalone_data)
    
    def run_extraction(self):
        """Main execution method for CLASS-2 simplified P&L extraction"""
        print("🚀 Starting CLASS-2 SIMPLIFIED: Clean P&L Extraction")
        print("=" * 60)
        
        # Load symbols
        try:
            df = pd.read_csv(self.csv_path)
            self.symbols = df['Symbol'].tolist() if 'Symbol' in df.columns else df.iloc[:, 0].tolist()
            print(f"📊 Loaded {len(self.symbols)} symbols from CSV")
        except Exception as e:
            print(f"❌ CSV loading error: {e}")
            return False
        
        # Setup driver
        if not self.setup_driver():
            return False
        
        # Start processing
        self.start_timer()
        
        try:
            for i, symbol in enumerate(self.symbols):
                self.processed_count = i + 1
                
                try:
                    self.process_symbol(symbol.strip())
                    
                    # Progress update every 5 symbols
                    if self.processed_count % 5 == 0:
                        elapsed = self.get_elapsed_time()
                        print(f"\n📊 Progress: {self.processed_count}/{len(self.symbols)} | "
                              f"Success: {self.success_count} | Errors: {self.error_count} | "
                              f"Time: {elapsed}")
                
                except Exception as e:
                    print(f"   ❌ Symbol processing error: {str(e)[:50]}")
                    self.error_count += 1
                    continue
        
        finally:
            # Cleanup
            if self.driver:
                self.driver.quit()
                print("🔒 Browser closed")
        
        # Save results
        total_rows = self.save_results()
        
        # Final summary
        elapsed = self.get_elapsed_time()
        print("\n" + "=" * 60)
        print("🎊 CLASS-2 SIMPLIFIED P&L EXTRACTION COMPLETE!")
        print("=" * 60)
        print(f"📊 Total Symbols Processed: {self.processed_count}")
        print(f"✅ Successful Extractions: {self.success_count}")
        print(f"❌ Failed Extractions: {self.error_count}")
        print(f"💾 Total P&L Rows Extracted: {total_rows}")
        print(f"⏰ Total Execution Time: {elapsed}")
        print(f"🎯 Success Rate: {(self.success_count/self.processed_count)*100:.1f}%")
        
        return True

# Usage Example
if __name__ == "__main__":
    # Script entry point: build the extractor for the symbol list on disk,
    # run the full extraction, then announce completion.
    extractor = CLASS2_PL_Simple(csv_path="E:\\JN\\TestSymbol.csv")
    extractor.run_extraction()

    print("\n🎯 CLASS-2 Complete! Ready for CLASS-3: Balance Sheet!")

🎯 CLASS-2 SIMPLIFIED: Clean P&L Extractor Initialized
🚀 Starting CLASS-2 SIMPLIFIED: Clean P&L Extraction
📊 Loaded 1 symbols from CSV
✅ Chrome driver initialized successfully
⏰ Timer started: 21:31:37

🔍 Processing: RELIANCE
[00:00] 1/1 - RELIANCE: Extracting consolidated P&L...
   📂 Expanded 5 rows
   ✅ Found 13 year columns: ['Mar 2014', 'Mar 2015', 'Mar 2016']...['Mar 2025', 'TTM']
   ✅ Consolidated: Extracted 27 P&L rows
[00:17] 1/1 - RELIANCE: Extracting standalone P&L...
   📂 Expanded 5 rows
   ✅ Found 13 year columns: ['Mar 2014', 'Mar 2015', 'Mar 2016']...['Mar 2025', 'TTM']
   ✅ Standalone: Extracted 25 P&L rows
🔒 Browser closed
💾 Consolidated P&L saved: CLASS2_PL_Simple_Consolidated_20250830_213213.csv (27 rows)
💾 Standalone P&L saved: CLASS2_PL_Simple_Standalone_20250830_213213.csv (25 rows)

🎊 CLASS-2 SIMPLIFIED P&L EXTRACTION COMPLETE!
📊 Total Symbols Processed: 1
✅ Successful Extractions: 1
❌ Failed Extractions: 0
💾 Total P&L Rows Extracted: 52
⏰ Total Execution Time: 00: