In [2]:
"""
CLASS-5: Screener.in Ratio Data Extraction - FIXED VERSION
This version specifically targets the RATIOS TABLE (not peer comparison table)
Based on the actual structure showing years as columns and ratio names as rows
"""

import pandas as pd
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import logging
import re

# Configure logging output once, then create the logger shared by this module
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

class ScreenerRatioExtractor:
    def __init__(self):
        """Create the extractor and start its browser session via setup_driver()."""
        # Assigned before setup_driver() runs so the attribute exists even if
        # driver creation raises
        self.driver = None
        self.setup_driver()
        
    def setup_driver(self):
        """Start a Chrome session and store it on ``self.driver``.

        Any failure during start-up is logged and then re-raised to the caller.
        """
        # Remove '--headless' from this tuple to see the browser window
        chrome_flags = (
            '--headless',
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-gpu',
            '--window-size=1920,1080',
            '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        )
        try:
            browser_options = Options()
            for flag in chrome_flags:
                browser_options.add_argument(flag)

            self.driver = webdriver.Chrome(options=browser_options)
            # Abort any single page load that takes longer than 30 seconds
            self.driver.set_page_load_timeout(30)
            logger.info("✅ Chrome driver initialized successfully")

        except Exception as e:
            logger.error(f"❌ Failed to setup driver: {str(e)}")
            raise

    def extract_ratios_standalone(self, symbol):
        """Extract STANDALONE ratio data for a given symbol"""
        url = f"https://www.screener.in/company/{symbol}/#ratios"
        return self._extract_ratios_from_url(symbol, url, "Standalone")

    def extract_ratios_consolidated(self, symbol):
        """Extract CONSOLIDATED ratio data for a given symbol"""
        url = f"https://www.screener.in/company/{symbol}/consolidated/#ratios"
        return self._extract_ratios_from_url(symbol, url, "Consolidated")

    def _extract_ratios_from_url(self, symbol, url, data_type):
        """Extract ratio data from specific URL - FIXED to target correct table"""
        try:
            logger.info(f"🔍 Extracting {data_type} ratios for: {symbol}")
            logger.info(f"📍 URL: {url}")
            
            # Load the page
            self.driver.get(url)
            time.sleep(5)  # Wait for page load and React rendering
            
            # Wait for page to load properly
            wait = WebDriverWait(self.driver, 20)
            
            try:
                # Wait for the ratios section to load
                wait.until(
                    EC.any_of(
                        EC.presence_of_element_located((By.XPATH, "//section[@id='ratios']")),
                        EC.presence_of_element_located((By.XPATH, "//h2[text()='Ratios']")),
                        EC.presence_of_element_located((By.CSS_SELECTOR, "[data-reactroot]"))
                    )
                )
                logger.info(f"✅ {data_type} page loaded successfully")
            except TimeoutException:
                logger.warning(f"⚠️ Page load timeout for {symbol} - {data_type}")
            
            # Extract the CORRECT ratio table data (not peer table)
            ratio_data = self.extract_correct_ratio_table(symbol, data_type)
            
            if ratio_data:
                logger.info(f"✅ Successfully extracted {len(ratio_data)} ratio records for {symbol} - {data_type}")
                return ratio_data
            else:
                logger.warning(f"⚠️ No ratio data found for {symbol} - {data_type}")
                return [{'Symbol': symbol, 'Data_Type': data_type, 'Extraction_Status': 'No_Data_Found'}]
                
        except Exception as e:
            logger.error(f"❌ Error extracting ratios for {symbol} - {data_type}: {str(e)}")
            return [{'Symbol': symbol, 'Data_Type': data_type, 'Extraction_Status': f'Error: {str(e)}'}]

    def extract_correct_ratio_table(self, symbol, data_type):
        """Locate the ratios table on the loaded page and return its parsed rows.

        Three strategies are tried in order; the first one that produces parsed
        records wins:

        1. Tables inside the dedicated ratios section.
        2. Any table whose text contains at least two ratio keywords.
        3. Any table that does not look like a peer comparison and whose first
           row shows at least three "<month> <year>" columns.

        Args:
            symbol: Company symbol, copied into every returned record.
            data_type: Label such as "Standalone"/"Consolidated", copied into
                every returned record.

        Returns:
            The list of record dicts produced by ``parse_ratio_table``, or an
            empty list when no table qualifies or an unexpected error occurs.
        """
        section_selectors = (
            "//section[@id='ratios']",
            "//div[@id='ratios']",
            "//h2[text()='Ratios']/following-sibling::div",
            "//h2[contains(text(), 'Ratio')]/following-sibling::div",
            "//div[contains(@class, 'ratios')]",
        )
        # Compared against lower-cased table text
        ratio_keywords = (
            'debtor days', 'inventory days', 'days payable',
            'cash conversion', 'working capital', 'roce %',
            'mar 20', 'mar 19', 'mar 21',  # Year indicators
        )
        peer_indicators = ('reliance industr', 'i o c l', 'name', 'cmp rs', 'p/e', 'mar cap')
        # Accept any month abbreviation so non-March year-ends are recognised too
        period_re = re.compile(
            r'\b(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\s*\d{4}',
            re.IGNORECASE,
        )

        try:
            # STRATEGY 1: Look for the ratios section specifically
            logger.info("🎯 Strategy 1: Looking for ratios section...")
            ratios_section = None
            for selector in section_selectors:
                try:
                    ratios_section = self.driver.find_element(By.XPATH, selector)
                except NoSuchElementException:
                    continue
                logger.info(f"✅ Found ratios section using: {selector}")
                break

            if ratios_section is not None:
                tables_in_section = ratios_section.find_elements(By.CSS_SELECTOR, "table")
                if tables_in_section:
                    logger.info(f"📊 Found {len(tables_in_section)} tables in ratios section")
                for table in tables_in_section:
                    ratio_data = self.parse_ratio_table(table, symbol, data_type)
                    if ratio_data:
                        return ratio_data
                # Nothing usable in the section: keep going instead of returning
                # an empty result, so the fallback strategies still get a chance
                logger.warning("⚠️ Ratios section yielded no usable table - trying fallbacks")

            # STRATEGY 2: Look for tables with ratio-specific content
            logger.info("🎯 Strategy 2: Looking for tables with ratio content...")
            all_tables = self.driver.find_elements(By.CSS_SELECTOR, "table")
            logger.info(f"📊 Found {len(all_tables)} total tables on page")

            # Indices already parsed without result; no point re-parsing them later
            fruitless = set()

            for i, table in enumerate(all_tables):
                try:
                    table_text = table.text.lower()
                    keyword_matches = sum(1 for keyword in ratio_keywords if keyword in table_text)

                    if keyword_matches >= 2:  # At least 2 ratio keywords
                        logger.info(f"✅ Found ratio table (Table {i+1}) with {keyword_matches} ratio keywords")
                        ratio_data = self.parse_ratio_table(table, symbol, data_type)
                        if ratio_data:
                            return ratio_data
                        fruitless.add(i)

                except Exception as e:
                    logger.warning(f"⚠️ Error checking table {i+1}: {str(e)}")
                    continue

            # STRATEGY 3: Skip peer comparison tables explicitly
            logger.info("🎯 Strategy 3: Filtering out peer comparison tables...")
            for i, table in enumerate(all_tables):
                if i in fruitless:
                    continue
                try:
                    table_text = table.text.lower()

                    peer_matches = sum(1 for indicator in peer_indicators if indicator in table_text)
                    if peer_matches >= 2:
                        logger.info(f"❌ Skipping table {i+1} - appears to be peer comparison table")
                        continue

                    # A time series shows several period labels in its first row
                    rows = table.find_elements(By.CSS_SELECTOR, "tr")
                    if len(rows) > 1:
                        year_pattern = period_re.findall(rows[0].text)

                        if len(year_pattern) >= 3:  # At least 3 year columns
                            logger.info(f"✅ Found time-series table (Table {i+1}) with {len(year_pattern)} year columns")
                            ratio_data = self.parse_ratio_table(table, symbol, data_type)
                            if ratio_data:
                                return ratio_data

                except Exception as e:
                    logger.warning(f"⚠️ Error in strategy 3 for table {i+1}: {str(e)}")
                    continue

            logger.warning("⚠️ No suitable ratio table found")
            return []

        except Exception as e:
            logger.error(f"❌ Error in extract_correct_ratio_table: {str(e)}")
            return []

    def parse_ratio_table(self, table, symbol, data_type):
        """Convert a ratios table (periods as columns, ratios as rows) to records.

        The first row is treated as the header row and the first column as the
        ratio name. Header cells are matched to data cells by position, so the
        header row is expected to have one cell per column.

        Args:
            table: Table element exposing ``find_elements``.
            symbol: Company symbol, copied into every record.
            data_type: Label such as "Standalone"/"Consolidated", copied into
                every record.

        Returns:
            One dict per ratio row with 'Symbol', 'Data_Type', 'Ratio_Name',
            'Extraction_Status' and one key per labelled period column
            (spaces and slashes replaced by underscores). Returns an empty
            list when the table does not look like a time series or an
            unexpected error occurs.
        """
        try:
            rows = table.find_elements(By.CSS_SELECTOR, "tr")
            if len(rows) < 2:
                logger.warning("⚠️ Table has insufficient rows")
                return []

            # Keep one entry per header cell, blanks included. The ratio-name
            # column may have an empty header; dropping it would shift every
            # period label by one column and lose the last value of each row.
            header_texts = [
                cell.text.strip()
                for cell in rows[0].find_elements(By.CSS_SELECTOR, "th, td")
            ]
            headers = [text for text in header_texts if text]
            logger.info(f"📋 Table headers: {headers}")

            # Validate this looks like a ratios table
            if len(headers) < 4:
                logger.warning("⚠️ Table doesn't have enough columns for time series")
                return []

            # A header counts as a period column when it carries a 4-digit year
            year_headers = [h for h in headers if re.search(r'\d{4}', h)]
            if len(year_headers) < 2:
                logger.warning("⚠️ Table doesn't contain sufficient year columns")
                return []

            logger.info(f"✅ Valid ratio table found with {len(year_headers)} year columns")

            # Column 0 is the ratio name; the rest are period columns, by position
            period_columns = [
                text.replace(' ', '_').replace('/', '_') for text in header_texts[1:]
            ]
            header_like_names = {'ratio', 'metric', 'period'}
            missing_markers = {'', '-', 'N/A'}

            ratio_records = []
            for row_idx, row in enumerate(rows[1:], 1):
                try:
                    cell_values = [
                        cell.text.strip()
                        for cell in row.find_elements(By.CSS_SELECTOR, "td, th")
                    ]
                    # Skip rows with no cells or no ratio name
                    if not cell_values or not cell_values[0]:
                        continue

                    ratio_name = cell_values[0]
                    if ratio_name.lower() in header_like_names:
                        continue

                    record = {
                        'Symbol': symbol,
                        'Data_Type': data_type,
                        'Ratio_Name': ratio_name,
                        'Extraction_Status': 'Success'
                    }

                    for column, value in zip(period_columns, cell_values[1:]):
                        if not column:
                            # Unlabelled column: there is no key to store it under
                            continue
                        record[column] = 'N/A' if value in missing_markers else value

                    ratio_records.append(record)
                    logger.info(f"✅ Extracted ratio: {ratio_name}")

                except Exception as e:
                    logger.warning(f"⚠️ Error processing row {row_idx}: {str(e)}")
                    continue

            return ratio_records

        except Exception as e:
            logger.error(f"❌ Error parsing ratio table: {str(e)}")
            return []

    def process_symbols_file(self, symbols_file, output_dir, extract_both=True):
        """Process symbols from file and extract ratios"""
        try:
            # Create output directory
            os.makedirs(output_dir, exist_ok=True)
            
            # Read symbols file
            logger.info(f"📖 Reading symbols from: {symbols_file}")
            symbols_df = pd.read_csv(symbols_file)
            
            # Debug: Show file structure
            logger.info(f"📋 CSV Columns found: {list(symbols_df.columns)}")
            logger.info(f"📋 CSV Shape: {symbols_df.shape}")
            
            # Find symbol column
            symbol_column = None
            possible_names = ['Symbol', 'SYMBOL', 'symbol', 'Symbols', 'Stock', 'Ticker', 'Company']
            
            for col_name in possible_names:
                if col_name in symbols_df.columns:
                    symbol_column = col_name
                    break
            
            if symbol_column is None:
                symbol_column = symbols_df.columns[0]
                logger.warning(f"⚠️ Using first column as symbol: '{symbol_column}'")
            
            symbols = [str(sym).strip() for sym in symbols_df[symbol_column].tolist() if pd.notna(sym)]
            logger.info(f"📊 Found {len(symbols)} valid symbols to process")
            
            # Process each symbol
            all_ratios = []
            processed = 0
            
            for symbol in symbols:
                try:
                    logger.info(f"\n{'='*60}")
                    logger.info(f"📈 Processing {processed + 1}/{len(symbols)}: {symbol}")
                    
                    if extract_both:
                        # Extract both standalone and consolidated
                        standalone_data = self.extract_ratios_standalone(symbol)
                        consolidated_data = self.extract_ratios_consolidated(symbol)
                        
                        all_ratios.extend(standalone_data)
                        all_ratios.extend(consolidated_data)
                    else:
                        # Extract only standalone
                        standalone_data = self.extract_ratios_standalone(symbol)
                        all_ratios.extend(standalone_data)
                    
                    processed += 1
                    
                    # Add delay between requests
                    time.sleep(3)
                    
                    # Save progress every 5 symbols
                    if processed % 5 == 0:
                        self.save_progress(all_ratios, output_dir, f"progress_{processed}")
                        
                except KeyboardInterrupt:
                    logger.info("🛑 Process interrupted by user")
                    break
                    
                except Exception as e:
                    logger.error(f"❌ Error processing {symbol}: {str(e)}")
                    all_ratios.append({
                        'Symbol': symbol, 
                        'Data_Type': 'Error',
                        'Extraction_Status': f'Error: {str(e)}'
                    })
                    continue
            
            # Save final results
            self.save_final_results(all_ratios, output_dir)
            
            logger.info(f"\n🎉 EXTRACTION COMPLETE!")
            logger.info(f"📊 Processed: {processed}/{len(symbols)} symbols")
            
        except Exception as e:
            logger.error(f"❌ Error in main processing: {str(e)}")
            raise

    def save_progress(self, data, output_dir, filename):
        """Save progress data"""
        try:
            df = pd.DataFrame(data)
            filepath = os.path.join(output_dir, f"{filename}.csv")
            df.to_csv(filepath, index=False)
            logger.info(f"💾 Progress saved: {filepath}")
        except Exception as e:
            logger.error(f"❌ Error saving progress: {str(e)}")

    def save_final_results(self, data, output_dir):
        """Save final results in multiple formats"""
        try:
            df = pd.DataFrame(data)
            
            # Main output file
            main_file = os.path.join(output_dir, "Screener_Ratios_Complete.csv")
            df.to_csv(main_file, index=False)
            logger.info(f"💾 Main results saved: {main_file}")
            
            # Separate standalone and consolidated files
            standalone_df = df[df['Data_Type'] == 'Standalone']
            consolidated_df = df[df['Data_Type'] == 'Consolidated']
            
            if not standalone_df.empty:
                standalone_file = os.path.join(output_dir, "Screener_Ratios_Standalone.csv")
                standalone_df.to_csv(standalone_file, index=False)
                logger.info(f"✅ Standalone results saved: {standalone_file}")
            
            if not consolidated_df.empty:
                consolidated_file = os.path.join(output_dir, "Screener_Ratios_Consolidated.csv")
                consolidated_df.to_csv(consolidated_file, index=False)
                logger.info(f"✅ Consolidated results saved: {consolidated_file}")
            
            # Success summary
            success_df = df[df['Extraction_Status'] == 'Success']
            error_df = df[df['Extraction_Status'].str.contains('Error', na=False)]
            
            total = len(df)
            success_count = len(success_df)
            error_count = len(error_df)
            
            logger.info(f"\n📈 FINAL SUMMARY:")
            logger.info(f"Total Records: {total}")
            logger.info(f"Successful: {success_count} ({success_count/total*100:.1f}%)")
            logger.info(f"Errors: {error_count} ({error_count/total*100:.1f}%)")
            
        except Exception as e:
            logger.error(f"❌ Error saving final results: {str(e)}")

    def __del__(self):
        """Cleanup driver"""
        if self.driver:
            self.driver.quit()

def test_single_symbol(symbol="RELIANCE", data_type="Consolidated"):
    """Run one extraction for a single symbol and log a sample of the result.

    Args:
        symbol: Company symbol to scrape.
        data_type: "Consolidated" (case-insensitive) selects the consolidated
            page; any other value selects the standalone page.

    Returns:
        The list of records returned by the extractor, or an empty list when
        the test itself fails.
    """
    logger.info(f"🧪 Testing single symbol extraction: {symbol} - {data_type}")

    extractor = None
    try:
        extractor = ScreenerRatioExtractor()

        if data_type.lower() == "consolidated":
            results = extractor.extract_ratios_consolidated(symbol)
        else:
            results = extractor.extract_ratios_standalone(symbol)

        logger.info(f"✅ Test completed. Got {len(results)} records")

        # Show sample results
        if results:
            logger.info("📋 Sample results:")
            for i, result in enumerate(results[:3]):  # Show first 3 records
                logger.info(f"  Record {i+1}: {result}")

        return results

    except Exception as e:
        logger.error(f"❌ Test failed: {str(e)}")
        return []

    finally:
        # Close the browser explicitly rather than relying on garbage collection
        if extractor is not None and extractor.driver:
            try:
                extractor.driver.quit()
            except Exception as e:
                logger.warning(f"⚠️ Error closing browser: {str(e)}")
            # Cleared so the extractor's own cleanup does not quit it again
            extractor.driver = None

def main():
    """Run the batch extraction with the paths configured below.

    Logs the configuration, checks that the symbols file exists, processes
    every symbol and always closes the browser before returning.
    """
    # Configuration
    SYMBOLS_FILE = "E:/JN/TestSymbol.csv"  # Your symbols file
    OUTPUT_DIR = "E:/JN/Ratios_Output/"     # Output directory
    EXTRACT_BOTH = True  # Set to False if you only want standalone ratios

    logger.info("🚀 Starting CLASS-5: Screener.in Ratio Extraction (FIXED VERSION)")
    logger.info(f"📁 Symbols file: {SYMBOLS_FILE}")
    logger.info(f"📁 Output directory: {OUTPUT_DIR}")
    logger.info(f"🔄 Extract both standalone & consolidated: {EXTRACT_BOTH}")

    # TEST MODE: Uncomment this to test with single symbol first
    # logger.info("🧪 RUNNING IN TEST MODE")
    # test_results = test_single_symbol("RELIANCE", "Consolidated")
    # return

    # Verify files exist
    if not os.path.exists(SYMBOLS_FILE):
        logger.error(f"❌ Symbols file not found: {SYMBOLS_FILE}")
        return

    extractor = None
    try:
        # Initialize extractor
        extractor = ScreenerRatioExtractor()

        # Process symbols
        extractor.process_symbols_file(SYMBOLS_FILE, OUTPUT_DIR, EXTRACT_BOTH)

    except KeyboardInterrupt:
        logger.info("🛑 Process interrupted by user")
    except Exception as e:
        logger.error(f"❌ Fatal error: {str(e)}")
        import traceback
        logger.error(f"📋 Full error details:\n{traceback.format_exc()}")
    finally:
        # Close the browser explicitly rather than relying on garbage collection
        if extractor is not None and extractor.driver:
            try:
                extractor.driver.quit()
            except Exception as e:
                logger.warning(f"⚠️ Error closing browser: {str(e)}")
            # Cleared so the extractor's own cleanup does not quit it again
            extractor.driver = None
        logger.info("🏁 Process ended")

# Entry point: run the batch extraction when this file is executed directly
if __name__ == "__main__":
    main()

2025-09-01 16:46:53,742 - INFO - 🚀 Starting CLASS-5: Screener.in Ratio Extraction (FIXED VERSION)
2025-09-01 16:46:53,744 - INFO - 📁 Symbols file: E:/JN/TestSymbol.csv
2025-09-01 16:46:53,746 - INFO - 📁 Output directory: E:/JN/Ratios_Output/
2025-09-01 16:46:53,748 - INFO - 🔄 Extract both standalone & consolidated: True
2025-09-01 16:46:55,748 - INFO - ✅ Chrome driver initialized successfully
2025-09-01 16:46:55,753 - INFO - 📖 Reading symbols from: E:/JN/TestSymbol.csv
2025-09-01 16:46:55,759 - INFO - 📋 CSV Columns found: ['SYMBOL']
2025-09-01 16:46:55,762 - INFO - 📋 CSV Shape: (1, 1)
2025-09-01 16:46:55,765 - INFO - 📊 Found 1 valid symbols to process
2025-09-01 16:46:55,769 - INFO - 
2025-09-01 16:46:55,773 - INFO - 📈 Processing 1/1: RELIANCE
2025-09-01 16:46:55,781 - INFO - 🔍 Extracting Standalone ratios for: RELIANCE
2025-09-01 16:46:55,784 - INFO - 📍 URL: https://www.screener.in/company/RELIANCE/#ratios
2025-09-01 16:47:02,535 - INFO - ✅ Standalone page loaded successfully
2025-09-