In [2]:
#!/usr/bin/env python3
"""
SCREENER.IN PROJECT - CLASS-3: BALANCE SHEET EXTRACTION - JAVASCRIPT-AWARE VERSION
==================================================================================
Status: JavaScript-Aware Version - Properly handles dynamic + signs
Features: Waits for JS, Detects dynamic +, Smart expansion, Consolidated + Standalone

Author: Screener.in Project Class System
Version: 3.0 - JavaScript-Aware with Dynamic + Detection and Fixed Headers
"""

import requests
import pandas as pd
import time
from datetime import datetime
import os
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

class BalanceSheetExtractorJS:
    def __init__(self, use_selenium=True):
        """Create the extractor and prepare both fetch back-ends.

        Args:
            use_selenium: When true, a Selenium driver is started right away
                via ``setup_selenium``; that method flips the flag back to
                ``False`` if the browser cannot be started.
        """
        self.use_selenium = use_selenium
        self.driver = None

        # Plain HTTP session used whenever the Selenium path is not available.
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        http_session = requests.Session()
        http_session.headers.update(browser_headers)
        self.session = http_session

        if not self.use_selenium:
            return
        self.setup_selenium()
    
    def setup_selenium(self):
        """Start a headless Chrome driver and store it on ``self.driver``.

        Any failure is reported on stdout and disables the Selenium path by
        setting ``self.use_selenium`` to ``False``; nothing is raised.
        """
        command_line_flags = (
            '--headless',  # run without a visible window
            '--no-sandbox',
            '--disable-dev-shm-usage',
            '--disable-blink-features=AutomationControlled',
        )

        try:
            browser_options = Options()
            for flag in command_line_flags:
                browser_options.add_argument(flag)
            browser_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            browser_options.add_experimental_option('useAutomationExtension', False)

            browser = webdriver.Chrome(options=browser_options)
            self.driver = browser
            browser.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

            # Generous timeouts: the target pages are script-heavy.
            browser.implicitly_wait(10)
            browser.set_page_load_timeout(30)

            print("✅ Selenium WebDriver initialized with JS support")

        except Exception as err:
            print(f"⚠️ Selenium setup failed: {err}")
            print("📝 Falling back to basic extraction")
            self.use_selenium = False
    
    def load_symbols(self, csv_path):
        """Load company symbols from a CSV file.

        The ``Symbol`` column is used when present, otherwise the first
        column. Values are read as text so numeric codes keep their exact
        spelling (no ``500325.0``), missing cells are dropped and surrounding
        whitespace is removed, because each symbol is later interpolated
        into a URL.

        Args:
            csv_path: Path of the CSV file to read.

        Returns:
            A list of non-empty symbol strings; an empty list when the file
            cannot be read or parsed (the error is printed, not raised).
        """
        try:
            # dtype=str stops pandas from turning codes into ints/floats.
            df = pd.read_csv(csv_path, dtype=str)
            column = df['Symbol'] if 'Symbol' in df.columns else df.iloc[:, 0]

            cleaned = column.dropna().astype(str).str.strip()
            symbols = [symbol for symbol in cleaned if symbol]

            print(f"✅ Loaded {len(symbols)} symbols from {csv_path}")
            return symbols
        except Exception as e:
            print(f"❌ Error loading symbols: {e}")
            return []
    
    def wait_for_dynamic_content(self, url):
        """Load ``url`` in the Selenium driver and wait for the "+" expanders.

        The page is considered ready once a "+" marker follows the
        "Borrowings" or "Other Liabilities" label. The probe is polled up to
        ten times, two seconds apart; if the marker never shows up the method
        carries on with whatever the page contains.

        Args:
            url: Address of the company page to open.

        Returns:
            The driver's current page source (also returned after a timeout
            or any other error, which are reported on stdout).
        """
        def plus_markers_present(html):
            # Original literal probe first, so anything it matched still matches.
            if 'Borrowings+' in html or 'Other Liabilities+' in html:
                return True
            # NOTE(review): the "+" is presumably rendered in its own element
            # and/or separated from the label by a non-breaking space, which
            # the literal probe above can never see. Strip tags, nbsp entities
            # and whitespace before comparing — confirm against live markup.
            visible = re.sub(r'<[^>]*>', '', html)
            visible = re.sub(r'&nbsp;|&#160;|&#xa0;|\s+', '', visible, flags=re.IGNORECASE)
            return 'Borrowings+' in visible or 'OtherLiabilities+' in visible

        try:
            print("🔍 Loading page and waiting for JavaScript...")
            self.driver.get(url)

            # Wait for initial page load
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )

            # Give the page scripts time to run before probing for markers
            print("⏳ Waiting for JavaScript to add dynamic content...")
            time.sleep(5)

            max_attempts = 10
            for attempt in range(max_attempts):
                try:
                    if plus_markers_present(self.driver.page_source):
                        print(f"✅ Dynamic + signs detected after {attempt + 1} attempts")
                        break
                    print(f"⏳ Attempt {attempt + 1}: Waiting for + signs to appear...")
                except Exception as e:
                    # Report instead of silently retrying in a tight loop.
                    print(f"⚠️ Attempt {attempt + 1}: could not read page source: {e}")
                time.sleep(2)
            else:
                print("⚠️ + signs not detected, proceeding with page as loaded")

            # Additional wait to ensure all JavaScript is complete
            time.sleep(3)

            return self.driver.page_source

        except TimeoutException:
            print("⚠️ Page load timeout, proceeding with available content")
            return self.driver.page_source
        except Exception as e:
            print(f"⚠️ Error waiting for dynamic content: {e}")
            return self.driver.page_source
    
    def expand_dynamic_rows(self):
        """Expand rows that have dynamic + signs"""
        try:
            expanded_count = 0
            
            # Method 1: Direct JavaScript execution to find and click expandable rows
            print("🔍 Using JavaScript to find and expand rows...")
            
            expansion_script = """
                var expandedCount = 0;
                var tables = document.getElementsByTagName('table');
                
                for (var t = 0; t < tables.length; t++) {
                    var tableText = tables[t].textContent.toLowerCase();
                    
                    // Check if this is a balance sheet table
                    if (tableText.includes('equity capital') || tableText.includes('reserves') || 
                        tableText.includes('borrowings') || tableText.includes('total liabilities')) {
                        
                        var rows = tables[t].getElementsByTagName('tr');
                        
                        for (var i = 0; i < rows.length; i++) {
                            var cells = rows[i].getElementsByTagName('td');
                            
                            for (var j = 0; j < cells.length; j++) {
                                var cellText = cells[j].textContent;
                                
                                // Look for cells with + signs
                                if (cellText.includes('+') && 
                                    (cellText.includes('Borrowings') || 
                                     cellText.includes('Liabilities') ||
                                     cellText.includes('Assets') ||
                                     cellText.includes('Current'))) {
                                    
                                    // Try different click methods
                                    try {
                                        // Method A: Click the cell
                                        if (cells[j].onclick || cells[j].style.cursor === 'pointer') {
                                            cells[j].click();
                                            expandedCount++;
                                        }
                                        // Method B: Look for child clickable elements
                                        else {
                                            var clickableChildren = cells[j].querySelectorAll('[onclick], [style*="cursor:pointer"]');
                                            for (var k = 0; k < clickableChildren.length; k++) {
                                                clickableChildren[k].click();
                                                expandedCount++;
                                                break;
                                            }
                                        }
                                        // Method C: Try parent row
                                        if (expandedCount === 0 && rows[i].onclick) {
                                            rows[i].click();
                                            expandedCount++;
                                        }
                                    } catch (e) {
                                        console.log('Click failed for:', cellText, e);
                                    }
                                }
                            }
                        }
                        break; // Only process balance sheet table
                    }
                }
                
                return expandedCount;
            """
            
            js_expanded = self.driver.execute_script(expansion_script)
            expanded_count += js_expanded
            print(f"📊 JavaScript expanded {js_expanded} rows")
            
            # Method 2: Selenium-based clicking with better targeting
            try:
                # Find balance sheet table first
                tables = self.driver.find_elements(By.TAG_NAME, "table")
                balance_sheet_table = None
                
                for table in tables:
                    table_text = table.text.lower()
                    if ('equity capital' in table_text and 'reserves' in table_text):
                        balance_sheet_table = table
                        break
                
                if balance_sheet_table:
                    # Look for cells containing + in balance sheet table
                    plus_cells = balance_sheet_table.find_elements(By.XPATH, ".//td[contains(text(), '+')]")
                    
                    for cell in plus_cells:
                        cell_text = cell.text
                        if any(keyword in cell_text.lower() for keyword in ['borrowings', 'liabilities', 'assets', 'current']):
                            try:
                                print(f"🎯 Attempting to expand: {cell_text}")
                                
                                # Store initial table content
                                initial_content = balance_sheet_table.text
                                
                                # Try clicking
                                cell.click()
                                time.sleep(2)
                                
                                # Check if content changed
                                new_content = balance_sheet_table.text
                                if len(new_content) > len(initial_content):
                                    print(f"✅ Successfully expanded: {cell_text}")
                                    expanded_count += 1
                                else:
                                    print(f"❌ No expansion detected for: {cell_text}")
                                    
                            except Exception as e:
                                print(f"⚠️ Error clicking {cell_text}: {e}")
                                continue
            
            except Exception as e:
                print(f"⚠️ Selenium expansion method failed: {e}")
            
            # Wait for all expansions to complete
            if expanded_count > 0:
                print(f"⏳ Waiting for {expanded_count} expansions to complete...")
                time.sleep(4)
            
            return expanded_count
            
        except Exception as e:
            print(f"⚠️ Error in expansion: {e}")
            return 0
    
    def extract_balance_sheet_data(self, symbol, data_type="consolidated"):
        """Fetch a company page and extract its balance sheet table.

        The Selenium path (load, wait, expand "+" rows) is used when it is
        enabled and a driver exists; otherwise the page is fetched with the
        plain HTTP session.

        Args:
            symbol: Company symbol placed in the page URL.
            data_type: ``"consolidated"`` selects the consolidated page; any
                other value selects the standalone page.

        Returns:
            The dict produced by ``extract_table_data`` (with an
            ``extraction_method`` entry added when the Selenium path ran),
            or ``None`` when no table is found or an error occurs. Errors
            are printed, not raised.
        """
        print(f"\n🔍 Extracting {data_type.upper()} Balance Sheet for: {symbol}")

        # Construct URL
        if data_type == "consolidated":
            url = f"https://www.screener.in/company/{symbol}/consolidated/#balance-sheet"
        else:
            url = f"https://www.screener.in/company/{symbol}/#balance-sheet"

        print(f"📡 URL: {url}")

        try:
            # Start timer
            start_time = time.time()

            # Stays None unless the Selenium path really ran, so the tagging
            # step below can never hit an unbound name.
            expanded_count = None

            if self.use_selenium and self.driver:
                print("🚀 Using JavaScript-aware extraction...")

                # Load page and wait for dynamic content
                self.wait_for_dynamic_content(url)

                # Try to expand dynamic rows
                expanded_count = self.expand_dynamic_rows()
                print(f"✅ Total expansions: {expanded_count}")

                # Read the HTML only after the expansions changed the DOM
                html_content = self.driver.page_source

            else:
                print("📡 Using basic requests extraction...")
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                html_content = response.content

            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(html_content, 'html.parser')

            # Find balance sheet table
            balance_sheet_table = self.find_balance_sheet_table(soup)

            if not balance_sheet_table:
                print(f"❌ No balance sheet table found for {symbol}")
                return None

            # Extract data from table
            result = self.extract_table_data(balance_sheet_table, symbol, data_type, start_time)

            if result and expanded_count is not None:
                result['extraction_method'] = f"JavaScript-Aware (Expanded: {expanded_count})"

            return result

        except Exception as e:
            print(f"❌ Error extracting data for {symbol}: {e}")
            return None
    
    def find_balance_sheet_table(self, soup):
        """Locate the balance sheet table inside a parsed page.

        Args:
            soup: Parsed document offering ``find_all``.

        Returns:
            The first table following a heading that mentions
            "balance sheet"; failing that, the first table whose text holds
            one of the balance sheet terms; ``None`` when neither exists.
        """
        heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        indicator_terms = ['equity capital', 'reserves', 'total liabilities', 'total assets']

        # Strategy 1: follow a "Balance Sheet" heading to the next table.
        for heading in soup.find_all(heading_tags):
            if 'balance sheet' not in heading.get_text().lower():
                continue
            following_table = heading.find_next('table')
            if following_table:
                return following_table

        # Strategy 2: scan every table for typical balance sheet wording.
        for candidate in soup.find_all('table'):
            lowered = candidate.get_text().lower()
            for term in indicator_terms:
                if term in lowered:
                    return candidate

        return None
    
    def extract_table_data(self, table, symbol, data_type, start_time):
        """Extract data from the balance sheet table with proper header alignment"""
        try:
            rows = table.find_all('tr')
            if len(rows) < 2:
                return None
            
            # Get headers with proper alignment
            header_row = rows[0]
            header_cells = header_row.find_all(['th', 'td'])
            
            # CRITICAL FIX: Always start with Particulars column
            headers = ['Particulars']  # First column for row descriptions
            
            # Extract year headers, skipping empty first cell if present
            for i, cell in enumerate(header_cells):
                header_text = cell.get_text(strip=True)
                
                # Skip empty first cell (common in balance sheet tables)
                if i == 0 and (not header_text or header_text == ''):
                    continue
                    
                # Skip RAW PDF columns
                if header_text and not ('raw' in header_text.lower() and 'pdf' in header_text.lower()):
                    headers.append(header_text)
            
            print(f"📊 Headers aligned: {headers[:6]}..." if len(headers) > 6 else f"📊 Headers aligned: {headers}")
            print(f"📊 Total columns: {len(headers)}")
            
            # Extract data rows with proper alignment
            data_rows = []
            for row_idx, row in enumerate(rows[1:], 1):
                cells = row.find_all(['td', 'th'])
                if len(cells) < 2:
                    continue
                
                row_data = []
                
                # Extract all cell data
                for cell in cells:
                    cell_text = cell.get_text(strip=True)
                    row_data.append(cell_text)
                
                # Skip RAW PDF entries
                if row_data and ('raw' in row_data[0].lower() and 'pdf' in row_data[0].lower()):
                    continue
                
                # Skip empty rows
                if not row_data or not row_data[0].strip():
                    continue
                
                # Ensure we have the right number of columns
                # If we have more data than headers, take only what we need
                if len(row_data) > len(headers):
                    row_data = row_data[:len(headers)]
                # If we have less data than headers, pad with empty strings
                elif len(row_data) < len(headers):
                    while len(row_data) < len(headers):
                        row_data.append('')
                
                data_rows.append(row_data)
                
                # Debug: Show first few rows alignment
                if row_idx <= 3:
                    print(f"   Row {row_idx}: {row_data[:4]}...")
            
            # Create DataFrame
            if data_rows:
                df = pd.DataFrame(data_rows, columns=headers)
                
                # Clean data
                df = df.dropna(subset=[df.columns[0]])  # Remove rows with empty particulars
                df = df[df.iloc[:, 0].str.strip() != '']  # Remove empty particulars
                
                # Additional cleanup - remove completely empty rows
                df = df.dropna(how='all')
                
                end_time = time.time()
                extraction_time = round(end_time - start_time, 2)
                
                print(f"✅ Extracted {len(df)} rows in {extraction_time}s")
                print(f"📋 Header-aligned data preview:")
                print(df.head(5).to_string(index=False, max_cols=8))
                
                # Verify data integrity
                if len(df.columns) > 1:
                    sample_row = df.iloc[0]
                    print(f"🔍 Data verification - {sample_row.iloc[0]}: {sample_row.iloc[1]} | Last col: {sample_row.iloc[-1]}")
                
                # Enhance data with section headers
                enhanced_df = self.enhance_balance_sheet_data(df)
                
                return {
                    'dataframe': enhanced_df,
                    'symbol': symbol,
                    'data_type': data_type,
                    'extraction_time': extraction_time,
                    'url': f"https://www.screener.in/company/{symbol}/consolidated/#balance-sheet" if data_type == "consolidated" else f"https://www.screener.in/company/{symbol}/#balance-sheet",
                    'timestamp': datetime.now().strftime('%Y%m%d_%H%M%S')
                }
            
            return None
            
        except Exception as e:
            print(f"❌ Error extracting table data: {e}")
            return None
    
    def enhance_balance_sheet_data(self, df):
        """Return a copy of ``df`` with a ``Section`` column after the labels.

        Rows are tagged ``LIABILITIES`` up to and including the
        "total liabilities" row; every row after it, including the
        "total assets" row, is tagged ``ASSETS``. The sheet is assumed to
        list liabilities first; a "total assets" row seen earlier also
        switches the tag to ``ASSETS``.

        Args:
            df: Frame with a ``Particulars`` column holding the row labels.

        Returns:
            A new frame (index reset) with ``Section`` as its second column.
            If anything goes wrong the error is printed and ``df`` is
            returned unchanged.
        """
        try:
            labels = df['Particulars'].astype(str).str.strip().str.lower()

            sections = []
            current_section = "LIABILITIES"
            for label in labels:
                if 'total liabilities' in label:
                    # The total row closes the liabilities side; what follows
                    # are asset line items.
                    sections.append("LIABILITIES")
                    current_section = "ASSETS"
                elif 'total assets' in label:
                    current_section = "ASSETS"
                    sections.append(current_section)
                else:
                    sections.append(current_section)

            # Drop any stale Section column so insert() cannot collide with it.
            enhanced_df = df.drop(columns=['Section'], errors='ignore').reset_index(drop=True)
            enhanced_df.insert(1, 'Section', sections)

            return enhanced_df

        except Exception as e:
            print(f"⚠️ Error enhancing data: {e}")
            return df  # Return original if enhancement fails
    
    def save_to_csv(self, data_dict, output_dir="E:\\JN\\"):
        """Write an extraction result to a CSV file and report on it.

        Args:
            data_dict: Result dict with ``symbol``, ``data_type``,
                ``timestamp`` and ``dataframe`` entries; ``extraction_method``
                is optional and only shown in the printed summary.
            output_dir: Target directory, created when it does not exist.

        Returns:
            Path of the written file, or ``None`` when saving failed (the
            error is printed, not raised).
        """
        try:
            directory_missing = not os.path.exists(output_dir)
            if directory_missing:
                os.makedirs(output_dir)

            symbol = data_dict['symbol']
            data_type = data_dict['data_type'].upper()
            timestamp = data_dict['timestamp']
            method = data_dict.get('extraction_method', 'BASIC')

            # File name encodes symbol, statement type and extraction time.
            filename = f"{symbol}_BalanceSheet_{data_type}_JSAware_{timestamp}.csv"
            filepath = os.path.join(output_dir, filename)

            frame = data_dict['dataframe']
            frame.to_csv(filepath, index=False)

            summary_lines = (
                f"💾 Saved: {filename}",
                f"📊 File size: {os.path.getsize(filepath)} bytes",
                f"📈 Rows: {len(frame)}",
                f"🔧 Method: {method}",
            )
            for line in summary_lines:
                print(line)

            return filepath

        except Exception as err:
            print(f"❌ Error saving CSV: {err}")
            return None
    
    def cleanup(self):
        """Cleanup Selenium driver"""
        if self.driver:
            try:
                self.driver.quit()
                print("🔄 Selenium driver closed")
            except:
                pass

def main():
    """Run the extractor for the first symbol of the test CSV.

    Both the consolidated and the standalone balance sheet are extracted and
    saved; the Selenium driver is always closed at the end.
    """
    wide_rule = "=" * 80
    print(wide_rule)
    print("SCREENER.IN PROJECT - CLASS-3: BALANCE SHEET EXTRACTOR - JAVASCRIPT-AWARE")
    print(wide_rule)

    extractor = BalanceSheetExtractorJS(use_selenium=True)

    try:
        # Load symbols
        symbols = extractor.load_symbols("E:\\JN\\TestSymbol.csv")

        if not symbols:
            print("❌ No symbols to process!")
            return

        test_symbol = symbols[0]
        print(f"\n🎯 Testing with first symbol: {test_symbol}")

        # One phase per statement type, consolidated first.
        for phase_number, data_type in enumerate(("consolidated", "standalone"), start=1):
            print("\n" + "="*70)
            print(f"PHASE {phase_number}: {data_type.upper()} BALANCE SHEET (JAVASCRIPT-AWARE)")
            print("="*70)

            extracted = extractor.extract_balance_sheet_data(test_symbol, data_type)
            if extracted:
                extractor.save_to_csv(extracted)
                print(f"✅ {data_type.capitalize()} extraction completed!")

        print("\n" + "="*80)
        print("CLASS-3 JAVASCRIPT-AWARE VERSION COMPLETED!")
        print("FEATURES: ✅ JS-Aware ✅ Dynamic + Detection ✅ Smart Expansion ✅ Fixed Headers")
        print("="*80)

    finally:
        extractor.cleanup()

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

SCREENER.IN PROJECT - CLASS-3: BALANCE SHEET EXTRACTOR - JAVASCRIPT-AWARE
✅ Selenium WebDriver initialized with JS support
✅ Loaded 1 symbols from E:\JN\TestSymbol.csv

🎯 Testing with first symbol: RELIANCE

PHASE 1: CONSOLIDATED BALANCE SHEET (JAVASCRIPT-AWARE)

🔍 Extracting CONSOLIDATED Balance Sheet for: RELIANCE
📡 URL: https://www.screener.in/company/RELIANCE/consolidated/#balance-sheet
🚀 Using JavaScript-aware extraction...
🔍 Loading page and waiting for JavaScript...
⏳ Waiting for JavaScript to add dynamic content...
⏳ Attempt 1: Waiting for + signs to appear...
⏳ Attempt 2: Waiting for + signs to appear...
⏳ Attempt 3: Waiting for + signs to appear...
⏳ Attempt 4: Waiting for + signs to appear...
⏳ Attempt 5: Waiting for + signs to appear...
⏳ Attempt 6: Waiting for + signs to appear...
⏳ Attempt 7: Waiting for + signs to appear...
⏳ Attempt 8: Waiting for + signs to appear...
⏳ Attempt 9: Waiting for + signs to appear...
⏳ Attempt 10: Waiting for + signs to appear...
🔍 Using Ja