In [6]:
#!/usr/bin/env python3
"""
SCREENER.IN PROJECT - CLASS-4: CASH FLOW EXTRACTION - FILE READING VERSION
=========================================================================
Focus: Read symbols from CSV file and extract Cash Flow data with + sign expansion
File Path: E:/JN/TestSymbol.csv (using forward slashes to avoid escape issues)
Based on: Previous classes' working + sign expansion code
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
import time
import os
import re

class CashFlowExtractor:
    """Scrape the cash-flow table of screener.in company pages with Selenium.

    Workflow (see ``process_all_symbols``): read company symbols from a CSV
    file, open each company's standalone and consolidated page, click the
    "+" rows of the ``#cash-flow`` table to reveal detail rows, read every
    row into a dict and write one CSV per symbol/data type plus a summary CSV.

    Attributes:
        driver: The Selenium WebDriver, or ``None`` before ``setup_driver``
            and after ``close``.
        wait_time: Timeout in seconds for the explicit waits.
        csv_file_path: Path of the CSV file holding the symbols.
        output_dir: Directory that receives the per-symbol CSVs and the summary.
    """

    # Lower-cased header names recognised as the symbol column.
    SYMBOL_COLUMN_NAMES = ('symbol', 'symbols', 'company', 'stock', 'ticker')

    # Cell texts that are stored as '0'.
    EMPTY_CELL_MARKERS = ('', '-', '--')

    # Row names containing one of these are reported/stored as detail rows.
    SUB_ITEM_KEYWORDS = (
        'net income', 'depreciation', 'working capital', 'trade receivables',
        'trade payables', 'other receivables', 'other payables', 'inventories',
        'purchase of fixed assets', 'sale of fixed assets', 'investments',
        'dividend paid', 'proceeds from borrowings', 'repayment of borrowings',
        'proceeds from issue', 'buy back', 'interest paid', 'interest received',
    )

    # Trailing "+" marker that expandable rows carry after their name.
    PLUS_SUFFIX_PATTERN = re.compile(r'\s*\+\s*$')

    # CSS selector for every body row of the cash-flow section.
    ROW_SELECTOR = "#cash-flow tbody tr"

    def __init__(self, csv_file_path="E:/JN/TestSymbol.csv",
                 output_dir="E:/JN/CashFlow_Output"):
        """Store configuration; the browser is only started by ``setup_driver``.

        Args:
            csv_file_path: CSV file to read symbols from. Forward slashes are
                used in the default to avoid escape sequence issues.
            output_dir: Directory for all generated CSV files.
        """
        self.driver = None
        self.wait_time = 20
        self.csv_file_path = csv_file_path
        self.output_dir = output_dir

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _clean_cell_value(text):
        """Return the stripped cell text, with empty/dash cells mapped to '0'."""
        value = text.strip()
        return '0' if value in CashFlowExtractor.EMPTY_CELL_MARKERS else value

    @staticmethod
    def _clean_row_name(row_name):
        """Return the row name without its trailing "+" expansion marker."""
        return CashFlowExtractor.PLUS_SUFFIX_PATTERN.sub('', row_name).strip()

    @staticmethod
    def _is_sub_item(row_name):
        """Return True when the row name contains a known detail-row keyword."""
        lowered = row_name.lower()
        return any(keyword in lowered for keyword in CashFlowExtractor.SUB_ITEM_KEYWORDS)

    # ------------------------------------------------------------------
    # Input
    # ------------------------------------------------------------------
    def read_symbols_from_csv(self):
        """Read company symbols from the CSV file.

        The symbol column is the first column whose (stripped, lower-cased)
        header is in ``SYMBOL_COLUMN_NAMES``; otherwise the first column.

        Returns:
            list[str]: Stripped, non-empty symbols in file order. An empty
            list when the file is missing, has no columns or cannot be read.
        """
        try:
            if not os.path.exists(self.csv_file_path):
                print(f"❌ File not found: {self.csv_file_path}")
                return []

            # dtype=str keeps purely numeric codes from being parsed as
            # floats (which would render as e.g. "500325.0" in the URL).
            df = pd.read_csv(self.csv_file_path, dtype=str)
            print(f"📁 Reading from: {self.csv_file_path}")
            print(f"📊 CSV columns: {list(df.columns)}")

            if len(df.columns) == 0:
                print("❌ CSV file has no columns")
                return []

            # Detect symbol column (common names); tolerate padded or
            # non-string headers.
            symbol_column = next(
                (col for col in df.columns
                 if str(col).strip().lower() in self.SYMBOL_COLUMN_NAMES),
                None,
            )
            if symbol_column is None:
                symbol_column = df.columns[0]  # Use first column
                print(f"⚠️ Using first column as symbol: {symbol_column}")

            stripped = (str(value).strip() for value in df[symbol_column].dropna())
            symbols = [value for value in stripped if value]
            print(f"✅ Found {len(symbols)} symbols: {symbols[:5]}...")  # Show first 5
            return symbols

        except Exception as e:
            # Boundary handler: any read/parse problem means "no symbols".
            print(f"❌ Error reading CSV file: {str(e)}")
            return []

    # ------------------------------------------------------------------
    # Browser handling
    # ------------------------------------------------------------------
    def setup_driver(self):
        """Initialize Chrome WebDriver with proper settings.

        Returns:
            The created driver, which is also stored on ``self.driver``.
        """
        options = webdriver.ChromeOptions()
        # options.add_argument('--headless')  # Comment out for debugging
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-gpu')
        options.add_argument('--window-size=1920,1080')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])

        self.driver = webdriver.Chrome(options=options)
        self.driver.implicitly_wait(10)
        return self.driver

    def wait_for_page_load(self):
        """Wait for page to fully load including JavaScript.

        Returns:
            bool: True once ``document.readyState`` is "complete" and an
            element with id ``cash-flow`` is present; False on timeout.
        """
        try:
            # Wait for basic page load
            WebDriverWait(self.driver, self.wait_time).until(
                lambda driver: driver.execute_script("return document.readyState") == "complete"
            )
            time.sleep(3)  # Additional wait for dynamic content

            # Wait specifically for cash flow section
            WebDriverWait(self.driver, self.wait_time).until(
                EC.presence_of_element_located((By.ID, "cash-flow"))
            )
            print("✅ Page and cash flow section loaded")
            return True

        except TimeoutException:
            print("⚠️ Page load timeout")
            return False

    def _cashflow_row_count(self):
        """Return the current number of body rows in the cash-flow section."""
        return len(self.driver.find_elements(By.CSS_SELECTOR, self.ROW_SELECTOR))

    def _click_row(self, row):
        """Click a row via JavaScript, falling back to a regular click.

        Returns:
            bool: True when one of the two click methods ran without error.
        """
        # Method 1: Direct JavaScript click
        try:
            self.driver.execute_script("arguments[0].click();", row)
            time.sleep(3)  # Longer wait for expansion
            print("    ✅ JavaScript click done")
            return True
        except Exception as e:
            print(f"    ⚠️ JavaScript click failed: {e}")

        # Method 2: Regular click
        try:
            row.click()
            time.sleep(3)
            print("    ✅ Regular click done")
            return True
        except Exception:
            print("    ❌ Regular click failed")
            return False

    # ------------------------------------------------------------------
    # Expansion
    # ------------------------------------------------------------------
    def find_and_expand_plus_signs(self):
        """Find and expand + signs using multiple detection methods.

        Each row whose first cell contains "+" is clicked; an expansion is
        counted when the number of rows in the section grows. If clicking the
        row changes nothing, the first "+" element inside the row is clicked
        instead. When nothing expanded at all, ``try_alternative_expansion``
        is run as a last resort.

        Returns:
            bool: True when at least one row expanded or the table ends up
            with more than 4 rows; False when no "+" rows exist or on error.
        """
        expanded_count = 0

        try:
            print("🔍 Searching for expandable + signs...")

            # Get initial row count
            initial_rows = self.driver.find_elements(By.CSS_SELECTOR, self.ROW_SELECTOR)
            starting_count = len(initial_rows)
            baseline = starting_count  # moves up after every detected expansion
            print(f"📊 Initial rows: {starting_count}")

            # Find all rows with + signs
            expandable_rows = []
            for row in initial_rows:
                try:
                    cell_text = row.find_element(By.TAG_NAME, "td").text.strip()
                except Exception:
                    # Row without a readable first cell: not expandable.
                    continue
                if '+' in cell_text:
                    expandable_rows.append((row, cell_text))
                    print(f"  📋 Found: {cell_text}")

            if not expandable_rows:
                print("❌ No + signs found")
                return False

            # Try expanding each row
            print(f"🖱️ Expanding {len(expandable_rows)} categories...")

            for i, (row, description) in enumerate(expandable_rows):
                try:
                    print(f"  🖱️ {i+1}. Clicking: {description}")

                    if not self._click_row(row):
                        continue

                    # Check if expansion worked by counting rows
                    current_count = self._cashflow_row_count()

                    if current_count > baseline:
                        print(f"    ✅ Expansion detected! Rows: {baseline} → {current_count}")
                        expanded_count += 1
                        baseline = current_count  # Update baseline
                    else:
                        print(f"    ⚠️ No new rows appeared (still {current_count})")

                        # Try alternative expansion method
                        try:
                            # Look for clickable elements within the row
                            for elem in row.find_elements(By.CSS_SELECTOR, "a, span, div"):
                                if '+' in elem.text:
                                    print(f"    🔄 Trying element click: {elem.text}")
                                    elem.click()
                                    time.sleep(3)
                                    break

                            # Check again
                            final_count = self._cashflow_row_count()
                            if final_count > current_count:
                                print(f"    ✅ Alternative method worked! Rows: {current_count} → {final_count}")
                                expanded_count += 1
                                baseline = final_count

                        except Exception as alt_e:
                            print(f"    ❌ Alternative method failed: {alt_e}")

                    # Small delay between expansions
                    time.sleep(1)

                except Exception as e:
                    print(f"    ❌ Error with {description}: {str(e)}")
                    continue

            # Final check
            final_count = self._cashflow_row_count()

            print(f"🔍 Final status: {len(expandable_rows)} + signs found, {expanded_count} expanded")
            print(f"📊 Row count: Initial {starting_count} → Final {final_count}")

            # If no expansion worked, try one more alternative approach
            if expanded_count == 0:
                print("🔄 Trying alternative expansion method...")
                self.try_alternative_expansion()
                # Recount so the result reflects what the fallback achieved.
                final_count = self._cashflow_row_count()

            # Success if we have more than just the 4 main categories
            return expanded_count > 0 or final_count > 4

        except Exception as e:
            print(f"❌ Error in plus sign expansion: {str(e)}")
            return False

    def try_alternative_expansion(self):
        """Alternative method to expand cash flow details.

        Clicks every first-column cell / ``.clickable`` / ``[onclick]`` element
        of the section whose text contains "+", then runs a script that calls
        ``onclick`` (if set) and ``click()`` on every row containing "+".
        Errors are reported and never raised.
        """
        try:
            # Method 1: Look for any clickable elements in the cash flow section
            clickable_elements = self.driver.find_elements(
                By.CSS_SELECTOR,
                "#cash-flow td:first-child, #cash-flow .clickable, #cash-flow [onclick]"
            )

            for elem in clickable_elements:
                if '+' in elem.text:
                    try:
                        elem.click()
                        time.sleep(2)
                        print(f"    🔄 Clicked element: {elem.text[:50]}")
                    except Exception:
                        # Best effort: move on to the next candidate.
                        continue

            # Method 2: Try JavaScript to trigger any onclick events
            try:
                script = """
                var rows = document.querySelectorAll('#cash-flow tbody tr');
                rows.forEach(function(row) {
                    if (row.textContent.includes('+')) {
                        if (row.onclick) row.onclick();
                        row.click();
                    }
                });
                """
                self.driver.execute_script(script)
                time.sleep(3)
                print("    🔄 JavaScript trigger attempted")

            except Exception as js_e:
                print(f"    ⚠️ JavaScript method failed: {js_e}")

        except Exception as e:
            print(f"❌ Alternative expansion failed: {str(e)}")

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------
    def _parse_row(self, row):
        """Read one table row.

        Returns:
            tuple[str, list[str]] | None: ``(clean name, values)`` or ``None``
            when the row has fewer than two cells, has an empty name or is
            not displayed.
        """
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) < 2:  # Must have at least category name + 1 data cell
            return None

        row_name = cells[0].text.strip()
        if not row_name:  # Skip empty rows
            return None

        # Skip if row is hidden or not visible
        try:
            if not row.is_displayed():
                print(f"    ⚠️ Skipping hidden row: {row_name}")
                return None
        except Exception:
            # Visibility could not be determined; keep the row (best effort).
            pass

        values = [self._clean_cell_value(cell.text) for cell in cells[1:]]
        return self._clean_row_name(row_name), values

    def extract_all_cashflow_data(self):
        """Extract all cash flow data after expansion.

        Column headers after the first are used as the year labels. Rows that
        match ``SUB_ITEM_KEYWORDS`` are stored under their name prefixed with
        two spaces. If only main rows (at most 4) were found,
        ``final_expansion_attempt`` is run and any rows that were not in the
        table before are added as detail rows.

        Returns:
            dict[str, dict[str, str]]: row name -> {year label -> cell text};
            an empty dict on error.
        """
        try:
            data = {}

            # Get table headers (years)
            cash_flow_table = self.driver.find_element(By.CSS_SELECTOR, "#cash-flow table")
            header_row = cash_flow_table.find_element(By.TAG_NAME, "thead")
            headers = [th.text.strip() for th in header_row.find_elements(By.TAG_NAME, "th")]
            years = headers[1:]  # Skip first empty header
            print(f"📅 Years found: {years}")

            # Wait a moment for any delayed expansions
            time.sleep(2)

            # Extract all rows (main + expanded)
            tbody = cash_flow_table.find_element(By.TAG_NAME, "tbody")
            rows = tbody.find_elements(By.TAG_NAME, "tr")
            print(f"📊 Total rows to extract: {len(rows)}")

            # Method 1: Direct row extraction
            all_possible_rows = list(rows)

            # Method 2: Look for any dynamically inserted rows
            try:
                dynamic_rows = self.driver.find_elements(
                    By.CSS_SELECTOR, "#cash-flow tbody tr[style*='display']"
                )
                for drow in dynamic_rows:
                    if drow not in all_possible_rows:
                        all_possible_rows.append(drow)
            except Exception as dyn_e:
                print(f"    ⚠️ Dynamic row lookup failed: {dyn_e}")

            print(f"📊 Processing {len(all_possible_rows)} total rows (including any dynamic ones)...")

            for i, row in enumerate(all_possible_rows):
                try:
                    parsed = self._parse_row(row)
                    if parsed is None:
                        continue
                    clean_row_name, values = parsed

                    # Detail rows are recognised by keyword; element text is
                    # already stripped, so indentation cannot be used here.
                    is_sub_item = self._is_sub_item(clean_row_name)
                    if is_sub_item:
                        clean_row_name = f"  {clean_row_name}"  # Keep indentation for sub-items

                    # Store data
                    data[clean_row_name] = dict(zip(years, values))

                    # Show extraction status
                    status = "📋 SUB-ITEM" if is_sub_item else "📊 MAIN"
                    print(f"  {status} Row {i+1}: {clean_row_name[:50]} -> {len(values)} values")

                except Exception as e:
                    print(f"    ⚠️ Error processing row {i}: {str(e)}")
                    continue

            sub_items = sum(1 for name in data if name.startswith('  '))
            main_items = len(data) - sub_items

            print(f"✅ Extracted {len(data)} cash flow items ({main_items} main + {sub_items} detailed)")

            # If we only got main items, try one more expansion attempt
            if sub_items == 0 and main_items <= 4:
                print("⚠️ No detailed items found, trying final expansion attempt...")
                self.final_expansion_attempt()

                # Re-extract after final attempt
                time.sleep(3)
                tbody = cash_flow_table.find_element(By.TAG_NAME, "tbody")
                new_rows = tbody.find_elements(By.TAG_NAME, "tr")

                if len(new_rows) > len(rows):
                    print(f"🎯 Found {len(new_rows) - len(rows)} additional rows after final attempt!")
                    for i, row in enumerate(new_rows):
                        # Expanded rows are not necessarily appended at the
                        # end of the table, so new rows are identified by
                        # element identity (WebElement equality) rather than
                        # by position.
                        if row in rows:
                            continue
                        try:
                            parsed = self._parse_row(row)
                            if parsed is None:
                                continue
                            name, values = parsed
                            clean_row_name = f"  {name}"
                            data[clean_row_name] = dict(zip(years, values))
                            print(f"  📋 NEW Row {i+1}: {clean_row_name[:50]}")
                        except Exception as row_e:
                            print(f"    ⚠️ Error processing row {i}: {str(row_e)}")
                            continue

            return data

        except Exception as e:
            print(f"❌ Error extracting cash flow data: {str(e)}")
            return {}

    def final_expansion_attempt(self):
        """Final attempt to expand using different methods.

        Scrolls the cash-flow section into view, JavaScript-clicks the cells
        of that section whose text contains "+", then hovers over the first
        three rows. Errors are reported and never raised.
        """
        try:
            print("🔄 Final expansion attempt using scroll and wait...")

            # Scroll to cash flow section
            cash_flow_section = self.driver.find_element(By.ID, "cash-flow")
            self.driver.execute_script("arguments[0].scrollIntoView();", cash_flow_section)
            time.sleep(1)

            # Try clicking any remaining + signs. The lookup is limited to the
            # cash-flow section so other tables on the page are not toggled.
            plus_elements = self.driver.find_elements(
                By.XPATH, "//*[@id='cash-flow']//td[contains(text(), '+')]"
            )
            for elem in plus_elements:
                try:
                    self.driver.execute_script("arguments[0].click();", elem)
                    time.sleep(2)
                    print(f"    🔄 Clicked: {elem.text[:30]}")
                except Exception:
                    continue

            # Try hovering over rows (some sites need hover to show details)
            cash_flow_rows = self.driver.find_elements(By.CSS_SELECTOR, self.ROW_SELECTOR)
            for row in cash_flow_rows[:3]:  # Just first 3 main categories
                try:
                    ActionChains(self.driver).move_to_element(row).perform()
                    time.sleep(1)
                except Exception:
                    continue

        except Exception as e:
            print(f"    ⚠️ Final expansion attempt failed: {e}")

    # ------------------------------------------------------------------
    # Per-symbol processing and output
    # ------------------------------------------------------------------
    def process_symbol(self, symbol, data_type="standalone"):
        """Process a single symbol for cash flow extraction.

        Args:
            symbol: Company symbol used in the screener.in URL.
            data_type: "consolidated" opens the consolidated page; any other
                value opens the default company page.

        Returns:
            dict | None: The extracted data, or ``None`` when the page did not
            load, nothing was extracted or an error occurred.
        """
        try:
            # Construct URL
            base_url = f"https://www.screener.in/company/{symbol}/"
            if data_type == "consolidated":
                url = f"{base_url}consolidated/#cash-flow"
            else:
                url = f"{base_url}#cash-flow"

            print(f"\n🔍 Processing {symbol} ({data_type.upper()})")
            print(f"🌐 URL: {url}")

            # Load page
            self.driver.get(url)

            # Wait for page to load
            if not self.wait_for_page_load():
                print(f"❌ Page load failed for {symbol}")
                return None

            # Find and expand + signs; the result is not needed because the
            # data is extracted whether expansion worked or not.
            self.find_and_expand_plus_signs()

            data = self.extract_all_cashflow_data()

            if data:
                print(f"✅ {symbol} - {len(data)} cash flow items extracted")
                return data

            print(f"❌ {symbol} - No data extracted")
            return None

        except Exception as e:
            print(f"❌ Error processing {symbol}: {str(e)}")
            return None

    def save_to_csv(self, data, symbol, data_type):
        """Save data to CSV file.

        Writes ``<output_dir>/<symbol>_<data_type>_cashflow.csv`` with one row
        per category and one column per year, creating the directory if needed.

        Returns:
            str | None: The written file name, or ``None`` when ``data`` is
            empty or saving failed.
        """
        try:
            if not data:
                return None

            os.makedirs(self.output_dir, exist_ok=True)

            # Flatten data for CSV
            rows = [
                {"Category": category, **yearly_data}
                for category, yearly_data in data.items()
            ]

            filename = os.path.join(self.output_dir, f"{symbol}_{data_type}_cashflow.csv")
            pd.DataFrame(rows).to_csv(filename, index=False)
            print(f"💾 Saved: {filename}")
            return filename

        except Exception as e:
            print(f"❌ Error saving {symbol}: {str(e)}")
            return None

    def process_all_symbols(self):
        """Main processing function.

        Reads the symbols, starts the browser, extracts and saves standalone
        and consolidated data for every symbol, then writes the summary. The
        browser is left open; call ``close`` afterwards.
        """
        # Read symbols from CSV file
        symbols = self.read_symbols_from_csv()
        if not symbols:
            print("❌ No symbols found to process")
            return

        # Setup WebDriver
        try:
            self.setup_driver()
            print(f"🚀 WebDriver initialized")
        except Exception as e:
            print(f"❌ Failed to initialize WebDriver: {str(e)}")
            return

        # Process each symbol
        results_summary = []

        for i, symbol in enumerate(symbols):
            print(f"\n{'='*60}")
            print(f"📊 Processing {i+1}/{len(symbols)}: {symbol}")
            print(f"{'='*60}")

            symbol_results = {"symbol": symbol}

            for data_type in ("standalone", "consolidated"):
                extracted = self.process_symbol(symbol, data_type)
                if extracted:
                    filename = self.save_to_csv(extracted, symbol, data_type)
                    symbol_results[data_type] = "SUCCESS"
                    symbol_results[f"{data_type}_file"] = filename
                else:
                    symbol_results[data_type] = "FAILED"

            results_summary.append(symbol_results)

            # Small delay between symbols
            time.sleep(2)

        # Save summary
        self.save_processing_summary(results_summary)

        print(f"\n🎯 CLASS-4 PROCESSING COMPLETE!")
        print(f"📊 Total symbols processed: {len(symbols)}")

        # Show success/failure summary
        successful = sum(
            1 for r in results_summary
            if r.get("standalone") == "SUCCESS" or r.get("consolidated") == "SUCCESS"
        )
        print(f"✅ Successful extractions: {successful}/{len(symbols)}")

    def save_processing_summary(self, results):
        """Save processing summary.

        Writes ``cashflow_extraction_summary.csv`` into the output directory.
        The directory is created first so the summary is still written when
        no per-symbol file was saved (e.g. every extraction failed).
        """
        try:
            os.makedirs(self.output_dir, exist_ok=True)
            summary_file = os.path.join(self.output_dir, "cashflow_extraction_summary.csv")

            pd.DataFrame(results).to_csv(summary_file, index=False)
            print(f"📋 Summary saved: {summary_file}")

        except Exception as e:
            print(f"⚠️ Could not save summary: {str(e)}")

    def close(self):
        """Clean up: quit the browser if one is running (safe to call twice)."""
        if self.driver:
            self.driver.quit()
            self.driver = None
            print("🔒 WebDriver closed")

# Main execution
def main():
    """Print the banner, run the extractor and always release the browser.

    A keyboard interrupt or any other exception raised while processing is
    reported on stdout; ``extractor.close()`` runs in every case.
    """
    banner_lines = (
        "🚀 CLASS-4: CASH FLOW EXTRACTOR - FILE READING VERSION",
        "📁 Reading from: E:/JN/TestSymbol.csv",
        "💾 Output to: E:/JN/CashFlow_Output/",
        "=" * 60,
    )
    for banner_line in banner_lines:
        print(banner_line)

    extractor = CashFlowExtractor()

    try:
        extractor.process_all_symbols()
    except KeyboardInterrupt:
        # Ctrl-C: stop quietly, cleanup still happens below.
        print("\n⚠️ Process interrupted by user")
    except Exception as exc:
        print(f"❌ Main execution error: {str(exc)}")
    finally:
        extractor.close()

# Run the extractor only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

🚀 CLASS-4: CASH FLOW EXTRACTOR - FILE READING VERSION
📁 Reading from: E:/JN/TestSymbol.csv
💾 Output to: E:/JN/CashFlow_Output/
📁 Reading from: E:/JN/TestSymbol.csv
📊 CSV columns: ['SYMBOL']
✅ Found 1 symbols: ['RELIANCE']...
🚀 WebDriver initialized

📊 Processing 1/1: RELIANCE

🔍 Processing RELIANCE (STANDALONE)
🌐 URL: https://www.screener.in/company/RELIANCE/#cash-flow
✅ Page and cash flow section loaded
🔍 Searching for expandable + signs...
📊 Initial rows: 4
  📋 Found: Cash from Operating Activity +
  📋 Found: Cash from Investing Activity +
  📋 Found: Cash from Financing Activity +
🖱️ Expanding 3 categories...
  🖱️ 1. Clicking: Cash from Operating Activity +
    ✅ JavaScript click done
    ⚠️ No new rows appeared (still 4)
    🔄 Trying element click: +
    ✅ Alternative method worked! Rows: 4 → 10
  🖱️ 2. Clicking: Cash from Investing Activity +
    ✅ JavaScript click done
    ⚠️ No new rows appeared (still 10)
    🔄 Trying element click: +
    ✅ Alternative method worked! Rows: 10 → 