In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
from datetime import datetime
import time
import os

# Try to import Selenium for dynamic content.
# Selenium is treated as an optional dependency: when the import succeeds the
# module-level flag SELENIUM_AVAILABLE is True, otherwise it is False and the
# names imported below (webdriver, By, WebDriverWait, EC, Options, ...) are
# NOT defined. Code that uses them must check the flag first.
try:
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.chrome.options import Options
    from selenium.common.exceptions import TimeoutException, NoSuchElementException
    SELENIUM_AVAILABLE = True
    print("✅ Selenium available - Will extract + sign expandable data")
except ImportError:
    # Only a missing package is handled here; any other import-time error
    # propagates to the caller.
    SELENIUM_AVAILABLE = False
    print("⚠️ Selenium not available - Will use basic extraction (no + sign data)")
    print("💡 To get + sign data, install: pip install selenium")

class QuarterlyExtractor:
    def __init__(self):
        """Prepare the HTTP session, default settings and the output folder.

        Attributes set here:
            session: ``requests.Session`` carrying a browser-like User-Agent.
            base_url: prefix used to build company page URLs.
            results_folder: directory that receives the generated CSV files.
            use_selenium: starts as the module flag ``SELENIUM_AVAILABLE``.
            driver: Selenium WebDriver, ``None`` until ``setup_selenium`` runs.
        """
        # Plain configuration first.
        self.base_url = "https://www.screener.in/company"
        self.results_folder = "quarterly_data"
        self.use_selenium = SELENIUM_AVAILABLE
        self.driver = None

        # One shared session so every request sends the same headers.
        http_session = requests.Session()
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        http_session.headers.update(browser_headers)
        self.session = http_session

        # Create the output folder only when it is missing, and say so.
        folder_missing = not os.path.exists(self.results_folder)
        if folder_missing:
            os.makedirs(self.results_folder)
            print(f"✅ Created folder: {self.results_folder}")
    
    def setup_selenium(self):
        """Setup Selenium WebDriver for dynamic content extraction"""
        if not self.use_selenium:
            return False
        
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')  # Run in background
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument('--disable-gpu')
            chrome_options.add_argument('--window-size=1920,1080')
            
            # Try to create Chrome WebDriver
            self.driver = webdriver.Chrome(options=chrome_options)
            print("✅ Selenium WebDriver initialized")
            return True
            
        except Exception as e:
            print(f"⚠️ Selenium setup failed: {e}")
            print("💡 Download ChromeDriver from: https://chromedriver.chromium.org/")
            self.use_selenium = False
            return False
    
    def close_selenium(self):
        """Close Selenium WebDriver"""
        if self.driver:
            self.driver.quit()
            print("✅ Selenium WebDriver closed")
    
    def read_symbols_from_csv(self, file_path):
        """Read stock symbols from CSV file"""
        print(f"\n🔍 Step 1: Reading symbols from: {file_path}")
        
        if not os.path.exists(file_path):
            print(f"❌ ERROR: File not found: {file_path}")
            return []
        
        try:
            symbols = []
            
            # Read CSV file
            df = pd.read_csv(file_path)
            print(f"📊 CSV columns found: {list(df.columns)}")
            
            # Look for Symbol column
            symbol_col = None
            for col in df.columns:
                if col.lower() in ['symbol', 'symbols', 'stock', 'ticker']:
                    symbol_col = col
                    break
            
            if symbol_col:
                symbols = df[symbol_col].dropna().tolist()
                print(f"✅ Found {len(symbols)} symbols using column: {symbol_col}")
            else:
                symbols = df.iloc[:, 0].dropna().tolist()
                print(f"✅ Using first column, found {len(symbols)} symbols")
            
            # Clean symbols
            symbols = [symbol.strip().upper() for symbol in symbols if symbol.strip()]
            print(f"📋 Final symbols: {symbols}")
            return symbols
            
        except Exception as e:
            print(f"❌ ERROR reading CSV: {e}")
            return []
    
    def extract_quarters_section_data(self, symbol):
        """Extract data from both standalone and consolidated #quarters sections"""
        print(f"\n📈 Step 2: Processing {symbol} - #quarters section")
        
        quarters_data = {
            'consolidated': None,
            'standalone': None
        }
        
        # 1. First, try to get CONSOLIDATED data
        try:
            consolidated_url = f"{self.base_url}/{symbol}/consolidated/#quarters"
            print(f"🌐 Fetching CONSOLIDATED: {consolidated_url}")
            
            if self.use_selenium:
                consolidated_data = self.extract_with_selenium(consolidated_url, 'consolidated')
            else:
                consolidated_data = self.extract_with_requests(consolidated_url, 'consolidated')
            
            if consolidated_data:
                quarters_data['consolidated'] = consolidated_data
                print(f"✅ CONSOLIDATED data found")
            else:
                print(f"⚠️ CONSOLIDATED data not available")
                
        except Exception as e:
            print(f"⚠️ Error fetching CONSOLIDATED: {e}")
        
        # 2. Then, get STANDALONE data  
        try:
            standalone_url = f"{self.base_url}/{symbol}/#quarters"
            print(f"🌐 Fetching STANDALONE: {standalone_url}")
            
            if self.use_selenium:
                standalone_data = self.extract_with_selenium(standalone_url, 'standalone')
            else:
                standalone_data = self.extract_with_requests(standalone_url, 'standalone')
            
            if standalone_data:
                quarters_data['standalone'] = standalone_data
                print(f"✅ STANDALONE data found")
            else:
                print(f"⚠️ STANDALONE data not available")
                
        except Exception as e:
            print(f"❌ Error fetching STANDALONE: {e}")
        
        # Check if we got any data
        if not quarters_data['consolidated'] and not quarters_data['standalone']:
            print(f"❌ No quarters data found for {symbol}")
            return None
        
        return quarters_data
    
    def extract_with_selenium(self, url, data_type):
        """Extract data using Selenium to handle dynamic content"""
        try:
            print(f"🤖 Using Selenium for {data_type} data...")
            
            # Load the page
            self.driver.get(url)
            
            # Wait for page to load
            wait = WebDriverWait(self.driver, 10)
            
            # Look for tables
            tables = self.driver.find_elements(By.TAG_NAME, "table")
            print(f"📊 Found {len(tables)} tables")
            
            for i, table in enumerate(tables):
                print(f"\n🔍 Checking table {i+1} for quarters data...")
                
                # Check if this looks like a quarters table
                try:
                    headers = table.find_elements(By.TAG_NAME, "th")
                    if not headers:
                        headers = table.find_elements(By.TAG_NAME, "td")[:10]  # Get first row as headers
                    
                    header_texts = [h.text.strip() for h in headers]
                    print(f"   Headers: {header_texts[:5]}...")
                    
                    if self.is_valid_quarters_table_selenium(header_texts, table):
                        print(f"✅ Valid quarters table found!")
                        
                        # Click all expandable elements (+ signs)
                        expandable_elements = self.click_expandable_elements(table)
                        
                        # Wait a bit for content to load
                        time.sleep(2)
                        
                        # Extract the complete data
                        table_data = self.extract_selenium_table_data(table)
                        
                        return {
                            'table_index': i+1,
                            'headers': header_texts,
                            'data': table_data,
                            'type': data_type,
                            'expandable_clicked': expandable_elements
                        }
                
                except Exception as e:
                    print(f"   ⚠️ Error checking table {i+1}: {e}")
                    continue
            
            print(f"❌ No valid quarters table found for {data_type}")
            return None
            
        except Exception as e:
            print(f"❌ Selenium extraction error: {e}")
            return None
    
    def click_expandable_elements(self, table):
        """Click all + sign elements in the table"""
        clicked_count = 0
        
        try:
            # Look for clickable elements with + signs or expand functionality
            expand_selectors = [
                "[class*='expand']",
                "[class*='plus']", 
                "[class*='toggle']",
                "span:contains('+')",
                "button:contains('+')",
                "[data-toggle]",
                "[onclick*='expand']"
            ]
            
            for selector in expand_selectors:
                try:
                    expandable_elements = table.find_elements(By.CSS_SELECTOR, selector)
                    for element in expandable_elements:
                        if element.is_displayed() and element.is_enabled():
                            try:
                                # Scroll into view and click
                                self.driver.execute_script("arguments[0].scrollIntoView();", element)
                                time.sleep(0.5)
                                element.click()
                                clicked_count += 1
                                print(f"   🔍 Clicked expandable element: {element.text[:20]}")
                                time.sleep(0.5)  # Wait for expansion
                            except:
                                pass  # Element might not be clickable
                except:
                    pass
            
            # Also try clicking on elements that contain + in their text
            try:
                all_elements = table.find_elements(By.XPATH, ".//*[contains(text(), '+')]")
                for element in all_elements:
                    if element.is_displayed() and element.tag_name in ['span', 'button', 'td', 'th']:
                        try:
                            element.click()
                            clicked_count += 1
                            time.sleep(0.5)
                        except:
                            pass
            except:
                pass
        
        except Exception as e:
            print(f"   ⚠️ Error clicking expandable elements: {e}")
        
        if clicked_count > 0:
            print(f"   ✅ Clicked {clicked_count} expandable elements")
        
        return clicked_count
    
    def extract_selenium_table_data(self, table):
        """Extract table data using Selenium"""
        table_data = []
        
        try:
            rows = table.find_elements(By.TAG_NAME, "tr")
            
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")
                if not cells:
                    cells = row.find_elements(By.TAG_NAME, "th")
                
                if cells:
                    cell_texts = [cell.text.strip() for cell in cells]
                    
                    # Skip RAW PDF rows
                    first_cell = cell_texts[0].upper() if cell_texts else ""
                    if "RAW PDF" in first_cell or "PDF" in first_cell:
                        print(f"   ⚠️ Skipping RAW PDF row")
                        continue
                    
                    if any(cell for cell in cell_texts):  # Only add non-empty rows
                        table_data.append(cell_texts)
        
        except Exception as e:
            print(f"   ❌ Error extracting table data: {e}")
        
        return table_data
    
    def is_valid_quarters_table_selenium(self, headers, table):
        """Check if this is a valid quarters table using Selenium"""
        if not headers:
            return False
        
        headers_text = " ".join(headers).upper()
        
        # Look for quarterly patterns
        quarterly_indicators = ['Q1', 'Q2', 'Q3', 'Q4', 'MAR', 'JUN', 'SEP', 'DEC', 'FY']
        quarterly_score = sum(1 for indicator in quarterly_indicators if indicator in headers_text)
        
        # Check for financial metrics in the table
        try:
            rows = table.find_elements(By.TAG_NAME, "tr")[:10]  # Check first 10 rows
            metrics_text = ""
            for row in rows:
                cells = row.find_elements(By.TAG_NAME, "td")
                if cells:
                    metrics_text += " " + cells[0].text.upper()
        except:
            metrics_text = ""
        
        financial_indicators = ['SALES', 'REVENUE', 'EXPENSES', 'PROFIT', 'OPM', 'EPS']
        financial_score = sum(1 for indicator in financial_indicators if indicator in metrics_text)
        
        return quarterly_score >= 2 and financial_score >= 1
    
    def extract_with_requests(self, url, data_type):
        """Fallback extraction using requests (no + sign data)"""
        try:
            response = self.session.get(url, timeout=30)
            if response.status_code != 200:
                return None
            
            soup = BeautifulSoup(response.content, 'html.parser')
            return self.find_quarters_tables_in_page(soup, data_type)
            
        except Exception as e:
            print(f"❌ Requests extraction error: {e}")
            return None
    
    def find_quarters_tables_in_page(self, soup, data_type):
        """Find quarters tables in the given page (basic extraction)"""
        tables = soup.find_all('table')
        
        for i, table in enumerate(tables):
            headers = []
            first_row = table.find('tr')
            if first_row:
                headers = [th.get_text().strip() for th in first_row.find_all(['th', 'td'])]
            
            if self.is_valid_quarters_table(headers, table):
                table_data = self.extract_basic_table_data(table)
                
                return {
                    'table_index': i+1,
                    'headers': headers,
                    'data': table_data,
                    'type': data_type
                }
        
        return None
    
    def is_valid_quarters_table(self, headers, table):
        """Check if table contains quarterly data (basic)"""
        if not headers:
            return False
        
        headers_text = " ".join(headers).upper()
        quarterly_indicators = ['Q1', 'Q2', 'Q3', 'Q4', 'MAR', 'JUN', 'SEP', 'DEC', 'FY']
        quarterly_score = sum(1 for indicator in quarterly_indicators if indicator in headers_text)
        
        return quarterly_score >= 2
    
    def extract_basic_table_data(self, table):
        """Extract basic table data (no dynamic content)"""
        table_data = []
        
        try:
            rows = table.find_all('tr')
            
            for row in rows:
                cells = [cell.get_text().strip() for cell in row.find_all(['td', 'th'])]
                
                if cells and any(cell for cell in cells):
                    # Skip RAW PDF rows
                    first_cell = cells[0].upper() if cells else ""
                    if "RAW PDF" in first_cell or "PDF" in first_cell:
                        continue
                    
                    table_data.append(cells)
        
        except Exception as e:
            print(f"❌ Error extracting basic table data: {e}")
        
        return table_data
    
    def save_quarters_data(self, symbol, quarters_data):
        """Save quarters data to CSV"""
        print(f"\n💾 Step 3: Saving quarters data for {symbol}")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        
        saved_files = []
        
        # Save Consolidated data if available
        if quarters_data['consolidated']:
            filename = f"{symbol}_Quarters_CONSOLIDATED_{timestamp}.csv"
            filepath = os.path.join(self.results_folder, filename)
            
            data = quarters_data['consolidated']['data']
            
            with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerows(data)
            
            print(f"✅ Saved CONSOLIDATED: {filename}")
            print(f"📊 Data: {len(data)} rows x {len(data[0]) if data else 0} columns")
            
            # Show if expandable data was extracted
            if 'expandable_clicked' in quarters_data['consolidated']:
                clicked = quarters_data['consolidated']['expandable_clicked']
                print(f"🔍 Expandable elements clicked: {clicked}")
            
            saved_files.append(filepath)
        
        # Save Standalone data if available
        if quarters_data['standalone']:
            filename = f"{symbol}_Quarters_STANDALONE_{timestamp}.csv"
            filepath = os.path.join(self.results_folder, filename)
            
            data = quarters_data['standalone']['data']
            
            with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerows(data)
            
            print(f"✅ Saved STANDALONE: {filename}")
            print(f"📊 Data: {len(data)} rows x {len(data[0]) if data else 0} columns")
            
            # Show if expandable data was extracted
            if 'expandable_clicked' in quarters_data['standalone']:
                clicked = quarters_data['standalone']['expandable_clicked']
                print(f"🔍 Expandable elements clicked: {clicked}")
            
            saved_files.append(filepath)
        
        return saved_files
    
    def verify_csv_files(self, csv_files):
        """Verify and display CSV content"""
        print(f"\n🔍 Step 4: Verifying saved files")
        
        for filepath in csv_files:
            try:
                print(f"\n📖 File: {os.path.basename(filepath)}")
                df = pd.read_csv(filepath, header=None)
                
                print(f"📊 Shape: {df.shape[0]} rows x {df.shape[1]} columns")
                print(f"🔍 First 5 rows:")
                for i in range(min(5, len(df))):
                    row_data = df.iloc[i].tolist()
                    print(f"   Row {i+1}: {row_data[:4]}...")  # Show first 4 columns
                
                # Look for expandable data (indented rows)
                expandable_count = 0
                for i in range(len(df)):
                    if df.iloc[i, 0] and str(df.iloc[i, 0]).startswith('  '):
                        expandable_count += 1
                
                if expandable_count > 0:
                    print(f"✅ Found {expandable_count} expandable data rows")
                else:
                    print(f"⚠️ No expandable data found (may need Selenium)")
                
            except Exception as e:
                print(f"❌ Error reading {filepath}: {e}")
    
    def run_quarters_extraction(self):
        """Main function - Extract from #quarters section only"""
        print("🚀 QUARTERS SECTION EXTRACTOR - STEP 1")
        print("="*60)
        print("🎯 Consolidated: /company/{symbol}/consolidated/#quarters")
        print("🎯 Standalone:   /company/{symbol}/#quarters")
        print("📋 Extract: Quarterly Results from both URLs")
        if SELENIUM_AVAILABLE:
            print("✅ Features: Skip RAW PDF, Extract + sign data, Timer")
        else:
            print("⚠️ Features: Skip RAW PDF, Basic extraction, Timer")
            print("💡 Install Selenium for + sign data: pip install selenium")
        print("="*60)
        
        # START TIMER
        start_time = time.time()
        print(f"⏱️  Started at: {datetime.now().strftime('%H:%M:%S')}")
        
        # Setup Selenium if available
        if self.use_selenium:
            if not self.setup_selenium():
                print("⚠️ Falling back to basic extraction")
        
        # Read symbols
        csv_path = r"E:\JN\TestSymbol.csv"
        symbols = self.read_symbols_from_csv(csv_path)
        
        if not symbols:
            print("❌ No symbols to process. Exiting.")
            if self.use_selenium:
                self.close_selenium()
            return
        
        print(f"\n✅ Will process {len(symbols)} symbols")
        
        # Process each symbol
        total_files_created = []
        successful_extractions = 0
        symbol_timings = []
        
        for i, symbol in enumerate(symbols, 1):
            symbol_start_time = time.time()
            
            print(f"\n{'='*40} {i}/{len(symbols)} {'='*40}")
            print(f"🎯 Processing: {symbol} - #quarters section")
            print(f"⏱️  Symbol started at: {datetime.now().strftime('%H:%M:%S')}")
            
            # Add delay between requests
            if i > 1:
                print("⏳ Waiting 3 seconds...")
                time.sleep(3)
            
            # Extract quarters section data
            quarters_data = self.extract_quarters_section_data(symbol)
            
            if quarters_data:
                # Save data
                saved_files = self.save_quarters_data(symbol, quarters_data)
                total_files_created.extend(saved_files)
                successful_extractions += 1
                
                # Quick verification
                self.verify_csv_files(saved_files)
            else:
                print(f"⚠️ No quarters section data found for {symbol}")
            
            # Symbol timing
            symbol_end_time = time.time()
            symbol_time = symbol_end_time - symbol_start_time
            symbol_timings.append((symbol, symbol_time))
            print(f"⏱️  {symbol} completed in: {symbol_time:.2f} seconds")
        
        # Close Selenium
        if self.use_selenium:
            self.close_selenium()
        
        # FINAL TIMER & SUMMARY
        end_time = time.time()
        total_time = end_time - start_time
        
        print(f"\n{'='*60}")
        print("📊 QUARTERS SECTION EXTRACTION SUMMARY")
        print(f"{'='*60}")
        print(f"⏱️  TOTAL TIME: {total_time:.2f} seconds ({total_time/60:.2f} minutes)")
        print(f"⏱️  Average per symbol: {total_time/len(symbols):.2f} seconds")
        print(f"📈 Symbols processed: {len(symbols)}")
        print(f"✅ Successful extractions: {successful_extractions}")
        print(f"💾 CSV files created: {len(total_files_created)}")
        
        # Individual symbol timings
        if symbol_timings:
            print(f"\n⏱️  INDIVIDUAL SYMBOL TIMINGS:")
            for symbol, timing in symbol_timings:
                print(f"   {symbol}: {timing:.2f} seconds")
        
        if total_files_created:
            print(f"\n📁 Files created in '{self.results_folder}' folder:")
            for file_path in total_files_created:
                print(f"   - {os.path.basename(file_path)}")
        else:
            print("❌ No files were created!")
        
        print(f"\n🎯 FEATURES IMPLEMENTED:")
        print("✅ 1. RAW PDF rows skipped automatically")
        if SELENIUM_AVAILABLE:
            print("✅ 2. Expandable rows (+ sign data) extracted with Selenium")
        else:
            print("⚠️ 2. Basic extraction only (install Selenium for + sign data)")
        print("✅ 3. Timer showing total and per-symbol timing")
        
        print(f"\n🎯 TO GET + SIGN DATA:")
        print("💡 1. Install Selenium: pip install selenium")
        print("💡 2. Download ChromeDriver from: https://chromedriver.chromium.org/")
        print("💡 3. Make sure ChromeDriver is in your PATH")

# Script entry point: print the banner, run the extractor once, then wait for
# the user so the console window stays open.
if __name__ == "__main__":
    print(
        "📈 STEP 1: #quarters Section with + Sign Data",
        "🎯 Goal: Extract expandable data (YoY Growth, Material Cost%)",
        "🎯 Method: Selenium for dynamic content + basic fallback",
        sep="\n",
    )
    print()

    extractor = QuarterlyExtractor()
    extractor.run_quarters_extraction()

    print(
        "\n🎉 STEP 1 COMPLETE!",
        "💡 Check CSV files for expandable data (indented rows)",
        "💡 If no expandable data, install Selenium + ChromeDriver",
        sep="\n",
    )
    input("\nPress Enter to exit...")

✅ Selenium available - Will extract + sign expandable data
📈 STEP 1: #quarters Section with + Sign Data
🎯 Goal: Extract expandable data (YoY Growth, Material Cost%)
🎯 Method: Selenium for dynamic content + basic fallback

🚀 QUARTERS SECTION EXTRACTOR - STEP 1
🎯 Consolidated: /company/{symbol}/consolidated/#quarters
🎯 Standalone:   /company/{symbol}/#quarters
📋 Extract: Quarterly Results from both URLs
✅ Features: Skip RAW PDF, Extract + sign data, Timer
⏱️  Started at: 09:20:56
✅ Selenium WebDriver initialized

🔍 Step 1: Reading symbols from: E:\JN\TestSymbol.csv
📊 CSV columns found: ['SYMBOL']
✅ Found 1 symbols using column: SYMBOL
📋 Final symbols: ['RELIANCE']

✅ Will process 1 symbols

🎯 Processing: RELIANCE - #quarters section
⏱️  Symbol started at: 09:21:04

📈 Step 2: Processing RELIANCE - #quarters section
🌐 Fetching CONSOLIDATED: https://www.screener.in/company/RELIANCE/consolidated/#quarters
🤖 Using Selenium for consolidated data...
📊 Found 12 tables

🔍 Checking table 1 for qua


Press Enter to exit... 
