In [1]:
# Class_02_B: Financial Metrics Extractor from Screener.in (FIXED VERSION)
# Target: Extract Compounded Growth, CAGR, and ROE data

import requests
from bs4 import BeautifulSoup
import re
import json
import time
import csv
import os
from datetime import datetime

class ScreenerFinancialExtractor:
    def __init__(self):
        """
        Initialize the extractor with headers to mimic browser request
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }
        
    def fetch_page(self, url):
        """
        Download a page and parse it as HTML.

        Args:
            url (str): Address of the Screener.in company page.

        Returns:
            The parsed BeautifulSoup tree, or None when the request fails
            (connection problem, timeout, or an HTTP error status).
        """
        print(f"📡 Fetching data from: {url}")

        try:
            reply = requests.get(url, headers=self.headers, timeout=10)
            # Turn 4xx/5xx answers into exceptions so they share the error path.
            reply.raise_for_status()
        except requests.RequestException as err:
            print(f"❌ Error fetching page: {err}")
            return None

        parsed = BeautifulSoup(reply.content, 'html.parser')
        print("✅ Page fetched successfully!")
        return parsed
    
    def extract_financial_metrics(self, soup):
        """
        Extract the four main financial metrics sections using the correct Screener.in structure
        Returns: Dictionary with structured financial data
        """
        print("\n🔍 Starting financial metrics extraction...")
        
        metrics = {
            'compounded_sales_growth': {},
            'compounded_profit_growth': {},
            'stock_price_cagr': {},
            'return_on_equity': {}
        }
        
        try:
            # Method 1: Find the specific tables with financial metrics
            # Screener.in uses tables with specific patterns
            
            # Look for tables containing our target metrics
            tables = soup.find_all('table')
            print(f"📊 Found {len(tables)} tables to analyze")
            
            for i, table in enumerate(tables):
                table_text = table.get_text()
                
                # Check each metric section
                if "Compounded Sales Growth" in table_text:
                    print(f"   🎯 Found Compounded Sales Growth in table {i+1}")
                    self.extract_metric_from_table(table, metrics['compounded_sales_growth'])
                
                elif "Compounded Profit Growth" in table_text:
                    print(f"   🎯 Found Compounded Profit Growth in table {i+1}")
                    self.extract_metric_from_table(table, metrics['compounded_profit_growth'])
                
                elif "Stock Price CAGR" in table_text:
                    print(f"   🎯 Found Stock Price CAGR in table {i+1}")
                    self.extract_metric_from_table(table, metrics['stock_price_cagr'])
                
                elif "Return on Equity" in table_text:
                    print(f"   🎯 Found Return on Equity in table {i+1}")
                    self.extract_metric_from_table(table, metrics['return_on_equity'])
            
            # Method 2: Direct text extraction for the specific Screener.in format
            page_text = soup.get_text()
            
            # Extract using the exact patterns seen in the fetched data
            self.extract_by_text_patterns(page_text, metrics)
            
            # Method 3: Look for div/section containers with ratios
            ratio_sections = soup.find_all(['div', 'section'], 
                                         class_=re.compile(r'ratio|metric|growth|return', re.I))
            print(f"📊 Found {len(ratio_sections)} ratio sections")
            
        except Exception as e:
            print(f"⚠️ Error in extraction: {e}")
            
        return metrics
    
    def extract_metric_from_table(self, table, target_dict):
        """
        Extract metrics from a specific table
        """
        rows = table.find_all('tr')
        
        for row in rows:
            cells = row.find_all(['td', 'th'])
            
            if len(cells) >= 2:
                period = cells[0].get_text().strip()
                value = cells[1].get_text().strip()
                
                # Clean up the period name and value
                if period and value and '%' in value:
                    # Remove extra characters and normalize
                    period = period.replace(':', '').strip()
                    value = value.strip()
                    
                    # Store the data
                    target_dict[period] = value
                    print(f"     ✅ {period}: {value}")
    
    def extract_by_text_patterns(self, page_text, metrics):
        """
        Extract metrics using text pattern matching (specific to Screener.in format)
        """
        print("   🔍 Using text pattern extraction...")
        
        # Define the exact patterns we see in Screener.in
        patterns = {
            'compounded_sales_growth': {
                'title': 'Compounded Sales Growth',
                'periods': ['10 Years', '5 Years', '3 Years', 'TTM']
            },
            'compounded_profit_growth': {
                'title': 'Compounded Profit Growth',
                'periods': ['10 Years', '5 Years', '3 Years', 'TTM']
            },
            'stock_price_cagr': {
                'title': 'Stock Price CAGR',
                'periods': ['10 Years', '5 Years', '3 Years', '1 Year']
            },
            'return_on_equity': {
                'title': 'Return on Equity',
                'periods': ['10 Years', '5 Years', '3 Years', 'Last Year']
            }
        }
        
        for metric_key, pattern_info in patterns.items():
            title = pattern_info['title']
            periods = pattern_info['periods']
            
            # Find the section in text
            title_pos = page_text.find(title)
            if title_pos != -1:
                print(f"   📍 Found {title} at position {title_pos}")
                
                # Get the section text (next 500 characters after title)
                section_text = page_text[title_pos:title_pos + 500]
                
                # Extract each period's value
                for period in periods:
                    value = self.find_value_for_period(section_text, period)
                    if value:
                        metrics[metric_key][period] = value
                        print(f"     ✅ {period}: {value}")
    
    def find_value_for_period(self, section_text, period):
        """
        Find the percentage value for a specific time period
        """
        # Create a pattern that looks for the period followed by percentage
        # Pattern: "10 Years: 10%" or "10 Years 10%"
        
        patterns = [
            rf'{re.escape(period)}[:\s]*(-?\d+(?:\.\d+)?)%',
            rf'{re.escape(period)}[:\s]*(-?\d+(?:\.\d+)?)\s*%',
            rf'{re.escape(period)}.*?(-?\d+(?:\.\d+)?)%'
        ]
        
        for pattern in patterns:
            match = re.search(pattern, section_text, re.IGNORECASE)
            if match:
                return f"{match.group(1)}%"
        
        return None
    
    def format_output(self, metrics):
        """
        Format the extracted metrics for display
        """
        print("\n" + "="*60)
        print("📈 FINANCIAL METRICS EXTRACTION RESULTS")
        print("="*60)
        
        sections = [
            ('Compounded Sales Growth', metrics['compounded_sales_growth']),
            ('Compounded Profit Growth', metrics['compounded_profit_growth']),
            ('Stock Price CAGR', metrics['stock_price_cagr']),
            ('Return on Equity', metrics['return_on_equity'])
        ]
        
        for section_name, section_data in sections:
            print(f"\n📊 {section_name}:")
            if section_data:
                for period, value in section_data.items():
                    print(f"   {period}:\t{value}")
            else:
                print("   ❌ No data extracted")
            print("-" * 40)
            
        # Summary
        total_extracted = sum(len(section_data) for _, section_data in sections)
        print(f"\n📋 SUMMARY:")
        print(f"   Total data points extracted: {total_extracted}")
        print(f"   Sections with data: {sum(1 for _, section_data in sections if section_data)}/4")
    
    def create_folder_structure(self):
        """
        Create organized folder structure for data storage
        """
        folders = [
            'extracted_data',
            'extracted_data/financial_metrics',
            'extracted_data/financial_metrics/csv',
            'extracted_data/financial_metrics/json'
        ]
        
        for folder in folders:
            if not os.path.exists(folder):
                os.makedirs(folder)
                print(f"📁 Created folder: {folder}")
        
        return 'extracted_data/financial_metrics'
    
    def save_to_csv(self, metrics, company_name="COMPANY"):
        """
        Save extracted metrics to CSV file in organized folder
        """
        try:
            # Create folder structure
            base_folder = self.create_folder_structure()
            
            # Generate filename with timestamp
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            csv_filename = f"{base_folder}/csv/{company_name}_financial_metrics_{timestamp}.csv"
            
            # Prepare data for CSV
            csv_data = []
            
            # Add header
            csv_data.append(['Metric Category', 'Time Period', 'Value'])
            
            # Add data rows
            metric_names = {
                'compounded_sales_growth': 'Compounded Sales Growth',
                'compounded_profit_growth': 'Compounded Profit Growth',
                'stock_price_cagr': 'Stock Price CAGR',
                'return_on_equity': 'Return on Equity'
            }
            
            for metric_key, metric_data in metrics.items():
                category_name = metric_names.get(metric_key, metric_key)
                
                if metric_data:
                    for period, value in metric_data.items():
                        csv_data.append([category_name, period, value])
                else:
                    csv_data.append([category_name, 'No Data', 'N/A'])
            
            # Write to CSV
            with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerows(csv_data)
            
            print(f"\n💾 CSV data saved to: {csv_filename}")
            
            # Also save JSON for backup
            json_filename = f"{base_folder}/json/{company_name}_financial_metrics_{timestamp}.json"
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(metrics, f, indent=2, ensure_ascii=False)
            print(f"💾 JSON backup saved to: {json_filename}")
            
            return csv_filename, json_filename
            
        except Exception as e:
            print(f"❌ Error saving files: {e}")
            return None, None
    
    def create_summary_csv(self, metrics, company_name="COMPANY"):
        """
        Create a summary CSV with all metrics in one row (for easy comparison)
        """
        try:
            base_folder = self.create_folder_structure()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            summary_filename = f"{base_folder}/csv/{company_name}_summary_{timestamp}.csv"
            
            # Prepare headers
            headers = ['Company', 'Date_Extracted']
            
            # Add all metric columns
            metric_columns = [
                'Sales_Growth_10Y', 'Sales_Growth_5Y', 'Sales_Growth_3Y', 'Sales_Growth_TTM',
                'Profit_Growth_10Y', 'Profit_Growth_5Y', 'Profit_Growth_3Y', 'Profit_Growth_TTM',
                'Stock_CAGR_10Y', 'Stock_CAGR_5Y', 'Stock_CAGR_3Y', 'Stock_CAGR_1Y',
                'ROE_10Y', 'ROE_5Y', 'ROE_3Y', 'ROE_Last_Year'
            ]
            
            headers.extend(metric_columns)
            
            # Prepare data row
            data_row = [company_name, datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
            
            # Map data to columns
            sales_data = metrics['compounded_sales_growth']
            profit_data = metrics['compounded_profit_growth']
            stock_data = metrics['stock_price_cagr']
            roe_data = metrics['return_on_equity']
            
            # Add sales growth data
            data_row.extend([
                sales_data.get('10 Years', 'N/A'),
                sales_data.get('5 Years', 'N/A'),
                sales_data.get('3 Years', 'N/A'),
                sales_data.get('TTM', 'N/A')
            ])
            
            # Add profit growth data
            data_row.extend([
                profit_data.get('10 Years', 'N/A'),
                profit_data.get('5 Years', 'N/A'),
                profit_data.get('3 Years', 'N/A'),
                profit_data.get('TTM', 'N/A')
            ])
            
            # Add stock CAGR data
            data_row.extend([
                stock_data.get('10 Years', 'N/A'),
                stock_data.get('5 Years', 'N/A'),
                stock_data.get('3 Years', 'N/A'),
                stock_data.get('1 Year', 'N/A')
            ])
            
            # Add ROE data
            data_row.extend([
                roe_data.get('10 Years', 'N/A'),
                roe_data.get('5 Years', 'N/A'),
                roe_data.get('3 Years', 'N/A'),
                roe_data.get('Last Year', 'N/A')
            ])
            
            # Write to CSV
            with open(summary_filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(headers)
                writer.writerow(data_row)
            
            print(f"📊 Summary CSV saved to: {summary_filename}")
            return summary_filename
            
        except Exception as e:
            print(f"❌ Error creating summary CSV: {e}")
            return None
            
    def extract_company_name(self, soup):
        """
        Extract company name from the page for better file naming
        """
        try:
            # Try different selectors for company name
            title_tag = soup.find('title')
            if title_tag:
                title_text = title_tag.get_text()
                # Extract company name from title (usually first part)
                company_name = title_text.split(' share price')[0].split(' |')[0]
                # Clean the name for filename
                company_name = re.sub(r'[^\w\s-]', '', company_name).strip()
                company_name = re.sub(r'\s+', '_', company_name)
                return company_name
            
            # Backup: try to find h1 with company name
            h1_tag = soup.find('h1')
            if h1_tag:
                company_name = h1_tag.get_text().strip()
                company_name = re.sub(r'[^\w\s-]', '', company_name).strip()
                company_name = re.sub(r'\s+', '_', company_name)
                return company_name
                
        except Exception as e:
            print(f"⚠️ Could not extract company name: {e}")
            
        return "UNKNOWN_COMPANY"
        """
        Debug function to understand page structure better
        """
        print("\n🔧 DEBUG: Page Structure Analysis")
        print("-" * 40)
        
        # Find all text containing our target words
        page_text = soup.get_text()
        
        target_phrases = [
            "Compounded Sales Growth",
            "Compounded Profit Growth", 
            "Stock Price CAGR",
            "Return on Equity"
        ]
        
        for phrase in target_phrases:
            pos = page_text.find(phrase)
            if pos != -1:
                # Show context around the phrase
                start = max(0, pos - 50)
                end = min(len(page_text), pos + 200)
                context = page_text[start:end].replace('\n', ' ').replace('  ', ' ')
                print(f"\n🔍 Found '{phrase}' at position {pos}:")
                print(f"   Context: ...{context}...")
            else:
                print(f"❌ '{phrase}' not found in page text")


def main():
    """
    Run the Class_02_B demo end to end (FIXED VERSION).

    Fetches one Screener.in company page, extracts the company name and the
    financial metrics, prints a report and saves the detailed CSV, the JSON
    backup and the summary CSV.
    """
    print("🚀 Class_02_B: Financial Metrics Extractor (FIXED)")
    print("=" * 50)

    extractor = ScreenerFinancialExtractor()

    # Example company (Reliance Industries); point this at another
    # Screener.in company page to try a different company.
    url = "https://www.screener.in/company/RELIANCE/consolidated/"

    soup = extractor.fetch_page(url)
    if not soup:
        print("❌ Failed to fetch page. Check URL and internet connection.")
        return

    # The company name is used to label the output files.
    company_name = extractor.extract_company_name(soup)
    print(f"🏢 Company identified as: {company_name}")

    metrics = extractor.extract_financial_metrics(soup)
    extractor.format_output(metrics)

    # Detailed CSV + JSON backup, then the one-row summary CSV.
    csv_file, json_file = extractor.save_to_csv(metrics, company_name)
    summary_file = extractor.create_summary_csv(metrics, company_name)

    if not (csv_file and json_file):
        print("\n⚠️ Data extraction completed but file saving failed.")
    else:
        print("\n✅ Class_02_B completed successfully!")
        print("✅ Files saved in organized folder structure")
        print(f"📁 CSV: {csv_file}")
        print(f"📁 JSON: {json_file}")
        if summary_file:
            print(f"📁 Summary: {summary_file}")

    closing_notes = (
        "\n📝 Folder Structure Created:",
        "📁 extracted_data/",
        "   📁 financial_metrics/",
        "      📁 csv/          <- Detailed & Summary CSV files",
        "      📁 json/         <- JSON backup files",
        "\n📝 Next Steps:",
        "- Open CSV files in Excel/Google Sheets for analysis",
        "- Use summary CSV to compare multiple companies",
        "- Ready for Class_03 (combining basic + financial data)",
    )
    for line in closing_notes:
        print(line)


# Run the demo only when this file is executed as a script; importing the
# module defines the class and main() without fetching anything.
if __name__ == "__main__":
    main()


# 🎓 LEARNING NOTES FOR BEGINNERS:
"""
CLASS_02_B IMPROVEMENTS - CSV FILE MANAGEMENT:

1. **ORGANIZED FOLDER STRUCTURE**:
   📁 extracted_data/
      📁 financial_metrics/
         📁 csv/     <- CSV files for spreadsheet analysis
         📁 json/    <- JSON backup files

2. **TWO TYPES OF CSV FILES**:
   - **Detailed CSV**: Each metric as separate rows
   - **Summary CSV**: All metrics in one row (great for comparing companies)

3. **AUTOMATIC FILE NAMING**:
   - Extracts company name from page
   - Adds timestamp to prevent overwrites
   - Example: "Reliance_Industries_Ltd_financial_metrics_20241201_143022.csv"

4. **CROSS-CHECKING FEATURES**:
   - Both CSV and JSON saved for verification
   - Summary format perfect for Excel pivot tables
   - Timestamped files for tracking data changes

5. **FOLDER AUTO-CREATION**:
   - Script creates all necessary folders automatically
   - Organized structure for multiple companies
   - Easy to backup and share data files

NOW YOUR DATA IS SAVED AS CSV IN ORGANIZED FOLDERS! 📊📁
"""

🚀 Class_02_B: Financial Metrics Extractor (FIXED)
📡 Fetching data from: https://www.screener.in/company/RELIANCE/consolidated/
✅ Page fetched successfully!
🏢 Company identified as: Reliance_Industries_Ltd

🔍 Starting financial metrics extraction...
📊 Found 11 tables to analyze
   🎯 Found Compounded Sales Growth in table 3
     ✅ 10 Years: 10%
     ✅ 5 Years: 10%
     ✅ 3 Years: 11%
     ✅ TTM: 6%
   🎯 Found Compounded Profit Growth in table 4
     ✅ 10 Years: 12%
     ✅ 5 Years: 10%
     ✅ 3 Years: 5%
     ✅ TTM: 9%
   🎯 Found Stock Price CAGR in table 5
     ✅ 10 Years: 22%
     ✅ 5 Years: 7%
     ✅ 3 Years: 5%
     ✅ 1 Year: -11%
   🎯 Found Return on Equity in table 6
     ✅ 10 Years: 9%
     ✅ 5 Years: 8%
     ✅ 3 Years: 9%
     ✅ Last Year: 8%
   🔍 Using text pattern extraction...
   📍 Found Compounded Sales Growth at position 9729
     ✅ 10 Years: 10%
     ✅ 5 Years: 10%
     ✅ 3 Years: 11%
     ✅ TTM: 6%
   📍 Found Compounded Profit Growth at position 9813
     ✅ 10 Years: 12%
  

'\nCLASS_02_B IMPROVEMENTS - CSV FILE MANAGEMENT:\n\n1. **ORGANIZED FOLDER STRUCTURE**:\n   📁 extracted_data/\n      📁 financial_metrics/\n         📁 csv/     <- CSV files for spreadsheet analysis\n         📁 json/    <- JSON backup files\n\n2. **TWO TYPES OF CSV FILES**:\n   - **Detailed CSV**: Each metric as separate rows\n   - **Summary CSV**: All metrics in one row (great for comparing companies)\n\n3. **AUTOMATIC FILE NAMING**:\n   - Extracts company name from page\n   - Adds timestamp to prevent overwrites\n   - Example: "Reliance_Industries_Ltd_financial_metrics_20241201_143022.csv"\n\n4. **CROSS-CHECKING FEATURES**:\n   - Both CSV and JSON saved for verification\n   - Summary format perfect for Excel pivot tables\n   - Timestamped files for tracking data changes\n\n5. **FOLDER AUTO-CREATION**:\n   - Script creates all necessary folders automatically\n   - Organized structure for multiple companies\n   - Easy to backup and share data files\n\nNOW YOUR DATA IS SAVED AS CSV IN ORG