In [5]:
import os
import json
import requests
import pypdf
import re
from pathlib import Path
from datetime import datetime
import pandas as pd

# Configuration
OLLAMA_URL = 'http://localhost:11434/api/generate'
MODEL_NAME = 'llama3.1'
DOCUMENTS_FOLDER = '/Users/macbookair/Downloads/Rationale'  # Change this to your documents folder path
OUTPUT_FILE = 'extracted_ratings_extended.json'

def clean_text(text):
    """Clean and normalize extracted text"""
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def read_document(file_path):
    """Read document content based on file type"""
    file_path = Path(file_path)
    
    try:
        if file_path.suffix.lower() == '.pdf':
            with open(file_path, 'rb') as file:
                pdf_reader = pypdf.PdfReader(file)
                content = ""
                for page_num, page in enumerate(pdf_reader.pages):
                    page_text = page.extract_text()
                    if page_text.strip():
                        content += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                return clean_text(content)
        else:
            # Handle text files
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                content = file.read()
                return clean_text(content)
    except Exception as e:
        print(f"Error reading {file_path}: {str(e)}")
        return None

def extract_rating_information(document_content, filename):
    """Extract rating information using LLM"""
    
    extraction_prompt = f"""
You are a financial analyst tasked with extracting specific information from rating agency documents. 

Analyze the following document and extract ONLY the information that is explicitly mentioned:

DOCUMENT:
{document_content}

Extract the following information and format as JSON:

1. Company Name: The name of the entity being rated
2. Assessment Year and Month: When the assessment/rating was conducted or published
3. Rating Action Basis: The primary reason or basis for the rating action
4. Rating Drivers (Positive): Factors that support or could improve the rating
5. Rating Drivers (Negative): Factors that constrain or could worsen the rating  
6. Rating Triggers (Upgrade/Downgrade): Specific conditions that could lead to rating changes
7. ESG Descriptor: Any Environmental, Social, Governance factors mentioned
8. Related Criteria: References to rating methodologies, criteria, or frameworks used
9. Specific Ratings: Extract exact rating levels and what they apply to

CRITICAL INSTRUCTIONS:
- If any information is not explicitly mentioned in the document, state "Not mentioned in the document"
- Use precise, analytical language
- Quote directly from the document when possible
- Be thorough but only include information that is clearly stated
- For the Rating Drivers, both Positive and Negative, please include more points into the JSON instead of just 1 point to avoid over-summarization
- For Rating Triggers, both Upgrade and Downgrade, please include more points into the JSON instead of just 1 point to avoud over-summarization
- Please do not provide information that is not in the document

Output ONLY a valid JSON object in this exact format:

{{
  "company_name": "",
  "assessment_year": "",
  "assessment_month": "",
  "rating_action_basis": "",
  "rating_drivers_positive": [],
  "rating_drivers_negative": [],
  "rating_triggers_upgrade": [],
  "rating_triggers_downgrade": [],
  "esg_descriptor": "",
  "related_criteria": [],
  "specific_ratings": []
}}
"""

    headers = {'Content-Type': 'application/json'}
    data = {
        'model': MODEL_NAME,
        'prompt': extraction_prompt,
        'stream': False,
        'options': {
            'temperature': 0.1,  # Low temperature for consistency
            'num_ctx': 8192,     # Large context window for long documents
            'top_p': 0.9
        }
    }
    
    try:
        print(f"Processing {filename}...")
        response = requests.post(OLLAMA_URL, headers=headers, json=data, timeout=120)
        
        if response.status_code == 200:
            result = response.json()
            ai_response = result.get('response', '')
            
            # Try to extract JSON from the response
            try:
                # Find JSON in the response (in case there's extra text)
                json_start = ai_response.find('{')
                json_end = ai_response.rfind('}') + 1
                
                if json_start != -1 and json_end != 0:
                    json_str = ai_response[json_start:json_end]
                    extracted_data = json.loads(json_str)
                    extracted_data['source_file'] = filename
                    extracted_data['extraction_timestamp'] = datetime.now().isoformat()
                    return extracted_data
                else:
                    print(f"No valid JSON found in response for {filename}")
                    return None
                    
            except json.JSONDecodeError as e:
                print(f"JSON parsing error for {filename}: {str(e)}")
                print(f"Raw response: {ai_response[:500]}...")
                return None
                
        else:
            print(f"API error for {filename}: {response.status_code} - {response.text}")
            return None
            
    except Exception as e:
        print(f"Error processing {filename}: {str(e)}")
        return None

def process_documents_batch(documents_folder, supported_extensions=None):
    """Process all documents in a folder"""
    
    if supported_extensions is None:
        supported_extensions = {'.pdf', '.txt', '.md', '.py', '.js', '.html', '.css', '.json'}
    
    documents_path = Path(documents_folder)
    
    if not documents_path.exists():
        print(f"Documents folder '{documents_folder}' does not exist!")
        print("Please create the folder and add your documents.")
        return []
    
    # Find all supported document files
    document_files = []
    for ext in supported_extensions:
        document_files.extend(documents_path.glob(f"*{ext}"))
    
    if not document_files:
        print(f"No supported documents found in '{documents_folder}'")
        print(f"Supported extensions: {supported_extensions}")
        return []
    
    print(f"Found {len(document_files)} documents to process:")
    for file in document_files:
        print(f"  - {file.name}")
    
    extracted_results = []
    
    # Process each document
    for i, doc_file in enumerate(document_files, 1):
        print(f"\n[{i}/{len(document_files)}] Processing: {doc_file.name}")
        
        # Read document content
        content = read_document(doc_file)
        if content is None:
            print(f"Skipping {doc_file.name} due to read error")
            continue
        
        if len(content.strip()) == 0:
            print(f"Skipping {doc_file.name} - empty content")
            continue
        
        # Extract information
        extracted_info = extract_rating_information(content, doc_file.name)
        
        if extracted_info:
            extracted_results.append(extracted_info)
            print(f"✓ Successfully extracted information from {doc_file.name}")
        else:
            print(f"✗ Failed to extract information from {doc_file.name}")
    
    return extracted_results

def save_results(results, output_file):
    """Save extraction results to JSON file"""
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to: {output_file}")
        
        # Also create a summary CSV
        if results:
            df_data = []
            for result in results:
                row = {
                    'source_file': result.get('source_file', ''),
                    'company_name': result.get('company_name', ''),
                    'assessment_year': result.get('assessment_year', ''),
                    'assessment_month': result.get('assessment_month', ''),
                    'rating_action_basis': result.get('rating_action_basis', '')[:100] + '...' if len(result.get('rating_action_basis', '')) > 100 else result.get('rating_action_basis', ''),
                    'num_positive_drivers': len(result.get('rating_drivers_positive', [])),
                    'num_negative_drivers': len(result.get('rating_drivers_negative', [])),
                    'num_specific_ratings': len(result.get('specific_ratings', [])),
                    'extraction_timestamp': result.get('extraction_timestamp', '')
                }
                df_data.append(row)
            
            df = pd.DataFrame(df_data)
            csv_file = output_file.replace('.json', '_summary.csv')
            df.to_csv(csv_file, index=False)
            print(f"Summary saved to: {csv_file}")
            
    except Exception as e:
        print(f"Error saving results: {str(e)}")

def display_sample_result(results):
    """Display a sample result for verification"""
    if results:
        print("\n" + "="*60)
        print("SAMPLE EXTRACTION RESULT:")
        print("="*60)
        sample = results[0]
        print(f"File: {sample.get('source_file', 'Unknown')}")
        print(f"Company: {sample.get('company_name', 'Not found')}")
        print(f"Assessment: {sample.get('assessment_month', 'Not found')} {sample.get('assessment_year', 'Not found')}")
        print(f"Rating Action Basis: {sample.get('rating_action_basis', 'Not found')[:200]}...")
        print(f"Positive Drivers: {len(sample.get('rating_drivers_positive', []))} found")
        print(f"Negative Drivers: {len(sample.get('rating_drivers_negative', []))} found")
        print(f"Specific Ratings: {len(sample.get('specific_ratings', []))} found")
        print("="*60)

# MAIN EXECUTION LOOP
if __name__ == "__main__":
    print("Financial Rating Information Extraction Tool")
    print("=" * 50)
    
    # Check if Ollama is running
    try:
        test_response = requests.get('http://localhost:11434/api/tags', timeout=5)
        if test_response.status_code != 200:
            print("⚠️  Warning: Ollama may not be running properly")
    except:
        print("❌ Error: Cannot connect to Ollama. Make sure it's running on localhost:11434")
        exit(1)
    
    print(f"📁 Looking for documents in: {DOCUMENTS_FOLDER}")
    print(f"🤖 Using model: {MODEL_NAME}")
    print(f"💾 Output file: {OUTPUT_FILE}")
    print()
    
    # Process all documents
    results = process_documents_batch(DOCUMENTS_FOLDER)
    
    if results:
        print(f"\n🎉 Successfully processed {len(results)} documents!")
        
        # Save results
        save_results(results, OUTPUT_FILE)
        
        # Display sample
        display_sample_result(results)
        
        print(f"\n📊 Summary:")
        print(f"  - Documents processed: {len(results)}")
        print(f"  - Results saved to: {OUTPUT_FILE}")
        print(f"  - Summary CSV created: {OUTPUT_FILE.replace('.json', '_summary.csv')}")
        
    else:
        print("\n❌ No documents were successfully processed.")
        print("Please check:")
        print("  1. Documents exist in the specified folder")
        print("  2. Documents are in supported formats")
        print("  3. Ollama is running and accessible")
        print("  4. The model 'llama3.1' is available")

# OPTIONAL: Interactive mode for single document testing
def test_single_document(file_path):
    """Test extraction on a single document"""
    print(f"Testing single document: {file_path}")
    
    content = read_document(file_path)
    if content:
        result = extract_rating_information(content, Path(file_path).name)
        if result:
            print(json.dumps(result, indent=2))
        else:
            print("Failed to extract information")
    else:
        print("Failed to read document")

# Uncomment the line below to test a single document:
# test_single_document("./path/to/your/test/document.pdf")

Financial Rating Information Extraction Tool
📁 Looking for documents in: /Users/macbookair/Downloads/Rationale
🤖 Using model: llama3.1
💾 Output file: extracted_ratings_extended.json

Found 28 documents to process:
  - Mydin_Rationale_final (Dec 2016).pdf
  - Mydin_Rationale (Dec 2019).pdf
  - Bank Muamalat (April 2010 Rationale).pdf
  - Bank Muamalat_ June 2019.pdf
  - Mydin Rationale_Dec 2014.pdf
  - AEON_Rationale (Apr 2018).pdf
  - Bank Muamalat rationale (May 2016).pdf
  - AEON_Rationale (final).pdf
  - Mydin (Jan 2013).pdf
  - Sample Report 5.pdf
  - Sample Report 4.pdf
  - Mydin Rationale (Dec 2017).pdf
  - Sample Report 1.pdf
  - Sample Report 3.pdf
  - Bank Muamalat rationale (June 2012).pdf
  - Sample Report 2.pdf
  - Mydin - Dec 2018 (Final).pdf
  - Mydin_Rationale_Dec2015.pdf
  - Bank Muamalat (May 2011 Rationale) Final Rating.pdf
  - Bank Muamalat (March 2009).pdf
  - AEON (M) Rationale (2020) - Final.pdf
  - Mydin - Rationale (final rating)_Nov 2011.pdf
  - Bank Muamalat_R

In [3]:
test_single_document('/Users/macbookair/Downloads/AEON_Rationale (Apr 2017).pdf')

Testing single document: /Users/macbookair/Downloads/AEON_Rationale (Apr 2017).pdf
Processing AEON_Rationale (Apr 2017).pdf...
{
  "company_name": "AEON CO. BHD.",
  "assessment_year": "2017",
  "assessment_month": "Not mentioned in the document",
  "rating_action_basis": "The company's financial performance and debt levels",
  "rating_drivers_positive": [
    "Strong revenue growth"
  ],
  "rating_drivers_negative": [
    "High debt levels",
    "Weak profitability"
  ],
  "rating_triggers_upgrade": [
    "Improvement in profitability",
    "Reduction in debt levels"
  ],
  "rating_triggers_downgrade": [
    "Further deterioration in financial performance",
    "Increase in debt levels"
  ],
  "esg_descriptor": "Not mentioned in the document",
  "related_criteria": [
    "RAM Rating methodologies and criteria"
  ],
  "specific_ratings": [
    {
      "rating_level": "BBB- (Stable)",
      "applicable_to": "Long-term issuer rating"
    },
    {
      "rating_level": "A3 (Stable)",
    