<a href="https://colab.research.google.com/github/venkatsai2730/PDF-summarizer/blob/main/Insights_of_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pymupdf spacy textblob tabulate
!python -m spacy download en_core_web_lg


import fitz  # PyMuPDF
import re
import spacy
import logging
import pandas as pd
from textblob import TextBlob
from collections import defaultdict
from tabulate import tabulate
from google.colab import files
import subprocess

# Load the large English spaCy pipeline, downloading it on first use.
# spacy.load raises OSError when the model package is not installed.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    import sys

    print("Downloading 'en_core_web_lg' model...")
    # Run the download with the interpreter that is executing this code
    # (not whatever "python" happens to be first on PATH) so the model is
    # installed into the environment this process imports from.
    # check=True surfaces a failed download immediately instead of letting
    # it show up as a second, unexplained OSError from spacy.load below.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_lg"],
        check=True,
    )
    nlp = spacy.load("en_core_web_lg")

# Configure logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class FinancialPDFAnalyzer:
    """
    Analyze a financial PDF document for investment-related insights.

    The pipeline (``run_analysis``) extracts the PDF text, normalizes its
    whitespace, sorts sentences into keyword-driven categories, scores the
    sentiment of the management commentary and writes a plain-text report.
    """

    # Category name -> trigger keywords. Insertion order matters: a sentence
    # is filed under the FIRST category that has a matching keyword.
    CATEGORIES = {
        "Future Growth": ["growth prospects", "market expansion", "forecast"],
        "Business Model Changes": ["strategy", "acquisition", "merger", "restructuring"],
        "Market Triggers": ["inflation", "interest rates", "policy change"],
        "Material Information": ["earnings impact", "financial statement"],
        "Risk Factors": ["challenges", "uncertainty", "lawsuit", "shortfall"],
        "Financial Metrics": ["revenue", "profit", "EBITDA", "margin"],
        "Management Commentary": ["CEO", "executive", "board", "guidance"],
    }

    def __init__(self, pdf_path):
        """Store the path of the PDF to analyze and create empty result holders."""
        self.pdf_path = pdf_path
        # Full document text; filled by extract_text / preprocess_text.
        self.text = ""
        # Category name -> list of extracted strings for the report.
        self.data = defaultdict(list)

    def extract_text(self):
        """
        Extract the text of every page of the PDF into ``self.text``.

        Extraction is best-effort: any failure is logged and the current
        (possibly empty) text is returned instead of raising.

        Returns:
            The extracted text, pages joined with newlines.
        """
        logging.info("Extracting text from PDF...")
        try:
            # Context manager closes the document even if a page fails.
            with fitz.open(self.pdf_path) as doc:
                self.text = "\n".join(page.get_text("text") for page in doc)
        except Exception as e:
            logging.error(f"Error extracting text: {e}")
        return self.text

    def preprocess_text(self):
        """Collapse every run of whitespace (newlines included) to one space and trim."""
        logging.info("Preprocessing text...")
        # \s already covers newlines, so a separate newline pass is unnecessary.
        self.text = re.sub(r'\s+', ' ', self.text).strip()

    @classmethod
    def _categorize(cls, sentence):
        """Return the first category with a keyword occurring in *sentence*, or None."""
        lowered = sentence.lower()
        for category, keywords in cls.CATEGORIES.items():
            # Lower-case the keyword too: "CEO" and "EBITDA" could otherwise
            # never match the lower-cased sentence.
            if any(keyword.lower() in lowered for keyword in keywords):
                return category
        return None

    def extract_key_information(self):
        """
        File each sentence of ``self.text`` under its first matching category.

        Sentences are produced by the module-level spaCy pipeline ``nlp``;
        sentences matching no keyword are dropped.
        """
        logging.info("Extracting key financial insights...")
        doc = nlp(self.text)

        for sent in doc.sents:
            category = self._categorize(sent.text)
            if category is not None:
                self.data[category].append(sent.text)

    def analyze_sentiment(self):
        """
        Store the mean TextBlob polarity of the management commentary.

        The result is written to ``self.data["Sentiment Analysis"]`` as a
        single formatted string; the score is 0 when there is no commentary.
        """
        logging.info("Performing sentiment analysis...")
        # .get() avoids the defaultdict side effect of creating an empty
        # "Management Commentary" entry that would then appear in the report.
        commentary = self.data.get("Management Commentary", [])
        sentiments = [TextBlob(text).sentiment.polarity for text in commentary]
        avg_sentiment = sum(sentiments) / len(sentiments) if sentiments else 0
        self.data["Sentiment Analysis"] = [f"Sentiment Score: {avg_sentiment:.2f}"]

    def generate_report(self):
        """
        Write the report to ``Financial_Analysis_Report.txt`` and print a table.

        The file holds one ``## <category>`` section per category; the
        printed grid shows the same content with one extracted item per line.
        """
        logging.info("Generating investment analysis report...")
        report = "\n\n".join(
            f"## {category}\n" + "\n".join(texts)
            for category, texts in self.data.items()
        )
        # Explicit UTF-8: PDF text routinely contains non-ASCII characters
        # that a platform-default encoding may be unable to write.
        with open("Financial_Analysis_Report.txt", "w", encoding="utf-8") as f:
            f.write(report)
        # Join each category's items so a cell shows readable text rather
        # than the repr of a Python list.
        rows = [(category, "\n".join(texts)) for category, texts in self.data.items()]
        print(tabulate(rows, headers=["Category", "Extracted Information"], tablefmt="grid"))

    def run_analysis(self):
        """Run the complete pipeline: extract, preprocess, categorize, score, report."""
        self.extract_text()
        self.preprocess_text()
        self.extract_key_information()
        self.analyze_sentiment()
        self.generate_report()

if __name__ == "__main__":
    # Entry point: prompt for a PDF upload in Colab and analyze the first file.
    print("Upload a PDF file for analysis:")
    uploaded = files.upload()
    # `uploaded` is iterated as a mapping keyed by file name; it has no
    # entries when the upload dialog is cancelled, so guard against that
    # instead of failing with an IndexError.
    pdf_path = next(iter(uploaded), None)
    if pdf_path is None:
        print("No file was uploaded; nothing to analyze.")
    else:
        analyzer = FinancialPDFAnalyzer(pdf_path)
        analyzer.run_analysis()
        print("\n✅ Report saved as 'Financial_Analysis_Report.txt'")


Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.3
Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab

Saving SJS Transcript Call.pdf to SJS Transcript Call.pdf
+------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------