In [9]:
!pip install spacy
!pip install transformers
!pip install nltk
!pip install PyPDF2
!pip install python-docx --upgrade



In [10]:
import PyPDF2
import docx
import spacy
from transformers import pipeline
import os
nlp = spacy.load("en_core_web_sm")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
def read_pdf(file_path):
    text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text.strip()
def read_docx(file_path):
    doc = docx.Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text.strip()
def preprocess_text(text):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    return sentences
def extract_entities(text):
    doc = nlp(text)
    entities = [(entity.text, entity.label_) for entity in doc.ents]
    return entities
def summarize_text(text):
    if len(text) > 1024:
        text = text[:1024]
    summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
    return summary[0]['summary_text']
def process_legal_document(text):
    sentences = preprocess_text(text)
    entities = extract_entities(text)
    summary = summarize_text(text)
    return {
        "Sentences": sentences,
        "Entities": entities,
        "Summary": summary
    }
def main():
    file_path = input("Enter the path of legal file: ")
    if not os.path.isfile(file_path):
        print("Cannot Find!")
        return
    _, file_extension = os.path.splitext(file_path)
    if file_extension.lower() == '.pdf':
        document_text = read_pdf(file_path)
    elif file_extension.lower() == '.docx':
        document_text = read_docx(file_path)
    else:
        print("Only PDF or MS doc/docx files supported!!!")
        return
    result = process_legal_document(document_text)
    print("Extracted legal sentences:", result["Sentences"])
    print("Legal named entities:", result["Entities"])
    print("Final Summary:", result["Summary"])
if __name__ == "__main__":
    main()

Enter the path of legal file: /content/legal.docx
Legal named entities: [('1.', 'CARDINAL'), ('Docular Limited', 'ORG'), ('2', 'CARDINAL'), ('3', 'CARDINAL'), ('only one', 'CARDINAL'), ('4', 'CARDINAL'), ('5.', 'CARDINAL'), ('1.', 'CARDINAL'), ('Credit', 'ORG'), ('1.1', 'CARDINAL'), ('Docular', 'PERSON'), ('2.', 'CARDINAL'), ('2.1', 'CARDINAL'), ('2.2', 'CARDINAL'), ('3', 'CARDINAL'), ('3.1', 'CARDINAL'), ('3.2', 'CARDINAL'), ('Section 3.1', 'LAW'), ('4.', 'CARDINAL'), ('4.1', 'CARDINAL'), ('5.', 'CARDINAL'), ('5.1', 'CARDINAL'), ('5.2', 'CARDINAL'), ('5.3', 'CARDINAL'), ('6', 'CARDINAL'), ('6.1', 'CARDINAL'), ('6.2', 'CARDINAL'), ('6.3', 'CARDINAL'), ('7', 'CARDINAL'), ('Limits', 'ORG'), ('7.1', 'CARDINAL'), ('SEQ Legal', 'ORG'), ('Section 1: Credit\nSection', 'LAW'), ('Optional', 'ORG'), ('Section 5: Interactive', 'LAW'), ('Section 7: Limits', 'LAW'), ('Section 7.1', 'LAW')]
Final Summary: You will need to edit this template before use. Guidance notes to help you do so are set out at