In [2]:
#Abstractive Summarization with Hugging Face Transformers
#Uses a pre-trained model like BART
#Use this for natural language summaries (more human-like)

import docx  # python-docx to handle DOCX files
from transformers import pipeline

# Function to extract text from DOCX
def extract_docx_text(path):
    """Return the text of a DOCX file as one newline-separated string.

    Args:
        path: Location of the DOCX file, passed straight to ``docx.Document``.

    Returns:
        The ``text`` of every entry in ``Document.paragraphs``, in order,
        joined with ``"\\n"``. Only ``paragraphs`` is read from the document.
    """
    paragraphs = docx.Document(path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

# Build the Hugging Face summarization pipeline backed by facebook/bart-large-cnn
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Location of the DOCX file to summarize
docx_path = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files\Stats.docx"  # Change this to your actual file path

# Pull the paragraph text out of the document
docx_text = extract_docx_text(docx_path)

# Keep at most the first 1024 characters for this demo; slicing a shorter
# string leaves it unchanged, so no length check is needed.
# (The full document could be processed by chunking instead.)
docx_text = docx_text[:1024]

# Run the summarizer with sampling disabled and the summary length bounded
# by min_length=50 / max_length=150
summary = summarizer(docx_text, max_length=150, min_length=50, do_sample=False)

# The pipeline result is indexed as a list of dicts; show the first summary
first_result = summary[0]
print("Summary:\n", first_result['summary_text'])



Device set to use cpu


Summary:
 The Machine Readable file provides regression coefficients and intercepts for different components to calculate low, mid, and high material price estimates. The file provides a list of envelope and non-envelope components (e.g., Windows, Water Heaters) and any associated classes within those components.


In [3]:
#T5 Model with Transformers
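#t5-small handles up to 512 tokens — not characters — so we split the text into chunks of about 500 words. Word count only approximates token count; the tokenizer truncates any chunk that still exceeds 512 tokens.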

import docx  # python-docx to handle DOCX files
from transformers import T5Tokenizer, T5ForConditionalGeneration

# -------- Step 1: Extract text from DOCX --------
def extract_docx_text(path):
    """Read a DOCX file and return its paragraph text joined by newlines.

    Args:
        path: Location of the DOCX file, handed to ``docx.Document``.

    Returns:
        A single string with one line per entry of ``Document.paragraphs``.
    """
    lines = []
    for paragraph in docx.Document(path).paragraphs:
        lines.append(paragraph.text)
    return "\n".join(lines)

# -------- Step 2: Chunk text into pieces of at most `chunk_size` words --------
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are the whitespace-separated pieces returned by ``str.split()``.
    Each chunk is re-joined with single spaces, so the original line breaks
    and spacing are not preserved.

    NOTE: the limit is counted in words, not tokenizer tokens. A chunk of
    ``chunk_size`` words can encode to more tokens than a model accepts, in
    which case a caller that tokenizes with truncation drops the tail of the
    chunk. Pass a smaller ``chunk_size`` if that matters.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunk strings in document order; empty when ``text``
        contains no words.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Previously a
            negative value silently returned an empty list.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# -------- Step 3: Load T5 model and tokenizer --------
# Both objects are created from the same "t5-small" checkpoint name and are
# kept as module-level globals; summarize_with_t5 reads them directly.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# -------- Step 4: Summarize each chunk --------
def summarize_with_t5(text_chunks):
    """Summarize every chunk with the module-level T5 objects and join the results.

    Relies on the globals ``tokenizer`` and ``model`` defined elsewhere in
    the notebook.

    Args:
        text_chunks: Iterable of text strings, summarized one at a time.

    Returns:
        The per-chunk summaries, in input order, joined with single spaces
        (an empty string when there are no chunks).
    """
    # Generation settings shared by every chunk.
    generation_options = dict(
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    def _summarize_one(piece):
        # Prefix the chunk with the "summarize: " task prompt. Encoding is
        # capped at 512 tokens, so anything beyond that is truncated.
        encoded = tokenizer.encode(
            "summarize: " + piece,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        generated = model.generate(encoded, **generation_options)
        # Only the first generated sequence is decoded.
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return " ".join(_summarize_one(piece) for piece in text_chunks)

# -------- Step 5: Put it all together --------
# Pipeline: DOCX -> plain text -> word-based chunks -> one T5 summary per chunk,
# joined into a single string.
# NOTE(review): the path below is an absolute, machine-specific Windows path;
# point it at your own file before running.
docx_path = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files\Stats.docx"  # Update this to your DOCX file
full_text = extract_docx_text(docx_path)
text_chunks = chunk_text(full_text)
final_summary = summarize_with_t5(text_chunks)

# -------- Print the result --------
print("FINAL SUMMARY:\n")
print(final_summary)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


FINAL SUMMARY:

the machine readable file provides regression coefficients and intercepts for different components to calculate low, mid, and high material price estimates and labor multipliers/add-ons. the file provides a list of envelope and non-envelope components (e.g., Windows, Water Heaters) and any associated classes within those components (e.g., Low Emissivity, Electric Instantaneous) additional data The last section of the file contains additional data not directly within the calculation of each component and product class. these include the expected lifetime (in years) of the component, cost variation considerations, a list of data sources used in the analysis for each component. the labor cost is calculated by subtracting the material price from the installed cost. Example 2 Unfinished Attic Ceiling Batt Insulation (Retrofit Installation Adder) Example for calculating the low, mid, and high retail price along with the associated labor for replacing ceiling insulation in an 

In [4]:
#TextRank Summarization with Sumy
import docx  # python-docx to handle DOCX files
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# -------- Step 1: Extract text from DOCX --------
def extract_docx_text(path):
    """Load the DOCX at ``path`` and return its paragraphs as newline-joined text.

    Args:
        path: Location of the DOCX file, given to ``docx.Document``.

    Returns:
        One string containing each paragraph's ``text``, separated by ``"\\n"``.
    """
    document = docx.Document(path)
    paragraph_texts = (block.text for block in document.paragraphs)
    return "\n".join(paragraph_texts)

# -------- Step 2: Summarize the extracted text --------
def summarize_docx_text(docx_path):
    """Print a TextRank extractive summary of the DOCX file at ``docx_path``.

    The document text is parsed with sumy's English tokenizer and the
    TextRank summarizer is asked for five sentences. The result is printed
    under a "TextRank Summary:" heading, one sentence per line.

    Args:
        docx_path: Location of the DOCX file to summarize.

    Returns:
        None. The summary is written to standard output only.
    """
    parsed = PlaintextParser.from_string(
        extract_docx_text(docx_path),
        Tokenizer("english"),
    )
    text_rank = TextRankSummarizer()
    # sentences_count is the number of sentences requested for the summary.
    chosen_sentences = text_rank(parsed.document, sentences_count=5)

    print("TextRank Summary:\n")
    for chosen in chosen_sentences:
        print(chosen)

# -------- Step 3: Run the summarization --------
# NOTE(review): absolute, machine-specific Windows path; point it at your own
# DOCX file before running. The call below prints the TextRank summary.
docx_path = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files\Stats.docx"  # Update this path with your DOCX file
summarize_docx_text(docx_path)




TextRank Summary:

Introduction The purpose of the Machine Readable file (“the file”) is to provide regression coefficients and intercepts for different components to calculate low, mid, and high (10th, 50th, and 90th percentile) material price estimates and labor multipliers/add-ons to estimate new construction and retrofit project costs.
The second and third sections of the file (Retail Price Regression) show the “Coefficient-Low”, “Coefficient-Mid”, and “Coefficient-High” values that correspond to the low, mid, and high quantile regression coefficients that are used to multiply the chosen performance metric values.
After getting the estimated material price from calculating the material price regression using the coefficients, intercepts, and chosen performance metric input values, the multiplier is used to calculate the total installed cost or cost per square foot.
The numbers in red correspond to the different coefficients in the flat CSV file for the two performance metrics and t

In [5]:
#Luhn Summarizer (Sumy)
import docx  # python-docx to handle DOCX files
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.luhn import LuhnSummarizer

# -------- Step 1: Extract text from DOCX --------
def extract_docx_text(path):
    """Return all paragraph text from the DOCX at ``path`` as a single string.

    Args:
        path: Location of the DOCX file, forwarded to ``docx.Document``.

    Returns:
        The paragraphs' ``text`` values in document order, newline-separated.
    """
    separator = "\n"
    return separator.join(item.text for item in docx.Document(path).paragraphs)

# -------- Step 2: Summarize the extracted text --------
def summarize_docx_text(docx_path):
    """Print a Luhn extractive summary of the DOCX file at ``docx_path``.

    The document text is parsed with sumy's English tokenizer and the Luhn
    summarizer is asked for five sentences. The result is printed under a
    "Luhn Summary:" heading, one sentence per line.

    Args:
        docx_path: Location of the DOCX file to summarize.

    Returns:
        None. The summary is written to standard output only.
    """
    parsed = PlaintextParser.from_string(
        extract_docx_text(docx_path),
        Tokenizer("english"),
    )
    luhn = LuhnSummarizer()
    # sentences_count is the number of sentences requested for the summary.
    chosen_sentences = luhn(parsed.document, sentences_count=5)

    print("Luhn Summary:\n")
    for chosen in chosen_sentences:
        print(chosen)

# -------- Step 3: Run the summarization --------
# NOTE(review): absolute, machine-specific Windows path; point it at your own
# DOCX file before running. The call below prints the Luhn summary.
docx_path = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files\Stats.docx"  # Update this path with your DOCX file
summarize_docx_text(docx_path)



Luhn Summary:

Introduction The purpose of the Machine Readable file (“the file”) is to provide regression coefficients and intercepts for different components to calculate low, mid, and high (10th, 50th, and 90th percentile) material price estimates and labor multipliers/add-ons to estimate new construction and retrofit project costs.
After getting the estimated material price from calculating the material price regression using the coefficients, intercepts, and chosen performance metric input values, the multiplier is used to calculate the total installed cost or cost per square foot.
Prevailing local wages Drive time Access Presence/condition/type of existing insulation Existing construction and materials Moisture issues present Condition of existing flue Need for condensate line/drain Need to bring in combustion air Condition of existing electrical system Presence of hazardous materials Nature/size of leaks Extent of preparation Each regression was given a confidence rating in the 

In [6]:
#SpaCy + Sentence Scoring (Custom Extractive)
import docx  # python-docx to handle DOCX files
import spacy

# -------- Step 1: Extract text from DOCX --------
def extract_docx_text(path):
    """Collect the paragraph text of a DOCX file into one newline-joined string.

    Args:
        path: Location of the DOCX file, passed to ``docx.Document``.

    Returns:
        A string with each paragraph's ``text`` on its own line.
    """
    collected = [entry.text for entry in docx.Document(path).paragraphs]
    joined = "\n".join(collected)
    return joined

# -------- Step 2: Summarize the extracted text using SpaCy --------
def summarize_docx_text(docx_path):
    """Print the five highest-scoring sentences of the DOCX file at ``docx_path``.

    The text is processed with spaCy's ``en_core_web_sm`` pipeline. Each
    sentence is scored as (number of named entities) + (number of noun
    chunks). The top five are printed, highest score first, under a
    "SpaCy Custom Extractive Summary:" heading. Sentences with equal scores
    keep their document order.

    Args:
        docx_path: Location of the DOCX file to summarize.

    Returns:
        None. The summary is written to standard output only.
    """
    # The spaCy model is loaded on every call.
    language_model = spacy.load("en_core_web_sm")
    parsed = language_model(extract_docx_text(docx_path))

    def _score(span):
        # Simple heuristic: named entities + noun chunks
        return len(list(span.ents)) + len(list(span.noun_chunks))

    # Map each sentence to its score, in document order.
    scores = {span: _score(span) for span in parsed.sents}

    # Stable sort, highest score first; ties stay in document order.
    ranked = sorted(scores, key=lambda span: scores[span], reverse=True)

    print("SpaCy Custom Extractive Summary:\n")
    for span in ranked[:5]:
        print(span.text.strip())

# -------- Step 3: Run the summarization --------
# NOTE(review): absolute, machine-specific Windows path; point it at your own
# DOCX file before running. The call below prints the spaCy-based summary.
docx_path = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files\Stats.docx"  # Update with your DOCX file path
summarize_docx_text(docx_path)


SpaCy Custom Extractive Summary:

Prevailing local wages
Drive time
Access
Presence/condition/type of existing insulation
Existing construction and materials
Moisture issues present
Condition of existing flue
Need for condensate line/drain
Need to bring in combustion air
Condition of existing electrical system
Presence of hazardous materials
Nature/size of leaks
Extent of preparation
Each regression was given a confidence rating in the categories of sample size (SS), median  (R2), and source diversity, to qualify how robust the data and corresponding regressions are.
The numbers in red correspond to the different coefficients in the flat CSV file for the two performance metrics and the low, mid, and high regressions:

Where A is the capacity in tons, B is the efficiency in SEER1, and C is the intercept value (constant).
Introduction
The purpose of the Machine Readable file (“the file”) is to provide regression coefficients and intercepts for different components to calculate low, mid, 