In [1]:
import re
import spacy

# Load spaCy's "en_core_web_sm" English pipeline once at module level.
# The TF-IDF `extractive_summary` cell calls this `nlp` object to split text
# into sentences, so this cell must run before that function is used.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Strip bracketed numeric references and normalize whitespace.

    Args:
        text: Raw input string.

    Returns:
        ``text`` with every ``[digits]`` marker (square brackets enclosing
        zero or more digits) removed, each run of whitespace collapsed to a
        single space, and leading/trailing whitespace trimmed.
    """
    # Remove references *before* collapsing whitespace: deleting a marker that
    # sits between two spaces ("a [1] b") would otherwise leave a double space.
    text = re.sub(r'\[[0-9]*\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def extractive_summary(text, num_sentences=3):
    """Pick the sentences most similar to the document's overall TF-IDF profile.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``; each sentence is TF-IDF vectorized and scored by cosine similarity
    against the sum of all sentence vectors. The highest-scoring sentences are
    returned in their original document order.

    NOTE: later cells in this notebook define another ``extractive_summary``,
    which shadows this one once they are executed.

    Args:
        text: Document to summarize.
        num_sentences: Maximum number of sentences to keep. If the document
            has fewer sentences, all of them are returned.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when ``text`` is empty/blank, no sentences are found, or
        ``num_sentences`` is not positive.
    """
    # Guard: with num_sentences == 0 the slice ``[-0:]`` below would select
    # *every* sentence, and blank text would make TfidfVectorizer raise.
    if num_sentences <= 0 or not text or not text.strip():
        return ''

    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if not sentences:
        return ''

    # TF-IDF for sentence embeddings
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    # Summing a scipy sparse matrix yields ``np.matrix``, which recent
    # scikit-learn versions reject; convert to a plain (1, n_features) ndarray.
    centroid = np.asarray(X.sum(axis=0)).reshape(1, -1)

    # Similarity of each sentence to the whole-document vector
    scores = cosine_similarity(X, centroid).flatten()

    # Top N sentences, restored to document order
    top_indices = np.argsort(scores)[-num_sentences:]
    top_sentences = [sentences[i] for i in sorted(top_indices)]

    return ' '.join(top_sentences)


In [3]:
from transformers import pipeline

# TODO: load summarizer — this cell imports `pipeline` but never instantiates one.


In [22]:
from sumy.parsers.plaintext import PlaintextParser
from nltk.tokenize import sent_tokenize
from sumy.summarizers.lsa import LsaSummarizer

def extractive_summary(text, num_sentences=3):
    """Summarize ``text`` by extracting sentences with sumy's LSA summarizer.

    Args:
        text: Plain-text document to summarize.
        num_sentences: Number of sentences to request from the summarizer.

    Returns:
        The selected sentences joined by single spaces; an empty string when
        ``text`` is empty or blank.
    """
    # Nothing to summarize; skip building the tokenizer and parser.
    if not text or not text.strip():
        return ""

    # PlaintextParser needs a tokenizer *object* exposing ``to_sentences`` and
    # ``to_words``; a bare callable such as a lambda around ``sent_tokenize``
    # fails with AttributeError when the document is parsed. Use sumy's own
    # Tokenizer instead. It is imported locally because this cell's imports do
    # not include it.
    from sumy.nlp.tokenizers import Tokenizer

    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return " ".join(str(sentence) for sentence in summary)


In [28]:
import nltk
# Fetch NLTK's 'punkt' package at cell-execution time (the recorded output
# shows it was already up to date on the last run).
# Presumably needed by sumy's Tokenizer("english") used below — TODO confirm.
nltk.download('punkt')
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

def extractive_summary(text, num_sentences=3):
    """Return an LSA-based extractive summary of ``text``.

    The text is parsed with sumy's ``PlaintextParser`` using the English
    ``Tokenizer``, then ``LsaSummarizer`` is asked for ``num_sentences``
    sentences.

    Args:
        text: Plain-text document to summarize.
        num_sentences: Number of sentences to request from the summarizer.

    Returns:
        The string form of each selected sentence, joined by single spaces.
    """
    english_tokenizer = Tokenizer("english")
    parsed = PlaintextParser.from_string(text, english_tokenizer)
    lsa = LsaSummarizer()
    chosen = lsa(parsed.document, num_sentences)
    return " ".join(map(str, chosen))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ST\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
