In [2]:
!pip install nltk scikit-learn networkx transformers torch sumy spacy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=brea

In [8]:
# Text Summarization - Extractive and Abstractive Models in Python
# Install required packages:
# pip install scikit-learn networkx transformers torch

import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import heapq

# Optional imports for abstractive summarization
try:
    from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("Transformers not available. Install with: pip install transformers torch")

class TextProcessor:
    """Basic text processing utilities without NLTK dependency"""

    def __init__(self):
        # Common English stopwords
        self.stop_words = {
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
            'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
            'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',
            'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
            'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am',
            'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
            'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the',
            'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
            'at', 'by', 'for', 'with', 'through', 'during', 'before', 'after',
            'above', 'below', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
            'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
            'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
            'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
            'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just',
            'don', 'should', 'now'
        }

    def sent_tokenize(self, text):
        """Split text into sentences using regex"""
        # Handle common abbreviations that shouldn't trigger sentence breaks
        text = re.sub(r'\b(?:Dr|Mr|Mrs|Ms|Prof|Sr|Jr)\.\s*', lambda m: m.group().replace('.', '<DOT>'), text)

        # Split on sentence endings
        sentences = re.split(r'[.!?]+\s+', text)

        # Restore abbreviation dots and clean up
        sentences = [s.replace('<DOT>', '.').strip() for s in sentences if s.strip()]

        return sentences

    def word_tokenize(self, text):
        """Split text into words using regex"""
        # Extract words (including contractions)
        words = re.findall(r"\b\w+(?:'\w+)?\b", text.lower())
        return words

    def clean_text(self, text):
        """Clean and preprocess text"""
        text = re.sub(r'\[[0-9]*\]', ' ', text)  # Remove citations
        text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
        return text.strip()

class ExtractiveSummarizer(TextProcessor):
    def __init__(self):
        super().__init__()

    def sentence_similarity(self, sent1, sent2, stopwords=None):
        """Calculate similarity between two sentences"""
        if stopwords is None:
            stopwords = self.stop_words

        words1 = [w for w in self.word_tokenize(sent1) if w not in stopwords]
        words2 = [w for w in self.word_tokenize(sent2) if w not in stopwords]

        if not words1 or not words2:
            return 0

        all_words = list(set(words1 + words2))

        vector1 = [words1.count(w) for w in all_words]
        vector2 = [words2.count(w) for w in all_words]

        # Calculate cosine similarity
        dot_product = sum(a * b for a, b in zip(vector1, vector2))
        magnitude1 = sum(a * a for a in vector1) ** 0.5
        magnitude2 = sum(a * a for a in vector2) ** 0.5

        if magnitude1 == 0 or magnitude2 == 0:
            return 0

        return dot_product / (magnitude1 * magnitude2)

    def textrank_summarize(self, text, num_sentences=3):
        """TextRank-based extractive summarization"""
        sentences = self.sent_tokenize(text)

        if len(sentences) <= num_sentences:
            return text

        # Create similarity matrix
        similarity_matrix = np.zeros([len(sentences), len(sentences)])

        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    similarity_matrix[i][j] = self.sentence_similarity(
                        sentences[i], sentences[j]
                    )

        # Create graph and apply PageRank
        try:
            nx_graph = nx.from_numpy_array(similarity_matrix)
            scores = nx.pagerank(nx_graph, max_iter=50)
        except:
            # Fallback if PageRank fails
            scores = {i: np.sum(similarity_matrix[i]) for i in range(len(sentences))}

        # Rank sentences
        ranked_sentences = sorted(
            ((scores[i], i, s) for i, s in enumerate(sentences)),
            reverse=True
        )

        # Select top sentences and maintain original order
        selected_indices = sorted([ranked_sentences[i][1] for i in range(num_sentences)])
        summary_sentences = [sentences[i] for i in selected_indices]

        return ' '.join(summary_sentences)

    def tfidf_summarize(self, text, num_sentences=3):
        """TF-IDF based extractive summarization"""
        sentences = self.sent_tokenize(text)

        if len(sentences) <= num_sentences:
            return text

        try:
            # Calculate TF-IDF scores
            vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
            tfidf_matrix = vectorizer.fit_transform(sentences)

            # Calculate sentence scores (sum of TF-IDF values)
            sentence_scores = np.array(tfidf_matrix.sum(axis=1)).flatten()

            # Get top sentences
            top_indices = sentence_scores.argsort()[-num_sentences:][::-1]
            top_indices.sort()  # Maintain original order

            summary = ' '.join([sentences[i] for i in top_indices])
            return summary
        except:
            # Fallback to frequency-based if TF-IDF fails
            return self.frequency_summarize(text, num_sentences)

    def frequency_summarize(self, text, num_sentences=3):
        """Frequency-based extractive summarization"""
        sentences = self.sent_tokenize(text)

        if len(sentences) <= num_sentences:
            return text

        # Calculate word frequencies
        clean_text = self.clean_text(text.lower())
        words = self.word_tokenize(clean_text)
        word_frequencies = {}

        for word in words:
            if word not in self.stop_words and len(word) > 2:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1

        if not word_frequencies:
            return ' '.join(sentences[:num_sentences])

        # Normalize frequencies
        max_freq = max(word_frequencies.values())
        for word in word_frequencies:
            word_frequencies[word] = word_frequencies[word] / max_freq

        # Score sentences
        sentence_scores = {}
        for i, sentence in enumerate(sentences):
            words = self.word_tokenize(sentence.lower())
            score = 0
            word_count = 0
            for word in words:
                if word in word_frequencies:
                    score += word_frequencies[word]
                    word_count += 1

            if word_count > 0:
                sentence_scores[i] = score / word_count  # Average score

        # Get top sentences
        if not sentence_scores:
            return ' '.join(sentences[:num_sentences])

        top_indices = sorted(sentence_scores.keys(),
                           key=lambda x: sentence_scores[x],
                           reverse=True)[:num_sentences]
        top_indices.sort()  # Maintain original order

        return ' '.join([sentences[i] for i in top_indices])

class AbstractiveSummarizer:
    def __init__(self):
        self.models = {}
        if not TRANSFORMERS_AVAILABLE:
            print("Transformers library not available. Abstractive summarization disabled.")

    def load_model(self, model_name="facebook/bart-large-cnn"):
        """Load pre-trained abstractive summarization model"""
        if not TRANSFORMERS_AVAILABLE:
            raise ImportError("Transformers library not available")

        if model_name not in self.models:
            print(f"Loading model: {model_name}")
            try:
                self.models[model_name] = pipeline(
                    "summarization",
                    model=model_name,
                    tokenizer=model_name
                )
            except Exception as e:
                print(f"Failed to load model {model_name}: {e}")
                return None
        return self.models[model_name]

    def bart_summarize(self, text, max_length=150, min_length=50):
        """BART-based abstractive summarization"""
        if not TRANSFORMERS_AVAILABLE:
            print("Transformers not available. Cannot perform abstractive summarization.")
            return text[:500] + "..." if len(text) > 500 else text

        summarizer = self.load_model("facebook/bart-large-cnn")
        if summarizer is None:
            return text

        try:
            # Handle long texts
            max_input_length = 1024
            if len(text) > max_input_length:
                text = text[:max_input_length]

            summary = summarizer(text, max_length=max_length,
                               min_length=min_length, do_sample=False)
            return summary[0]['summary_text']
        except Exception as e:
            print(f"Error in BART summarization: {e}")
            return text

class HybridSummarizer:
    def __init__(self):
        self.extractive = ExtractiveSummarizer()
        self.abstractive = AbstractiveSummarizer() if TRANSFORMERS_AVAILABLE else None

    def hybrid_summarize(self, text, extract_ratio=0.4, final_length=100):
        """Combine extractive and abstractive approaches"""
        sentences = self.extractive.sent_tokenize(text)

        # First, extract key sentences
        num_extract_sentences = max(1, int(len(sentences) * extract_ratio))
        extracted_text = self.extractive.textrank_summarize(text, num_extract_sentences)

        # If abstractive summarization is available, apply it
        if self.abstractive and TRANSFORMERS_AVAILABLE:
            try:
                final_summary = self.abstractive.bart_summarize(
                    extracted_text,
                    max_length=final_length,
                    min_length=final_length//2
                )
                return final_summary
            except:
                return extracted_text
        else:
            return extracted_text

def evaluate_summary(original, summary):
    """Simple evaluation metrics"""
    processor = TextProcessor()

    orig_words = set(processor.word_tokenize(original))
    summ_words = set(processor.word_tokenize(summary))

    # Calculate basic metrics
    compression_ratio = len(summary) / len(original) if len(original) > 0 else 0
    word_overlap = len(orig_words.intersection(summ_words)) / len(orig_words) if len(orig_words) > 0 else 0

    return {
        'compression_ratio': compression_ratio,
        'word_overlap': word_overlap,
        'original_length': len(original),
        'summary_length': len(summary)
    }

# Example usage and comparison
def main():
    # Sample text
    sample_text = """
    Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to
    the natural intelligence displayed by humans and animals. Leading AI textbooks define
    the field as the study of "intelligent agents": any device that perceives its environment
    and takes actions that maximize its chance of successfully achieving its goals.
    Colloquially, the term "artificial intelligence" is often used to describe machines
    that mimic "cognitive" functions that humans associate with the human mind, such as
    "learning" and "problem solving". As machines become increasingly capable, tasks
    considered to require "intelligence" are often removed from the definition of AI,
    a phenomenon known as the AI effect. A quip in Tesler's Theorem says "AI is whatever
    hasn't been done yet." For instance, optical character recognition is frequently
    excluded from things considered to be AI, having become a routine technology.
    Modern machine learning techniques are essential to much of contemporary AI.
    These include deep learning, which uses artificial neural networks with multiple
    hidden layers to process complex patterns in data. Machine learning algorithms
    can be trained on large datasets to recognize patterns and make predictions or
    decisions without being explicitly programmed for specific tasks.
    """

    print("Original Text Length:", len(sample_text))
    print("-" * 80)

    # Extractive Summarization
    extractive = ExtractiveSummarizer()

    print("EXTRACTIVE SUMMARIZATION:")
    print("-" * 40)

    textrank_summary = extractive.textrank_summarize(sample_text, 2)
    print("TextRank Summary:")
    print(textrank_summary)
    print()

    tfidf_summary = extractive.tfidf_summarize(sample_text, 2)
    print("TF-IDF Summary:")
    print(tfidf_summary)
    print()

    freq_summary = extractive.frequency_summarize(sample_text, 2)
    print("Frequency-based Summary:")
    print(freq_summary)
    print()

    # Abstractive Summarization (only if transformers available)
    if TRANSFORMERS_AVAILABLE:
        print("ABSTRACTIVE SUMMARIZATION:")
        print("-" * 40)

        abstractive = AbstractiveSummarizer()

        try:
            bart_summary = abstractive.bart_summarize(sample_text)
            print("BART Summary:")
            print(bart_summary)
            print()
        except Exception as e:
            print(f"BART summarization failed: {e}")

        # Hybrid approach
        try:
            hybrid = HybridSummarizer()
            hybrid_summary = hybrid.hybrid_summarize(sample_text)
            print("Hybrid Summary:")
            print(hybrid_summary)
            print()
        except Exception as e:
            print(f"Hybrid summarization failed: {e}")
    else:
        print("Abstractive summarization not available (transformers not installed)")
        print()

    # Evaluation
    print("EVALUATION:")
    print("-" * 40)
    metrics = evaluate_summary(sample_text, textrank_summary)
    print("TextRank Metrics:", metrics)

if __name__ == "__main__":
    main()

Original Text Length: 1383
--------------------------------------------------------------------------------
EXTRACTIVE SUMMARIZATION:
----------------------------------------
TextRank Summary:
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to 
    the natural intelligence displayed by humans and animals As machines become increasingly capable, tasks 
    considered to require "intelligence" are often removed from the definition of AI, 
    a phenomenon known as the AI effect

TF-IDF Summary:
Leading AI textbooks define 
    the field as the study of "intelligent agents": any device that perceives its environment 
    and takes actions that maximize its chance of successfully achieving its goals A quip in Tesler's Theorem says "AI is whatever 
    hasn't been done yet." For instance, optical character recognition is frequently 
    excluded from things considered to be AI, having become a routine technology

Frequency-based Summary:
Artificial intelli

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


BART Summary:
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents" As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI.

Loading model: facebook/bart-large-cnn


Device set to use cuda:0
Your max_length is set to 100, but your input_length is only 79. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


Hybrid Summary:
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to natural intelligence displayed by humans and animals. Modern machine learning techniques are essential to much of contemporary AI. As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI.

EVALUATION:
----------------------------------------
TextRank Metrics: {'compression_ratio': 0.23282718727404195, 'word_overlap': 0.264, 'original_length': 1383, 'summary_length': 322}
