In [1]:
!pip install nltk
!pip install transformers
!pip install torch
!pip install scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import nltk
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
nltk.download('brown')
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
import numpy as np


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# Load the pretrained 'bert-base-uncased' tokenizer and encoder; the encoder is
# moved to `device` so it matches the inputs the embedding helper sends there.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)


In [None]:
def generate_sentence_embeddings(sentence):
    """Embed a sentence as the attention-masked mean of BERT's last hidden states.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        sentence: Text to embed. It is truncated/padded to 512 tokens.

    Returns:
        A flattened NumPy array holding the pooled hidden state.
    """
    inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    inputs = {key: value.to(device) for key, value in inputs.items()}
    # Inference only: without no_grad() every call records an autograd graph,
    # which costs memory and time for a result that is immediately detached.
    with torch.no_grad():
        outputs = model(**inputs)
    # Zero out padded positions, then divide by the number of real tokens.
    mask = inputs['attention_mask'].unsqueeze(-1)
    embeddings = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)
    return embeddings.cpu().numpy().flatten()


In [None]:
def lsa_summarizer(text, num_sentences=10):
    """Build an extractive summary of ``text`` with latent semantic analysis.

    The text is split on ``'. '``, each piece is TF-IDF vectorised, and the
    matrix is reduced with truncated SVD. Sentences are scored by the length of
    their vector in the reduced space and the best ones are returned.

    Args:
        text: Raw input text.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined with ``'. '``, highest score first.
        Blank input yields an empty string.
    """
    if not text.strip():
        return ''

    # Split once and reuse the list for both vectorising and output.
    sentences = text.split('. ')
    tfidf = TfidfVectorizer().fit_transform(sentences)

    # TruncatedSVD rejects n_components larger than the number of features, so a
    # fixed value of 10 fails on short inputs; cap it by the matrix dimensions.
    n_components = max(1, min(num_sentences, tfidf.shape[0], tfidf.shape[1]))
    projected = TruncatedSVD(n_components=n_components).fit_transform(tfidf)

    # Score by vector norm: summing signed coordinates lets components with
    # opposite signs cancel, and the sign of an SVD component is arbitrary.
    scores = np.linalg.norm(projected, axis=1)
    top_sentences = np.argsort(-scores)[:num_sentences]
    return '. '.join(sentences[i] for i in top_sentences)


In [None]:
class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder for fixed-size embedding vectors.

    Args:
        input_dim: Size of the vectors to reconstruct.
        hidden_dim: Size of the bottleneck representation.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU()
        )
        # The decoder output is left linear. A trailing ReLU would clamp every
        # reconstructed value to >= 0, so negative components of the
        # real-valued targets could never be reproduced.
        # Kept as a Sequential so parameter names stay `decoder.0.*`.
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim)
        )

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.decoder(self.encoder(x))

# Build the autoencoder that is trained below and place it on `device`.
input_dim = 768  # Dimension of BERT embeddings
hidden_dim = 256  # Dimension of the hidden layer
autoencoder = Autoencoder(input_dim, hidden_dim).to(device)


In [None]:
from torch.utils.data import Dataset, DataLoader, random_split
from nltk.corpus import brown
class TextSummarizationDataset(Dataset):
    """Dataset that serves one sentence embedding per stored text.

    Embeddings come from the notebook-level ``generate_sentence_embeddings``
    and are memoised per index: the first access pays for the encoder forward
    pass, later accesses (e.g. in subsequent epochs) reuse the stored array.
    The cache assumes ``texts`` is not modified after construction.
    """

    def __init__(self, texts):
        self.texts = texts
        # idx -> embedding, filled lazily by __getitem__.
        self._embedding_cache = {}

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        cached = self._embedding_cache.get(idx)
        if cached is None:
            cached = generate_sentence_embeddings(self.texts[idx])
            self._embedding_cache[idx] = cached
        return cached

# Extract sentences from the Brown Corpus
texts = brown.sents(categories='news')[:1000]  # Using 1000 sentences for demonstration
texts = [' '.join(sent) for sent in texts]

total_len = len(texts)
train_len = int(0.75 * total_len)  # 75% for training
val_len = total_len - train_len    # 25% for validation

full_dataset = TextSummarizationDataset(texts)
# Seed the split so the same sentences land in train/validation on every run;
# otherwise the reported validation loss is not comparable between runs.
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_len, val_len],
    generator=torch.Generator().manual_seed(42),
)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)


In [None]:
# Reconstruction objective: mean squared error between the autoencoder's output and its input.
criterion = nn.MSELoss()
# Adam over the autoencoder's parameters only.
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.001)
epochs = 5  # Reduced from 20 to 5


In [None]:
# Train the autoencoder to reconstruct the sentence embeddings.
autoencoder.train()  # the evaluation helper switches to eval mode; restore training mode on re-runs
for epoch in range(epochs):
    running_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        # Batches are used as collated; squeeze() would also drop the batch
        # axis of a final batch that holds a single sample.
        embeddings = batch.to(device)
        optimizer.zero_grad()
        reconstructed = autoencoder(embeddings)
        loss = criterion(reconstructed, embeddings)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    # Report the mean over the epoch rather than the last mini-batch, which is
    # noisy and says little about the epoch as a whole.
    epoch_loss = running_loss / max(num_batches, 1)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}')


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

class BERTEmbeddingTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that maps texts to mean-pooled BERT embeddings.

    Args:
        model_name: Hugging Face model identifier to load.
        max_length: Token length every input is truncated/padded to.
    """
    def __init__(self, model_name='bert-base-uncased', max_length=512):
        self.model_name = model_name
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def fit(self, X, y=None):
        """No-op: the pretrained encoder is used as-is."""
        return self

    def transform(self, X):
        """Generate one BERT embedding per input text.

        Args:
            X: Iterable of strings.

        Returns:
            NumPy array with one row per text.
        """
        embeddings = []
        for text in X:
            inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length')
            inputs = {key: value.to(self.device) for key, value in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Average only over real tokens. Inputs are padded to max_length, so
            # a plain mean over the sequence axis would mix in the hidden states
            # of the padding positions and depend on how short the text is.
            mask = inputs['attention_mask'].unsqueeze(-1)
            pooled = (outputs.last_hidden_state * mask).sum(dim=1) / mask.sum(dim=1)
            embeddings.append(pooled.cpu().numpy().flatten())
        return np.array(embeddings)

class LSASummarizerTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer producing LSA features (TF-IDF + truncated SVD).

    Despite the name, ``transform`` returns dense feature vectors, not text
    summaries.

    Args:
        num_sentences: Stored on the instance; not used by ``fit``/``transform``.
        n_components: Requested number of LSA dimensions (default 100, the
            value that was previously hard-coded).
    """
    def __init__(self, num_sentences=10, n_components=100):
        self.num_sentences = num_sentences
        self.n_components = n_components
        self.vectorizer = TfidfVectorizer()
        self.lsa = TruncatedSVD(n_components=n_components)

    def fit(self, X, y=None):
        """Fit the TF-IDF vocabulary and the SVD projection on the input texts."""
        tfidf_matrix = self.vectorizer.fit_transform(X)
        # TruncatedSVD rejects n_components larger than the vocabulary size, so
        # small corpora would fail; cap the requested value to what the data
        # supports. The output then has fewer columns than requested.
        usable_components = max(1, min(self.n_components, tfidf_matrix.shape[1]))
        self.lsa.set_params(n_components=usable_components)
        self.lsa.fit(tfidf_matrix)
        return self

    def transform(self, X):
        """Project the input texts into the fitted LSA space.

        Returns:
            Array with one row of LSA features per input text.
        """
        tfidf_matrix = self.vectorizer.transform(X)
        lsa_features = self.lsa.transform(tfidf_matrix)
        return lsa_features

In [None]:
# Collect every Brown Corpus file as one whitespace-joined document, paired
# with the category it was listed under.
categories = brown.categories()
labelled_documents = [
    (' '.join(brown.words(fileid)), category)
    for category in categories
    for fileid in brown.fileids(categories=category)
]
texts = [document for document, _ in labelled_documents]
labels = [label for _, label in labelled_documents]

# Hold out a quarter of the documents for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.25, random_state=42)


In [None]:
def segment_text(text):
    """Split ``text`` into a list of sentences using NLTK's ``sent_tokenize``."""
    # Local import keeps this cell runnable on its own.
    from nltk.tokenize import sent_tokenize
    return sent_tokenize(text)


In [None]:
def compute_similarity_matrix(embeddings):
    """Return the pairwise cosine-similarity matrix of the rows of ``embeddings``."""
    from sklearn.metrics.pairwise import cosine_similarity
    return cosine_similarity(embeddings)




In [None]:
def rank_units(similarity_matrix, units, num_units=10):
    """Pick the units whose similarity rows have the largest totals.

    Args:
        similarity_matrix: Square array; row ``i`` holds the similarities of
            ``units[i]`` to every unit.
        units: Sequence of items aligned with the matrix rows.
        num_units: Maximum number of units to return.

    Returns:
        List of up to ``num_units`` units, highest row total first.
    """
    row_totals = similarity_matrix.sum(axis=1)
    best_first = (-row_totals).argsort()
    return [units[position] for position in best_first[:num_units]]


In [None]:
def evaluate_model_performance(data_loader, model, criterion):
    """Compute the mean per-batch reconstruction loss of ``model``.

    The model is switched to eval mode and left there.

    Args:
        data_loader: Iterable of input batches (tensors).
        model: Module whose output is compared against its own input.
        criterion: Callable ``criterion(reconstruction, target)`` returning a
            scalar tensor.

    Returns:
        Average of the per-batch losses as a float, or ``nan`` when
        ``data_loader`` yields no batches.
    """
    model.eval()
    # Follow the model's own device instead of a notebook-level global, so the
    # helper still works if the model and that global ever disagree.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for batch in data_loader:
            # No squeeze(): it would also remove the batch axis of a size-1 batch.
            inputs = batch.to(target_device)
            total_loss += criterion(model(inputs), inputs).item()
            num_batches += 1

    if num_batches == 0:
        # Nothing to average; avoid a ZeroDivisionError.
        return float('nan')
    return total_loss / num_batches

# Average reconstruction loss of the autoencoder over the validation loader.
avg_loss = evaluate_model_performance(val_loader, autoencoder, criterion)
print("Validation Loss:", avg_loss)


In [None]:
import nltk
nltk.download('punkt_tab')

In [None]:
def process_input_and_generate_summary(text=None):
    """Summarise a text with the BERT-similarity and LSA summarizers and print both.

    Args:
        text: Text to summarise. When omitted, the user is prompted for it,
            which keeps the original interactive behaviour; passing it
            explicitly makes the cell reproducible.

    Returns:
        None. The summaries are printed.
    """
    # Step 1: Get input text
    if text is None:
        text = input("Enter the text to summarize: ")

    # Step 2: Segment the text into sentences
    sentences = segment_text(text)
    if not sentences:
        # Without sentences the embedding array is empty and the similarity
        # computation fails; stop early with a readable message.
        print("No sentences found in the input text.")
        return

    # Step 3: Generate BERT embeddings for each sentence
    bert_embeddings = np.array([generate_sentence_embeddings(sent) for sent in sentences])

    # Step 4: Compute similarity matrix using BERT embeddings
    similarity_matrix = compute_similarity_matrix(bert_embeddings)

    # Step 5: Rank sentences based on similarity scores (BERT)
    ranked_sentences = rank_units(similarity_matrix, sentences)

    # Step 6: Generate BERT summary
    # Joined with a space: sentences from a sentence tokenizer keep their own
    # terminal punctuation, so joining with '. ' would yield "first.. second.".
    bert_summary = ' '.join(ranked_sentences)

    # Step 7: Generate LSA summary
    lsa_summary = lsa_summarizer(text, num_sentences=10)
    combined_summary = "BERT Summary:\n" + bert_summary + "\n\nLSA Summary:\n" + lsa_summary

    # Step 8: Print the combined summary
    print("Combined Summary (BERT + LSA):\n", combined_summary)
    # Step 9: Print BERT, LSA, and combined summaries
    print("BERT Summary:\n", bert_summary)
    print("\nLSA Summary:\n", lsa_summary)

# Example usage
process_input_and_generate_summary()

In [None]:
import numpy as np

def segment_text(text):
    """Split ``text`` on '. ' and return the non-blank, whitespace-trimmed pieces."""
    pieces = []
    for chunk in text.split('. '):
        trimmed = chunk.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces

def generate_sentence_embeddings(sentence):
    """Generate a sentence embedding with the notebook's pre-trained BERT model.

    Replaces the random-vector placeholder, which made every similarity score
    (and therefore the sentence ranking) noise. Uses the notebook-level
    ``tokenizer`` and ``model``.

    Args:
        sentence: Text to embed; truncated to 512 tokens.

    Returns:
        1-D NumPy array: the mean of the last hidden states over the real
        (non-padding) tokens.
    """
    # Follow whatever device the encoder currently lives on, since `model` is
    # rebound in more than one cell of this notebook.
    model_device = next(model.parameters()).device
    encoded = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True)
    encoded = {name: tensor.to(model_device) for name, tensor in encoded.items()}
    with torch.no_grad():  # inference only
        hidden = model(**encoded).last_hidden_state
    # Attention-masked mean pooling over the token axis.
    mask = encoded['attention_mask'].unsqueeze(-1)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
    return pooled.squeeze(0).cpu().numpy()

def compute_similarity_matrix(embeddings):
    """Return the cosine similarity between every pair of rows in ``embeddings``."""
    from sklearn.metrics.pairwise import cosine_similarity
    pairwise_similarity = cosine_similarity(embeddings)
    return pairwise_similarity

def rank_units(similarity_matrix, sentences, num_units=None):
    """Rank sentences by their mean similarity to all sentences.

    Args:
        similarity_matrix: Square array; row ``i`` holds the similarities of
            ``sentences[i]``.
        sentences: Sequence of sentences aligned with the matrix rows.
        num_units: Optional cap on how many sentences to return, mirroring the
            ``num_units`` argument of the earlier ``rank_units`` definition in
            this notebook. ``None`` (default) returns every sentence.

    Returns:
        List of sentences, highest mean similarity first.
    """
    scores = np.mean(similarity_matrix, axis=1)
    ranked_indices = np.argsort(scores)[::-1]  # Sort in descending order
    if num_units is not None:
        ranked_indices = ranked_indices[:num_units]
    return [sentences[i] for i in ranked_indices]

def lsa_summarizer(text, num_sentences=10):
    """Generate an LSA-based extractive summary.

    Replaces the placeholder that simply returned the leading sentences.
    Sentences are TF-IDF vectorised, projected with truncated SVD, and scored
    by the norm of their projected vector.

    Args:
        text: Raw input text; segmented with ``segment_text``.
        num_sentences: Maximum number of sentences to return.

    Returns:
        List of the selected sentences in their original document order. When
        the text has no more than ``num_sentences`` sentences, all of them are
        returned unchanged.
    """
    sentences = segment_text(text)
    if len(sentences) <= num_sentences:
        # Nothing to drop, so no ranking is needed.
        return sentences

    try:
        tfidf = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # Raised when no sentence yields a usable token (empty vocabulary);
        # fall back to the leading sentences.
        return sentences[:num_sentences]

    # TruncatedSVD rejects n_components larger than the number of features.
    n_components = max(1, min(num_sentences, tfidf.shape[1]))
    projected = TruncatedSVD(n_components=n_components).fit_transform(tfidf)

    # Vector norm avoids cancellation between components of opposite sign.
    scores = np.linalg.norm(projected, axis=1)
    keep = sorted(np.argsort(-scores)[:num_sentences])
    return [sentences[i] for i in keep]

def combine_summaries(bert_sentences, lsa_sentences, max_sentences=10):
    """Merge two sentence lists into one summary string.

    BERT sentences come first, followed by LSA sentences not already present.
    At most ``max_sentences`` sentences are kept and joined with '. '.
    """
    # dict keys keep first-seen order, so this de-duplicates without reordering.
    unique_in_order = list(dict.fromkeys(bert_sentences + lsa_sentences))
    return '. '.join(unique_in_order[:max_sentences])

def process_input_and_generate_summary(text=None, num_sentences=10):
    """Process input text and generate BERT, LSA, and combined summaries.

    Args:
        text: Text to summarise. When omitted the user is prompted for it
            (the original interactive behaviour); passing it explicitly makes
            the run reproducible.
        num_sentences: Maximum number of sentences in each summary.

    Returns:
        None. The three summaries are printed; any error is reported as a
        printed message instead of being raised.
    """
    try:
        # Step 1: Get input text
        if text is None:
            text = input("Enter the text to summarize: ")

        # Step 2: Segment the text into sentences
        sentences = segment_text(text)
        if not sentences:
            raise ValueError("No valid sentences found in the input text.")

        # Step 3: Generate BERT embeddings for each sentence
        bert_embeddings = np.array([generate_sentence_embeddings(sent) for sent in sentences])

        # Step 4: Compute similarity matrix using BERT embeddings
        similarity_matrix = compute_similarity_matrix(bert_embeddings)

        # Step 5: Rank sentences based on similarity scores (BERT)
        # Keep only the top-ranked sentences: the ranking returns every
        # sentence, so without the slice the "summary" is the whole text
        # reordered.
        bert_ranked_sentences = rank_units(similarity_matrix, sentences)[:num_sentences]

        # Step 6: Generate BERT summary
        bert_summary = '. '.join(bert_ranked_sentences)

        # Step 7: Generate LSA summary
        lsa_ranked_sentences = lsa_summarizer(text, num_sentences=num_sentences)
        lsa_summary = '. '.join(lsa_ranked_sentences)

        # Step 8: Combine BERT and LSA summaries
        combined_summary = combine_summaries(
            bert_ranked_sentences, lsa_ranked_sentences, max_sentences=num_sentences
        )

        # Step 9: Print the summaries
        print("\nBERT Summary:\n", bert_summary)
        print("\nLSA Summary:\n", lsa_summary)
        print("\nCombined Summary (BERT + LSA):\n", combined_summary)

    except Exception as e:
        # Best-effort demo cell: report the problem instead of a traceback.
        print(f"An error occurred: {e}")

# Example usage
process_input_and_generate_summary()

In [None]:
# Re-create the device, tokenizer and encoder. These repeat the setup statements
# near the top of the notebook and rebind the same global names.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

In [None]:
# Example usage
from transformers import BertModel, BertTokenizer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Define the BERT model and tokenizer
# Keep the encoder on `device`: this rebinds the notebook-wide `model`, and the
# embedding helper moves its inputs to `device`, so a CPU-only model here would
# cause a device mismatch on GPU machines.
model = BertModel.from_pretrained('bert-base-uncased').to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the Pipeline
# TF-IDF -> 100-dimensional LSA -> logistic regression. The pipeline is only
# constructed here; it still has to be fitted before it can predict.
vectorizer = TfidfVectorizer()
lsa = TruncatedSVD(n_components=100)
classifier = LogisticRegression()
pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('lsa', lsa),
    ('clf', classifier)
])


In [None]:
import joblib
import torch
from transformers import BertModel, BertTokenizer

def save_model(model, tokenizer, pipeline, model_name="bert_text_summarizer", pipeline_name="lsa_classifier.pkl", full_model_path="full_model.pth"):
    """
    Save the BERT model, tokenizer, and Pipeline object separately.

    Args:
        model: The BERT model (e.g., BertModel).
        tokenizer: The BERT tokenizer (e.g., BertTokenizer).
        pipeline: The scikit-learn Pipeline object.
        model_name: Directory to save the BERT model and tokenizer.
        pipeline_name: Filename to save the Pipeline object.
        full_model_path: Filename to save the BERT model's state dictionary.

    Missing parent directories of ``pipeline_name`` and ``full_model_path``
    are created before writing.
    """
    import os

    # joblib.dump and torch.save write to a plain file path and fail when the
    # containing directory does not exist, so create it up front.
    for file_path in (pipeline_name, full_model_path):
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save BERT model and tokenizer using Hugging Face's save_pretrained
    model.save_pretrained(model_name)
    tokenizer.save_pretrained(model_name)

    # Save the Pipeline object using joblib
    joblib.dump(pipeline, pipeline_name)

    # Save the BERT model's state dictionary using torch.save
    torch.save(model.state_dict(), full_model_path)

    print(f"BERT model and tokenizer saved in directory: {model_name}/")
    print(f"Pipeline saved as: {pipeline_name}")
    print(f"BERT model state dictionary saved as: {full_model_path}")