In [1]:
!pip install nltk
!pip install transformers
!pip install torch
!pip install scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import nltk
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
nltk.download('brown')
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
import numpy as np


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [5]:
def generate_sentence_embeddings(sentence):
    inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    inputs = {key: value.to(device) for key, value in inputs.items()}
    outputs = model(**inputs)
    attention_mask = inputs['attention_mask']
    embeddings = (outputs.last_hidden_state * attention_mask.unsqueeze(-1)).sum(dim=1) / attention_mask.sum(dim=1).unsqueeze(-1)
    return embeddings.cpu().detach().numpy().flatten()


In [6]:
def lsa_summarizer(text, num_sentences=3):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(text.split('. '))
    svd = TruncatedSVD(n_components=num_sentences)
    lsa = svd.fit_transform(X)
    top_sentences = np.argsort(-lsa.sum(axis=1))[:num_sentences]
    summary = '. '.join([text.split('. ')[i] for i in top_sentences])
    return summary


In [7]:
class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, input_dim),
            nn.ReLU()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

input_dim = 768  # Dimension of BERT embeddings
hidden_dim = 256  # Dimension of the hidden layer
autoencoder = Autoencoder(input_dim, hidden_dim).to(device)


In [8]:
from torch.utils.data import Dataset, DataLoader, random_split
from nltk.corpus import brown
class TextSummarizationDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        embeddings = generate_sentence_embeddings(text)
        return embeddings

# Extract sentences from the Brown Corpus
texts = brown.sents(categories='news')[:1000]  # Using 1000 sentences for demonstration
texts = [' '.join(sent) for sent in texts]

total_len = len(texts)
train_len = int(0.75 * total_len)  # 75% for training
val_len = total_len - train_len    # 25% for validation

full_dataset = TextSummarizationDataset(texts)
train_dataset, val_dataset = random_split(full_dataset, [train_len, val_len])
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False)


In [9]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.001)
epochs = 5  # Reduced from 20 to 5


In [10]:
for epoch in range(epochs):
    for batch in train_loader:
        embeddings = batch.squeeze().to(device)
        optimizer.zero_grad()
        reconstructed = autoencoder(embeddings)
        loss = criterion(reconstructed, embeddings)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')


Epoch 1/5, Loss: 0.08213607221841812
Epoch 2/5, Loss: 0.08195643126964569
Epoch 3/5, Loss: 0.0772591084241867
Epoch 4/5, Loss: 0.06934292614459991
Epoch 5/5, Loss: 0.07209854573011398


In [11]:
class BERTEmbeddingTransformer:
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([generate_sentence_embeddings(text) for text in X])

class LSASummarizerTransformer:
    def __init__(self, num_sentences=3):
        self.num_sentences = num_sentences

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.array([generate_sentence_embeddings(lsa_summarizer(text, self.num_sentences)) for text in X])

classifier_pipeline = Pipeline([
    ('features', FeatureUnion([
        ('bert', BERTEmbeddingTransformer()),
        ('lsa', LSASummarizerTransformer(num_sentences=3))
    ])),
    ('clf', LogisticRegression())
])


In [12]:
# Extract articles and their categories
categories = brown.categories()
texts = []
labels = []

for category in categories:
    for fileid in brown.fileids(categories=category):
        texts.append(' '.join(brown.words(fileid)))
        labels.append(category)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.25, random_state=42)


In [13]:
classifier_pipeline.fit(X_train, y_train)
y_pred = classifier_pipeline.predict(X_val)
accuracy = classifier_pipeline.score(X_val, y_val)
print(f'Validation Accuracy: {accuracy}')
print(classification_report(y_val, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Validation Accuracy: 0.496
                 precision    recall  f1-score   support

      adventure       0.58      0.78      0.67         9
 belles_lettres       0.50      0.42      0.45        24
      editorial       0.20      0.33      0.25         3
        fiction       0.27      0.43      0.33         7
     government       1.00      0.33      0.50         6
        hobbies       0.29      0.33      0.31         6
          humor       0.00      0.00      0.00         2
        learned       0.43      0.64      0.51        14
           lore       0.67      0.32      0.43        19
        mystery       0.14      0.20      0.17         5
           news       0.56      1.00      0.71        10
       religion       1.00      0.25      0.40         4
        reviews       0.75      0.60      0.67         5
        romance       0.57      0.50      0.53         8
science_fiction       0.00      0.00      0.00         3

       accuracy                           0.47       125
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
def segment_text(text):
    from nltk.tokenize import sent_tokenize
    sentences = sent_tokenize(text)
    return sentences


In [15]:
def compute_similarity_matrix(embeddings):
    from sklearn.metrics.pairwise import cosine_similarity
    similarity_matrix = cosine_similarity(embeddings)
    return similarity_matrix




In [16]:
def rank_units(similarity_matrix, units, num_units=3):
    unit_ranks = np.argsort(-similarity_matrix.sum(axis=1))[:num_units]
    ranked_units = [units[i] for i in unit_ranks]
    return ranked_units


In [17]:
def evaluate_model_performance(data_loader, model, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            embeddings = batch.squeeze().to(device)
            reconstructed = model(embeddings)
            loss = criterion(reconstructed, embeddings)
            total_loss += loss.item()
    avg_loss = total_loss / len(data_loader)
    return avg_loss

avg_loss = evaluate_model_performance(val_loader, autoencoder, criterion)
print("Validation Loss:", avg_loss)


Validation Loss: 0.07437726429530553


In [19]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [24]:
def process_input_and_generate_summary():
    text = input("Enter the text to summarize: ")
    sentences = segment_text(text)
    embeddings = np.array([generate_sentence_embeddings(sent) for sent in sentences])
    similarity_matrix = compute_similarity_matrix(embeddings)
    ranked_units = rank_units(similarity_matrix, sentences)
    summary = '. '.join(ranked_units)

    bert_embeddings = generate_sentence_embeddings(summary)

    print("Summary:\n", summary)

process_input_and_generate_summary()


Enter the text to summarize: Building Linear Optimization Models:  Identify decision variables (e.g., number of products to produce).  Define the objective function (e.g., maximize profit).  Set constraints (e.g., resource limits).  Translate objectives and constraints into mathematical expressions.  Implement in a spreadsheet and use Excel Solver to find the optimal solution.  Analyze results, looking at optimal values, constraints, and slack values.  Solving Linear Optimization Models:  Identify feasible solutions that meet all constraints.  Find the optimal solution within the feasible region that provides the best value for the objective function.  Graphical Interpretation:  Visually understand linear optimization for problems with two decision variables.  Plot constraints on a graph to identify the feasible region.  Find the optimal solution at a corner point of the feasible region.  Using Optimization Models for Insights:  Explore different scenarios and understand the impact of 