<a href="https://colab.research.google.com/github/tubagokhan/SummarizationHybrid/blob/main/OptimumClusterNumber.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install scikit-learn
!pip install matplotlib
!pip install sentence_transformers

In [None]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize

import nltk
nltk.download('punkt')

import warnings
import re

In [None]:
# Step 1
from datasets import load_dataset

#dataset = load_dataset('cnn_dailymail', '3.0.0')
#corpus = dataset['train']['article'][50]

In [None]:
# Load the GovReport summarization dataset; one training report is used as the corpus.
dataset = load_dataset("ccdv/govreport-summarization")
# Index the row first and the column second: this reads a single example,
# whereas column-first indexing can load the entire 'report' column just to
# pick element 100. The selected text is the same either way.
corpus = dataset['train'][100]['report']

In [None]:
# Preprocessing method
def preprocess_corpus(text):
    """Return ``text`` stripped of special characters with whitespace collapsed.

    Every character that is not an ASCII letter, a digit, whitespace or a
    period is deleted, then each run of whitespace is replaced by a single
    space. Leading and trailing whitespace is collapsed, not trimmed.
    """
    without_specials = re.compile(r"[^a-zA-Z0-9\s.]").sub("", text)
    return re.compile(r"\s+").sub(" ", without_specials)

In [None]:
# Step 2
# Load the sentence encoder used to embed the report.
model = SentenceTransformer('all-mpnet-base-v2')

# Clean the raw report, then split the cleaned text into sentences.
corpus = preprocess_corpus(corpus)
sentences = sent_tokenize(corpus)

# Encode the tokenized sentences; the result is what gets clustered later.
sentence_embeddings = model.encode(sentences)


In [None]:
# Step 3
def find_optimum_clusters(data, max_clusters):
    """Estimate the best KMeans cluster count for ``data`` with the elbow method.

    KMeans is fitted for every k in ``1..max_clusters`` and the inertia of each
    fit is plotted as an elbow curve (displayed with ``plt.show()``) so the
    result can also be checked visually. The elbow is then located numerically
    as the k at which the inertia curve bends most sharply.

    Args:
        data: Samples to cluster, in any form ``KMeans.fit`` accepts.
        max_clusters: Largest number of clusters to try; must be at least 1.

    Returns:
        The selected number of clusters as an ``int``. When fewer than three
        candidates are tried there is no curvature to measure, so
        ``max_clusters`` itself is returned.

    Raises:
        ValueError: If ``max_clusters`` is smaller than 1.
    """
    if max_clusters < 1:
        raise ValueError(
            f"max_clusters must be at least 1, got {max_clusters}"
        )

    candidate_ks = range(1, max_clusters + 1)
    # n_init is passed explicitly so the search uses the same KMeans setup as
    # the final fit in Step 4 and does not depend on the version-specific default.
    inertias = [
        KMeans(n_clusters=k, random_state=0, n_init='auto').fit(data).inertia_
        for k in candidate_ks
    ]

    # Plotting the elbow curve
    plt.plot(candidate_ks, inertias)
    plt.xlabel("Number of Clusters")
    plt.ylabel("Inertia")
    plt.title("Elbow Curve")
    plt.show()

    # A second difference needs at least three points.
    if len(inertias) < 3:
        return max_clusters

    # Inertia falls as k grows; at the elbow the drop flattens abruptly, which
    # shows up as the LARGEST second difference (not the smallest).
    acceleration = np.diff(inertias, n=2)
    # acceleration[i] is centred on inertias[i + 1], i.e. on k = i + 2.
    return int(np.argmax(acceleration)) + 2

# Try every cluster count from 1 up to a third of the sentence count,
# silencing any warnings raised while the candidate models are fitted.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    optimum_clusters = find_optimum_clusters(
        sentence_embeddings,
        len(sentences) // 3,
    )
print("Optimum cluster number:", optimum_clusters)

In [None]:
# Step 4
# Cluster the sentence embeddings into the number of groups found in Step 3.
kmeans = KMeans(n_clusters=optimum_clusters, random_state=0, n_init='auto').fit(sentence_embeddings)

# Step 5
# Pick one sentence at random from each cluster.
chosen_sentences = []
for cluster_id in range(optimum_clusters):
    cluster_indices = np.flatnonzero(kmeans.labels_ == cluster_id)
    chosen_sentence_index = np.random.choice(cluster_indices)
    # The labels are positions in `sentences` (the list that was embedded), so
    # the lookup must use that same list. Splitting the corpus on ". " gives a
    # different segmentation and would return the wrong sentence or overflow.
    chosen_sentences.append(sentences[chosen_sentence_index])

# Step 6
# Tokenized sentences keep their own terminating period, so a plain space is
# the right separator (". " would double the punctuation).
summary = " ".join(chosen_sentences)
print("Summary:")
print(summary)


In [None]:
# Number of sentences the tokenizer produced from the preprocessed corpus.
print(len(sentences))

In [None]:
# Number of sentences selected for the summary (one is appended per cluster).
print(len(chosen_sentences))