In [1]:
#!pip install PyPDF2

In [2]:
#!pip install spacy
#!python -m spacy download en_core_web_md

## load_documents:
- Input:
    - input_data: list of strings (documents) or list of file paths (strings)
    - input_type: string ('documents' or 'file_paths')
- Output: 
    - list of strings (documents)

## build_sentence_graph:
- Input: 
    - list of strings (documents)
- Output: 
    - 2D list (adjacency list representing the sentence graph)

## spectral_clustering:
- Input: 
    - 2D list (adjacency list representing the sentence graph)
- Output: 
    - list of lists, where each inner list contains spaCy sentence objects (each cluster)

## fit:
- Input: 
    - list of strings (documents)  
    
> No output (updates the self.clusters attribute)

## compress_clusters:
- Input: 
    - list of lists, where each inner list contains spaCy sentence objects (clusters)
- Output: 
    - string (final summary)

## transform:
- Output: 
    - string (final summary)  
    
> No input (uses the self.clusters attribute)

## fit_transform:
- Input: 
    - list of strings (documents)
- Output: 
    - string (final summary)

In [27]:
import PyPDF2
import spacy

class SummPIPX:
    """Pluggable multi-document summarization pipeline.

    The pipeline runs in three stages, each selected by *method name* so that
    alternative implementations can be swapped in via the constructor:

    1. graph building   (``graph_method``)       documents      -> sentence graph
    2. clustering       (``clustering_method``)  sentence graph -> clusters
    3. compression      (``compression_method``) clusters       -> summary string

    The graph-building and clustering methods shipped here are placeholders
    (they return ``None`` / ``[]``); only ``compress_clusters`` has a real
    implementation.
    """

    def __init__(self, clustering_method='spectral_clustering', graph_method='build_sentence_graph', compression_method='compress_clusters'):
        """Store the names of the methods to use for each pipeline stage.

        Args:
            clustering_method: Name of the method on this class used by ``fit``
                to turn the sentence graph into clusters.
            graph_method: Name of the method on this class used by ``fit`` to
                build the sentence graph from the documents.
            compression_method: Name of the method on this class used by
                ``transform`` to turn clusters into the summary.
        """
        self.version = 'v1'
        self.clustering_method = clustering_method
        self.graph_method = graph_method
        self.compression_method = compression_method
        # Set by fit(); None means the pipeline has not been fitted yet.
        self.clusters = None

    def __pdf_to_text__(self, path):
        """Return the concatenated text of every page of the PDF at ``path``."""
        pdfreader = PyPDF2.PdfReader(path)
        # `or ''` guards against a page that yields no text, which would
        # otherwise break the concatenation.
        return ''.join(page.extract_text() or '' for page in pdfreader.pages)

    def load_documents(self, input_data, input_type='documents'):
        """Normalize the input into a list of document strings.

        Args:
            input_data: Either the documents themselves or a list of PDF file
                paths, depending on ``input_type``.
            input_type: ``'documents'`` to return ``input_data`` unchanged, or
                ``'file_paths'`` to extract the text of each PDF path.

        Returns:
            list of str: one string per document.

        Raises:
            ValueError: if ``input_type`` is not one of the two accepted values.
        """
        if input_type == 'documents':
            return input_data
        if input_type == 'file_paths':
            # Iterate the argument itself, not a notebook-level global, so the
            # method works for any caller.
            return [self.__pdf_to_text__(path) for path in input_data]
        raise ValueError(f"Invalid input_type: {input_type}")

    def build_sentence_graph(self, documents):
        """Default graph building method (placeholder: not implemented, returns None)."""
        return None

    def build_another_graph_method(self, documents):
        """Alternative graph building method (placeholder: not implemented, returns None)."""
        return None

    def spectral_clustering(self, sentence_graph):
        """Default clustering method (placeholder: always returns an empty list)."""
        return []

    def another_clustering_method(self, sentence_graph):
        """Alternative clustering method (placeholder: always returns an empty list)."""
        return []

    @staticmethod
    def _sentence_text(sentence):
        """Return the plain text of a sentence.

        Strings pass through unchanged; other objects contribute their
        ``.text`` attribute when it is a string, else ``str(sentence)``.
        """
        if isinstance(sentence, str):
            return sentence
        text = getattr(sentence, 'text', None)
        return text if isinstance(text, str) else str(sentence)

    def compress_clusters(self, clusters):
        """Pick one representative sentence per cluster and join them.

        For a cluster with several sentences the representative is the one
        whose summed ``similarity`` to all sentences in the cluster is highest
        (the first such sentence on ties). Single-sentence clusters contribute
        their only sentence; empty clusters contribute nothing.

        Args:
            clusters: Iterable of clusters, each a sequence of sentence objects
                exposing ``similarity(other)``. Single-sentence clusters may
                also hold plain strings.

        Returns:
            str: the selected sentences' text joined by single spaces
            (``''`` when there is nothing to select).
        """
        summary_sentences = []

        for cluster in clusters:
            if len(cluster) == 0:
                # Nothing to pick from; avoids max() on an empty sequence.
                continue
            if len(cluster) == 1:
                chosen = cluster[0]
            else:
                # Total similarity of each sentence to every sentence in the
                # cluster (itself included, as a constant offset per sentence).
                similarity_sums = [
                    sum(sent1.similarity(sent2) for sent2 in cluster)
                    for sent1 in cluster
                ]
                most_relevant_index = similarity_sums.index(max(similarity_sums))
                chosen = cluster[most_relevant_index]
            # str.join needs strings, so convert sentence objects to text.
            summary_sentences.append(self._sentence_text(chosen))

        return " ".join(summary_sentences)

    def another_compression_method(self, clusters):
        """Alternative compression method (placeholder: always returns '')."""
        return ''

    def fit(self, documents):
        """Build the sentence graph and cluster it, storing the result in ``self.clusters``.

        The graph and clustering implementations are looked up by the names
        given to the constructor; an unknown name raises ``AttributeError``.

        Args:
            documents: list of document strings.
        """
        graph_method = getattr(self, self.graph_method)
        sentence_graph = graph_method(documents)

        clustering_method = getattr(self, self.clustering_method)
        self.clusters = clustering_method(sentence_graph)

    def transform(self):
        """Compress the clusters computed by ``fit`` into the final summary.

        Returns:
            str: the summary produced by the configured compression method.

        Raises:
            AttributeError: if ``fit`` has not been called yet.
        """
        clusters = getattr(self, 'clusters', None)
        if clusters is None:
            # Same exception type as the bare attribute access would raise,
            # but with an actionable message.
            raise AttributeError("SummPIPX has no clusters yet: call fit() before transform()")
        compression_method = getattr(self, self.compression_method)
        return compression_method(clusters)

    def fit_transform(self, documents):
        """Run ``fit`` on ``documents`` and return the result of ``transform``.

        Args:
            documents: list of document strings.

        Returns:
            str: the final summary.
        """
        self.fit(documents)
        return self.transform()

## Testing

In [29]:
# Smoke test: run the pipeline end to end on three sample PDFs.
# The paths are relative to the notebook's working directory.
files = [
    'dataset/NeuralNetworks/1460210.pdf', 
    'dataset/NeuralNetworks/Oken.pdf',
    'dataset/NeuralNetworks/week7b-neuralnetwork.pdf'
]

# Default constructor arguments select build_sentence_graph,
# spectral_clustering and compress_clusters as the pipeline stages.
summpip = SummPIPX()
# Extract the text of each PDF into one string per document.
documents = summpip.load_documents(files, input_type='file_paths')
# Build the sentence graph and cluster it (stores the result on summpip.clusters).
summpip.fit(documents)
# transform() returns the summary string; as the last expression of the cell
# it is shown as the cell output.
summpip.transform()

''