In [1]:
#!pip install PyPDF2

In [2]:
#!pip install spacy
#!python -m spacy download en_core_web_md

## load_documents:
- Input:
    - input_data: list of strings (documents) or list of file paths (strings)
    - input_type: string ('documents' or 'file_paths')
- Output: 
    - list of strings (documents)

## build_sentence_graph:
- Input: 
    - list of strings (documents)
- Output: 
    - 2D list (adjacency list representing the sentence graph)

## spectral_clustering:
- Input: 
    - 2D list (adjacency list representing the sentence graph)
- Output: 
    - list of lists, where each inner list contains spaCy sentence objects (each cluster)

## fit:
- Input: 
    - list of strings (documents)  
    
> No output (updates the self.clusters attribute)

## compress_clusters:
- Input: 
    - list of lists, where each inner list contains spaCy sentence objects (clusters)
- Output: 
    - string (final summary)

## transform:
- Output: 
    - string (final summary)  
    
> No input (uses the self.clusters attribute)

## fit_transform:
- Input: 
    - list of strings (documents)
- Output: 
    - string (final summary)

## Sentence graph module from SummPIP Impl

In [3]:
# -*- coding: utf-8 -*-
"""
Created on 2020-01-16 10:17 PM

author  : michelle
"""

import numpy as np
import gensim.downloader as api
import spacy
from nltk.corpus import wordnet as wn
from ordered_set import OrderedSet
import scipy
from scipy import *


# Pretrained word vectors fetched through gensim's downloader. SentenceGraph.build_sentence_graph
# passes them to find_most_similar_words to expand deverbal nouns with their nearest neighbours.
glove_word_vectors = api.load("glove-wiki-gigaword-100")
# spaCy pipeline shared by the SentenceGraph methods that need POS tags, named entities or tokens.
# NOTE(review): the install cell at the top downloads en_core_web_md, but en_core_web_sm is loaded
# here -- confirm the small model is installed in the target environment.
spacynlp=spacy.load("en_core_web_sm")

# Step 1: Deverbal Noun Reference
# step 1.1: get nouns for verbs in the current sentence
# Verbs considered too ambiguous to look up a noun form for; they are skipped.
verbs_to_escape = ["be", "is","am","are","was", "were", "being","been","do","did",
               "done","have","had","get","got","gotten"]

# Step 3: Discourse Markers => only for two adjacent sentences
# A sentence whose first token (lower-cased) is in this list is linked to the previous sentence.
markers=["for","so","because","since","therefore","consequently","additionally","furthermore","moreover",
         "but","however","although","despite","similarly","otherwise","whereas","while","unlike","thus",
        "instead","nevertheless","afterward","finally","subsequently","conversely","later","next","then",
         "likewise","compared","besides","further","as","also","equally","hence","accordingly","still",
        "simultaneously"]
# 39 markers

class SentenceGraph:
    """Undirected graph over sentences; an edge links two sentences judged to be related.

    ``build_sentence_graph`` connects sentences ``i < j`` when one of these holds:

    * adjacent sentences: a noun of the later sentence matches a (similar word of a)
      derivationally related form of a verb in the earlier sentence (step 1), or the
      later sentence starts with a discourse marker (step 3);
    * non-adjacent sentences: both sentences contain the same named entity (step 2);
    * any pair: the cosine similarity of the sentence embeddings exceeds ``ita`` (step 4).

    The class relies on the module-level names ``spacynlp``, ``glove_word_vectors``,
    ``verbs_to_escape`` and ``markers``.

    Parameters
    ----------
    sentences_list : list of str
        Raw sentences (before normalisation or stemming).
    w2v : mapping
        Word -> vector lookup used through ``w2v.get(word, default)``.
    use_lm : bool, default False
        When true, sentence embeddings come from ``lm_model`` instead of ``w2v``.
    lm_model, lm_tokenizer : optional
        Language model and tokenizer; only used when ``use_lm`` is true.
    ita : float, default 0.9
        Cosine-similarity threshold for step 4.
    threshold : float, default 0.65
        Similarity threshold for keeping nearest-neighbour words in step 1.
    """

    # Length of the zero vectors used for unknown words / empty sentences.
    WV_DIM = 100
    LM_DIM = 768

    def __init__(self, sentences_list, w2v, use_lm=False, lm_model=None, lm_tokenizer=None, ita=0.9, threshold=0.65):
        self.sentences_list = sentences_list
        self.length = len(sentences_list)
        self.w2v = w2v
        self.use_lm = use_lm
        self.lm_model = lm_model
        self.tokenizer = lm_tokenizer
        # threshold for step1
        self.threshold = threshold
        # threshold for step4
        self.ita = ita

    def get_nouns_for_verbs(self, string):
        """Return the derivationally related forms of every non-escaped verb in ``string``."""
        nouns_list = []
        for token in spacynlp(string):
            # skip verbs that are too ambiguous to yield a useful noun reference
            if token.pos_ == "VERB" and token.text not in verbs_to_escape:
                nouns_list.extend(self._nounify(token.text))
        return nouns_list

    def _nounify(self, verb):
        """Return the names of WordNet forms derivationally related to ``verb``.

        The forms are not filtered by part of speech. Returns an ``OrderedSet`` (first
        occurrence order, no repetition) or ``[]`` when WordNet has no base form.
        """
        base = wn.morphy(verb, wn.VERB)
        if not base:
            return []
        noun_forms = []
        for lemma in wn.lemmas(base, pos="v"):
            noun_forms.extend(form.name() for form in lemma.derivationally_related_forms())
        return OrderedSet(noun_forms)

    # step 1.2: find most similar word from word2vec
    def find_most_similar_words(self, word_vectors,nouns_list,threshold=0.65):
        """Return the de-duplicated nouns plus their neighbours scoring above ``threshold``.

        ``word_vectors`` must provide ``most_similar(positive=[word])`` returning
        ``(word, score)`` pairs; words it does not know (``KeyError``) are kept as-is.
        """
        unique_nouns = list(set(nouns_list))
        similar_nouns = set(unique_nouns)
        for noun in unique_nouns:
            try:
                neighbours = word_vectors.most_similar(positive=[noun])
            except KeyError:
                # uncommon word that is missing from the vocabulary
                continue
            similar_nouns.update(word for word, score in neighbours if score > threshold)
        return list(similar_nouns)

    def check_noun_reference(self, similar_nouns_list, subsequent_sen):
        """Return True if a noun token of ``subsequent_sen`` is in ``similar_nouns_list``."""
        candidates = set(similar_nouns_list)
        if not candidates:
            # nothing can match, so the sentence does not need to be parsed
            return False
        return any(token.pos_ == "NOUN" and token.text in candidates
                   for token in spacynlp(subsequent_sen))

    def _named_entities(self, string):
        """Return the set of ``(text, label)`` pairs of the named entities in ``string``."""
        return {(ent.text, ent.label_) for ent in spacynlp(string).ents}

    # step 2: Event/Entity Continuation
    # Str needs to be raw, i.e., use str before normalisation and stemming
    def compare_name_entity(self, str1, str2):
        """Return True if both strings contain a named entity with the same text and label."""
        return not self._named_entities(str1).isdisjoint(self._named_entities(str2))

    def check_discourse_markers(self, str1,str2):
        """Return True if the first token of ``str2`` is a discourse marker.

        ``str1`` is not inspected; it is kept so existing callers keep working.
        """
        doc2 = spacynlp(str2)
        return len(doc2) > 0 and doc2[0].text.lower() in markers

    def cos_sim(self, a, b):
        """Return the cosine similarity of the 1-D numpy arrays ``a`` and ``b``."""
        # Imported here so the submodule is loaded explicitly instead of relying on
        # `import scipy` exposing `scipy.spatial` as a side effect.
        from scipy.spatial.distance import cosine
        return 1 - cosine(a, b)

    def make_graph_undirected(self, source, target, weight):
        """Mirror every directed edge and return the edges sorted by (source, target).

        The input lists are left untouched. Returns three new parallel lists
        ``(sources, targets, weights)`` that contain each edge in both directions.
        """
        forward = list(zip(source, target, weight))
        backward = [(dst, src, w) for src, dst, w in forward]
        ordered = sorted(forward + backward, key=lambda edge: (edge[0], edge[1]))
        sorted_source = [edge[0] for edge in ordered]
        sorted_target = [edge[1] for edge in ordered]
        sorted_weight = [edge[2] for edge in ordered]
        return sorted_source, sorted_target, sorted_weight

    # Step4: calculate sentence embeddings
    def get_sentence_embeddings(self,string):
        """Embed ``string`` with the language model if ``use_lm`` is set, else with ``w2v``."""
        if self.use_lm:
            return self.get_lm_embedding(string)
        return self.get_wv_embedding(string)

    def get_wv_embedding(self, string):
        """Return the mean word vector of the lower-cased, whitespace-split ``string``.

        Unknown words count as zero vectors. Empty or whitespace-only input yields a
        zero vector. A small epsilon is added so the result is never exactly zero.
        """
        eps = 1e-10
        unknown = np.zeros((self.WV_DIM,))
        words = string.lower().split()
        if words:
            v = np.mean([self.w2v.get(w, unknown) for w in words], axis=0)
        else:
            # also covers whitespace-only strings, whose mean would otherwise be NaN
            v = unknown
        return v + eps

    def get_lm_embedding(self, string):
        """Return the token-averaged last hidden state of the language model as a 1-D array.

        Assumes ``lm_model(input_ids)[0]`` has shape (1, tokens, hidden) -- TODO confirm
        against the model that is actually passed in.
        """
        import torch  # local import: torch is only needed on the language-model path

        sent = string.lower()
        eps = 1e-10
        if len(sent) != 0:
            input_ids = torch.tensor([self.tokenizer.encode(sent)])
            with torch.no_grad():
                last_hidden_state = self.lm_model(input_ids)[0]
            hidden_state = np.asarray(last_hidden_state.tolist(), dtype=float)
            # drop the batch axis, then average over tokens -> shape (hidden,)
            v = hidden_state[0].mean(axis=0)
        else:
            v = np.zeros((self.LM_DIM,))
        return v + eps

    # step 4: compare sentence similarity
    def check_if_similar_sentences(self,sentence_emb1,sentence_emb2):
        """Return True if the cosine similarity of the two embeddings exceeds ``self.ita``."""
        return bool(self.cos_sim(sentence_emb1, sentence_emb2) > self.ita)

    def build_sentence_graph(self,):
        """Return the symmetric 0/1 adjacency matrix (numpy array, length x length).

        An empty sentence list yields an empty (0, 0) matrix.
        """
        X = np.zeros([self.length, self.length])
        if self.length == 0:
            return X

        # embed every sentence once; the first embedding fixes the vector size
        embeddings = [self.get_sentence_embeddings(sentence) for sentence in self.sentences_list]
        self.size = len(embeddings[0])
        emb_sentence_vectors = np.zeros([self.length, self.size])
        for i, emb_sen in enumerate(embeddings):
            emb_sentence_vectors[i,] = emb_sen

        # named entities are extracted at most once per sentence instead of once per pair
        entity_cache = {}

        def entities_of(index):
            if index not in entity_cache:
                entity_cache[index] = self._named_entities(self.sentences_list[index])
            return entity_cache[index]

        # iterate all sentence pairs to check if they should be connected
        for i in range(self.length):
            sen_i = self.sentences_list[i]
            for j in range(i + 1, self.length):
                sen_j = self.sentences_list[j]
                if (j - i) == 1:
                    # step1 and step3 only apply to adjacent sentences
                    nouns_list = self.get_nouns_for_verbs(sen_i)
                    # get most similar words for above nouns, including itself
                    similar_nouns_list = self.find_most_similar_words(glove_word_vectors, nouns_list, self.threshold)
                    # check for deverbal noun reference
                    flag = self.check_noun_reference(similar_nouns_list, sen_j)
                    if not flag:
                        # check for discourse markers
                        flag = self.check_discourse_markers(sen_i, sen_j)
                else:
                    # step2: shared named entity
                    flag = not entities_of(i).isdisjoint(entities_of(j))

                # step4: fall back to embedding similarity
                if not flag:
                    flag = self.check_if_similar_sentences(emb_sentence_vectors[i,], emb_sentence_vectors[j,])

                if flag:
                    X[i, j] = 1
                    X[j, i] = 1
        return X


In [4]:
import PyPDF2
import spacy

class SummPIPX:
    """Pluggable summarisation pipeline: sentence graph -> clustering -> cluster compression.

    The three stages are looked up by method name on the instance, so alternative
    implementations can be selected through the constructor arguments.

    Parameters
    ----------
    clustering_method : str
        Name of the method that turns a sentence graph into clusters.
    graph_method : str
        Name of the method that turns documents into a sentence graph.
    compression_method : str
        Name of the method that turns clusters into the summary string.
    """

    def __init__(self, clustering_method='spectral_clustering', graph_method='build_sentence_graph', compression_method='compress_clusters'):
        self.version = 'v1'
        self.clustering_method = clustering_method
        self.graph_method = graph_method
        self.compression_method = compression_method
        # filled in by fit(); None means "not fitted yet"
        self.clusters = None

    def __pdf_to_text__(self, path):
        """Return the concatenated text of every page of the PDF at ``path``."""
        pdfreader = PyPDF2.PdfReader(path)
        # `or ''` guards against pages for which no text could be extracted
        return ''.join(page.extract_text() or '' for page in pdfreader.pages)

    def load_documents(self, input_data, input_type='documents'):
        """Return the documents as a list of strings.

        Parameters
        ----------
        input_data : list of str
            The documents themselves, or paths to PDF files.
        input_type : {'documents', 'file_paths'}
            How ``input_data`` is to be interpreted.

        Raises
        ------
        ValueError
            If ``input_type`` is neither 'documents' nor 'file_paths'.
        """
        if input_type == 'documents':
            return input_data
        if input_type == 'file_paths':
            # read the paths that were passed in, not a global variable
            return [self.__pdf_to_text__(path) for path in input_data]
        raise ValueError(f"Invalid input_type: {input_type}")

    def build_sentence_graph(self, documents):
        """Default graph building method (placeholder: not implemented, returns None)."""
        pass

    def build_another_graph_method(self, documents):
        """Alternative graph building method (placeholder: not implemented, returns None)."""
        pass

    def spectral_clustering(self, sentence_graph):
        """Default clustering method (placeholder: always returns an empty list)."""
        return []

    def another_clustering_method(self, sentence_graph):
        """Alternative clustering method (placeholder: always returns an empty list)."""
        return []

    @staticmethod
    def _sentence_text(sentence):
        """Return the text of a sentence object (its ``text`` attribute) or ``str(sentence)``."""
        text = getattr(sentence, 'text', None)
        return text if isinstance(text, str) else str(sentence)

    def compress_clusters(self, clusters):
        """Pick one representative sentence per cluster and join them with spaces.

        For clusters with several sentences the representative is the one whose summed
        ``similarity`` to all sentences of the cluster (itself included) is highest;
        ties go to the earliest sentence. Empty clusters are skipped.

        Parameters
        ----------
        clusters : list of list
            Each inner list holds sentence objects providing ``similarity(other)``.

        Returns
        -------
        str
            The summary.
        """
        summary_sentences = []

        for cluster in clusters:
            if len(cluster) == 0:
                continue
            if len(cluster) == 1:
                # a lone sentence represents its cluster
                representative = cluster[0]
            else:
                # total similarity of every sentence to the whole cluster
                similarity_sums = [sum(sent1.similarity(sent2) for sent2 in cluster) for sent1 in cluster]
                representative = cluster[similarity_sums.index(max(similarity_sums))]
            # str.join needs plain strings, not sentence objects
            summary_sentences.append(self._sentence_text(representative))

        return " ".join(summary_sentences)

    def another_compression_method(self, clusters):
        """Alternative compression method (placeholder: always returns an empty string)."""
        return ''

    def fit(self, documents):
        """Build the sentence graph for ``documents`` and store its clusters in ``self.clusters``."""
        graph_method = getattr(self, self.graph_method)
        sentence_graph = graph_method(documents)

        clustering_method = getattr(self, self.clustering_method)
        self.clusters = clustering_method(sentence_graph)

    def transform(self):
        """Return the summary for the clusters computed by ``fit``.

        Raises
        ------
        AttributeError
            If ``fit`` has not been called yet.
        """
        clusters = getattr(self, 'clusters', None)
        if clusters is None:
            raise AttributeError("SummPIPX has no clusters yet; call fit() before transform()")
        compression_method = getattr(self, self.compression_method)
        return compression_method(clusters)

    def fit_transform(self, documents):
        """Run ``fit`` on ``documents`` and return the resulting summary."""
        self.fit(documents)
        return self.transform()

## Testing

In [7]:
# Read the merged corpus into one string.
text=''
with open('dataset/NeuralNetworks/mergeddoc.txt', 'r') as f:
    text = '\n'.join(f.readlines())

# Split the corpus into non-empty sentences with the shared spaCy pipeline.
sentences_list = [sent.text.strip() for sent in spacynlp(text).sents if sent.text.strip()]

# SentenceGraph looks words up with `w2v.get(word, default)`, so hand it a plain dict
# restricted to the words that actually occur in the corpus.
vocabulary = {word for sentence in sentences_list for word in sentence.lower().split()}
w2v = {word: glove_word_vectors[word] for word in vocabulary if word in glove_word_vectors}

# No language model is used here; embeddings come from the GloVe dictionary.
graph = SentenceGraph(sentences_list, w2v, use_lm=False, lm_model=None, lm_tokenizer=None)


112879

In [29]:
# PDF files that make up the test corpus (paths relative to the notebook's working directory).
files = [
    'dataset/NeuralNetworks/1460210.pdf', 
    'dataset/NeuralNetworks/Oken.pdf',
    'dataset/NeuralNetworks/week7b-neuralnetwork.pdf'
]

# End-to-end run with the default graph, clustering and compression methods:
# extract the text of each PDF, fit the pipeline, then display the summary.
summpip = SummPIPX()
documents = summpip.load_documents(files, input_type='file_paths')
summpip.fit(documents)
# Last expression of the cell, so the returned summary string is shown as the cell output.
summpip.transform()

''