In [29]:
from sklearn.decomposition import LatentDirichletAllocation as LatentDocumentAllocation
def train_lda_model(document):
    lda_model = LatentDocumentAllocation(n_components=20, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
    document_topic_distribution = lda_model.fit_transform(document)

    return lda_model, document_topic_distribution

In [30]:
import numpy as np
def Kl_divergence(p, q):
    return np.sum(np.where(p != 0, p * np.log(p / q), 0))

In [61]:
from sklearn.feature_extraction import text as text
import nltk
import os
def get_tfidf_matrix(document):
    tfidf_vectorizer = text.TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(document)
    return tfidf_matrix
    
def summarize_all_documents(train, sentence_no):
    summaries = []
    for document, file_name in train:
        print("Summarizing document: ", file_name)
        try:
            document_model = get_tfidf_matrix(document)
            trained_lda_model, document_topic_distribution = train_lda_model(document_model)
            document_topic_distribution = document_topic_distribution / document_topic_distribution.sum(axis=1, keepdims=True)
            # topic_word_distribution = train_lda_model.components_ / train_lda_model.components_.sum(axis=1, keepdims=True)
            # topic_word_distribution = topic_word_distribution.transpose()
        except Exception as e:
            print("Error in summarizing document: ", file_name)
            print(e)
            continue
        cumulative_sentences = []
        picked = set()
        for _ in range(sentence_no):
            current_best_sentence = None
            current_best_score = -1
            for i in range(len(document)):
                if i in picked:
                    continue
                
                sentence_topic_distribution = document_topic_distribution[i]
                sentence_topic_distribution = sentence_topic_distribution / sentence_topic_distribution.sum()
                sentence_score = 0
                for j in range(len(cumulative_sentences)):
                    sentence_score += Kl_divergence(sentence_topic_distribution, document_topic_distribution[cumulative_sentences[j]])
                if sentence_score > current_best_score:
                    current_best_score = sentence_score
                    current_best_sentence = i
                    sentence = document[i]

            cumulative_sentences.append(current_best_sentence)
            picked.add(current_best_sentence)
        summaries.append((document, sentence))
    return summaries

def get_summary(train, sentence_no):
    summaries = summarize_all_documents(train, sentence_no)
    return summaries

def get_summary_from_file(file_name, sentence_no):
    
    with open(os.path.join(os.getcwd(), 'DUC2001/documents/', file_name), 'r') as f:
        document = f.read()
    document = document.split(' ')
    document = [document[i:i + 100] for i in range(0, len(document), 100)]
    document = [' '.join(sentence) for sentence in document]
    summaries = summarize_all_documents([(document, file_name)], sentence_no)
    return summaries

def get_summary_from_file_list(file_list, sentence_no):
    summaries = []
    for file_name in file_list:
        with open(os.path.join(file_name), 'r') as f:
        # with open(os.path.join(os.getcwd(), 'DUC2001/documents/', file_name), 'r') as f:
            document = f.read()
        document = document.split(' ')
        document = [document[i:i + 100] for i in range(0, len(document), 100)]
        document = [' '.join(sentence) for sentence in document]
        summaries += summarize_all_documents([(document, file_name)], sentence_no)
    return summaries

def get_summary_from_file_list_with_topic(file_list, sentence_no, topic):
    summaries = []
    for file_name in file_list:
        with open(file_name, 'r') as f:
            document = f.read()
        document = document.split(' ')
        document = [document[i:i + 100] for i in range(0, len(document), 100)]
        document = [' '.join(sentence) for sentence in document]
        summaries += summarize_all_documents([(document, file_name)], sentence_no)
    return summaries




In [47]:
import os
file_list = os.listdir('./DUC2001/documents/')
summaries = get_summary_from_file_list(file_list, 5)
for document, summary in summaries:
    print(summary)



d:\2022\Fall\DMT\Week13/DUC2001/AP830325-0143
Summarizing document:  AP830325-0143
d:\2022\Fall\DMT\Week13/DUC2001/AP880217-0175
Summarizing document:  AP880217-0175
d:\2022\Fall\DMT\Week13/DUC2001/AP880318-0051
Summarizing document:  AP880318-0051
d:\2022\Fall\DMT\Week13/DUC2001/AP880330-0119
Summarizing document:  AP880330-0119
d:\2022\Fall\DMT\Week13/DUC2001/AP880331-0140
Summarizing document:  AP880331-0140
d:\2022\Fall\DMT\Week13/DUC2001/AP880409-0015
Summarizing document:  AP880409-0015
d:\2022\Fall\DMT\Week13/DUC2001/AP880419-0131
Summarizing document:  AP880419-0131
d:\2022\Fall\DMT\Week13/DUC2001/AP880510-0178
Summarizing document:  AP880510-0178
d:\2022\Fall\DMT\Week13/DUC2001/AP880517-0226
Summarizing document:  AP880517-0226
d:\2022\Fall\DMT\Week13/DUC2001/AP880520-0264
Summarizing document:  AP880520-0264
d:\2022\Fall\DMT\Week13/DUC2001/AP880601-0040
Summarizing document:  AP880601-0040
d:\2022\Fall\DMT\Week13/DUC2001/AP880613-0161
Summarizing document:  AP880613-0161
d:\2

In [62]:

file_list = os.listdir('../week12/data/20_newsgroup/sci.space/')
for i in range(len(file_list)):
    file_list[i] = '../week12/data/20_newsgroup/sci.space/' + file_list[i]
summaries = get_summary_from_file_list(file_list, 5)
for document, summary in summaries:
    print(summary)



Summarizing document:  ../week12/data/20_newsgroup/sci.space/59497
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59846
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59848
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59849
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59850
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59870
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59871
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59872
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59873
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59874
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59904
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59905
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59906
Summarizing document:  ../week12/data/20_newsgroup/sci.space/59907
Summarizing document:  ../week12/data/20_newsgroup/sci.space/5