In [44]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [45]:
# Fetch the NLTK resources used below: the Punkt sentence tokenizer models
# and the English stop-word list.
nltk.download('punkt')
# Newer NLTK releases load the sentence tokenizer from 'punkt_tab' instead of
# the pickled 'punkt' models; downloading both keeps sent_tokenize working
# across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [46]:
def preprocess_text(text):
    """Split ``text`` into sentences and normalise each one for vectorisation.

    Every sentence is word-tokenised; tokens that are not purely alphanumeric
    are dropped, the rest are lower-cased, English stop words are removed and
    the survivors are reduced to their Porter stems.

    Args:
        text: Raw document text.

    Returns:
        A list of strings, one per sentence found by ``sent_tokenize`` and in
        the same order. Each string is the space-joined stems of that
        sentence; it is the empty string when no token survives filtering, so
        the list always stays aligned with ``sent_tokenize(text)``.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    preprocessed_sentences = []
    for sentence in sent_tokenize(text):
        tokens = (
            token.lower()
            for token in word_tokenize(sentence)
            if token.isalnum()
        )
        # Filter stop words BEFORE stemming: the stop-word list holds surface
        # forms, and stems such as "thi" (this) or "wa" (was) would no longer
        # match it if the order were reversed.
        stems = [stemmer.stem(token) for token in tokens if token not in stop_words]
        preprocessed_sentences.append(' '.join(stems))

    return preprocessed_sentences


In [47]:
def calculate_textrank_scores(sentences, damping_factor=0.85,
                              convergence_threshold=0.001, iteration_count=10):
    """Score sentences with weighted PageRank over a TF-IDF similarity graph.

    Sentences are the graph nodes; the edge weight between two sentences is
    the cosine similarity of their TF-IDF vectors. Scores are refined with

        score[i] = (1 - d) + d * sum_{j != i} (w[j][i] / out[j]) * score[j]

    where ``out[j]`` is the total weight of node ``j``'s edges to *other*
    nodes (self-similarity is excluded).

    Args:
        sentences: List of pre-processed sentence strings.
        damping_factor: PageRank damping factor ``d``.
        convergence_threshold: Iteration stops once the summed absolute
            change of all scores in one pass is at or below this value.
        iteration_count: Maximum number of passes.

    Returns:
        A list of floats, one score per input sentence, in input order.
        An empty input yields an empty list.

    Raises:
        ValueError: Propagated from ``TfidfVectorizer`` when the sentences
            contain no usable terms (empty vocabulary).
    """
    num_sentences = len(sentences)
    if num_sentences == 0:
        return []

    # Vectorize the sentences and build the pairwise similarity graph.
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    similarity = cosine_similarity(tfidf_matrix, tfidf_matrix).tolist()

    # Total outgoing edge weight per node, ignoring the self-loop on the
    # diagonal. Computed once because it does not change between passes.
    out_weights = [
        sum(weight for k, weight in enumerate(row) if k != j)
        for j, row in enumerate(similarity)
    ]

    scores = [1.0] * num_sentences
    for _ in range(iteration_count):
        prev_scores = list(scores)
        for i in range(num_sentences):
            rank = 0.0
            for j in range(num_sentences):
                # Skip self-loops and isolated nodes (no outgoing weight),
                # which would otherwise cause a division by zero.
                if i != j and out_weights[j] > 0:
                    rank += similarity[j][i] / out_weights[j] * prev_scores[j]
            scores[i] = (1 - damping_factor) + damping_factor * rank

        total_change = sum(abs(new - old) for new, old in zip(scores, prev_scores))
        if total_change <= convergence_threshold:
            break

    return scores


In [48]:
def generate_summary(sentences, scores, num_sentences):
    """Join the ``num_sentences`` highest-scoring sentences into one string.

    Each sentence is paired with the score at the same index. The pairs are
    ordered from highest to lowest (ties fall back to comparing the sentence
    text), and the leading ``num_sentences`` sentences are joined with single
    spaces in that ranked order.
    """
    scored = []
    for position, text in enumerate(sentences):
        scored.append((scores[position], text))

    # Descending tuple sort: score first, sentence text as tie-breaker.
    scored.sort(reverse=True)

    chosen = scored[:num_sentences]
    return ' '.join(text for _, text in chosen)


In [49]:
def main():
    """Prompt for a UTF-8 text file and print a three-sentence summary of it.

    Sentences are ranked using their pre-processed (stemmed, stop-word-free)
    form, but the printed summary is built from the original sentences so
    that it remains readable.
    """
    # Prompt the user to enter the text file path
    file_path = input('Enter the text file path: ')

    # Read the text from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Keep the untouched sentences for output. preprocess_text emits exactly
    # one entry per sent_tokenize sentence, so both lists share indices.
    original_sentences = sent_tokenize(text)
    preprocessed_sentences = preprocess_text(text)

    num_sentences = 3  # Adjust the number of sentences in the summary as desired

    if original_sentences:
        # Rank on the normalised text, then summarise with the original text.
        scores = calculate_textrank_scores(preprocessed_sentences)
        summary = generate_summary(original_sentences, scores, num_sentences)
    else:
        # Nothing to rank (empty or whitespace-only file).
        summary = ''

    # Print the summary
    print('Summary:')
    print(summary)


# Entry-point guard: call main() only when this code is executed directly,
# not when it is imported as a module.
# NOTE(review): inside a notebook kernel this condition is presumably always
# true, so running this cell will prompt for a file path — confirm.
if __name__ == '__main__':
    main()
