In [48]:
import nltk 
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import sent_tokenize, word_tokenize
import pandas as pd
import numpy as np
import networkx as nx

In [49]:
# read & clean text file 

file = open("Iceland Article.txt", "r")
text = file.readlines()
sentences = nltk.sent_tokenize(text[0])
stop_words = stopwords.words('english')

text



In [50]:
def sentence_similarity(sent1, sent2, stopwords=None):
    # function to generate cosine similarity scores
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
        
    cosine_similarity = 1 - cosine_distance(vector1, vector2)
    return cosine_similarity 

In [51]:
def similarity_matrix(sentences, stop_words):
 
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j: 
                continue 
            similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j], stop_words)
    return similarity_matrix

In [54]:
def sentence_scores(file_name):
    summarize_text = []

    sentence_similarity_martix = similarity_matrix(sentences, stop_words)
    
    #rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)
    
    # sort the rank and pick top sentences
    scored_sentences = [(scores[i],s) for i,s in enumerate(sentences)]
    
    # adjust the width of dataframe to show full sentence
    pd.set_option('display.max_colwidth', -1)
    scored_sentences = pd.DataFrame(scored_sentences, columns=('Score','Sentence'))
    
    return scored_sentences
sentence_scores(file)

Unnamed: 0,Score,Sentence
0,0.07823,"At the very heart of Iceland is the country’s gleaming, glinting namesake: ice."
1,0.077142,"Glaciers make up roughly 10% of Iceland, and they bring close to 2 million tourists from all over the world to the country each year."
2,0.075263,But these mighty masses of ice are more fragile than they may seem.
3,0.077338,"In the wake of climate change, glaciologists predict that in 200 years all of Iceland’s glaciers will have disappeared."
4,0.073275,One already has.
5,0.077658,"In 2014, when its ice was no longer thick enough to move, Okjokull glacier was pronounced dead."
6,0.078485,A lake of melted ice and barren stretch of stone and dirt now dominates the landscape where the glacier once lived.
7,0.076857,"The site was renamed Ok, and “jokull,” meaning “glacier” in Icelandic, was dropped."
8,0.077997,"In August, local geologists and climate advocates installed a plaque at the site of the former glacier, which reads: “In the next 200 years, all our glaciers are expected to follow the same path."
9,0.074494,This monument is to acknowledge that we know what is happening and what needs to be done.


In [55]:
def summary(n):
    # function to compile summary of 'n' sentences based on top scores
    
    # create dataframe of sentences and scores
    df = sentence_scores(file)
    
    scores = df.iloc[:,0].tolist()
    
    # get list of 'n' top scores
    top_n_scores = []
    for i in range(n):
        top_n_scores.append(max(scores)) #append largest element to list of results
        scores.remove(max(scores)) # remove largest element from old list
  
    # get indexes of sentences with top 'n' scores
    sentence_idx = []
    for score in top_n_scores:
        i = df.Score[df.Score==score].index.tolist()
        sentence_idx.append(i)

    # reduce dimension of list
    sentence_idx = [num for num in sentence_idx for num in num]
    # sort index from smallest to largest to ensure sentences are in correct order
    sentence_idx.sort()

    # generate list of 'n' sentences and combine them 
    sentences = []
    for i in sentence_idx:
        sentence = df.iloc[i][1]
        sentences.append(sentence)
    summarized_text = ' '.join(sentences)

    # create dataframe for comparing texts 
    comparison = [text, [summarized_text]]
    comparison = pd.DataFrame.from_records(comparison).transpose()
    comparison.columns = ('Original', 'Summarized')

    return comparison

In [60]:
# test output with n = 5 

summary(5)

Unnamed: 0,Original,Summarized
0,"At the very heart of Iceland is the country’s gleaming, glinting namesake: ice. Glaciers make up roughly 10% of Iceland, and they bring close to 2 million tourists from all over the world to the country each year. But these mighty masses of ice are more fragile than they may seem. In the wake of climate change, glaciologists predict that in 200 years all of Iceland’s glaciers will have disappeared. One already has. In 2014, when its ice was no longer thick enough to move, Okjokull glacier was pronounced dead. A lake of melted ice and barren stretch of stone and dirt now dominates the landscape where the glacier once lived. The site was renamed Ok, and “jokull,” meaning “glacier” in Icelandic, was dropped. In August, local geologists and climate advocates installed a plaque at the site of the former glacier, which reads: “In the next 200 years, all our glaciers are expected to follow the same path. This monument is to acknowledge that we know what is happening and what needs to be done. Only you know if we did it.” Written by Icelandic author Andri Snaer Magnason, the inscription is meant to serve as a warning that unless something changes, experiencing Iceland’s awe-inspiring glaciers will be a privilege of past generations. Some locals, though, are hopeful that Icelandic companies offering “sustainable tours” by offsetting their emissions with reforestation, capping the sizes of their groups and adhering to strict recycling rules will be able to help preserve the country’s glaciers. The Icelandic government is also making it a top priority to save these melting giants, pledging to cut 40% of Iceland’s emissions by 2030.","At the very heart of Iceland is the country’s gleaming, glinting namesake: ice. A lake of melted ice and barren stretch of stone and dirt now dominates the landscape where the glacier once lived. In August, local geologists and climate advocates installed a plaque at the site of the former glacier, which reads: “In the next 200 years, all our glaciers are expected to follow the same path. Only you know if we did it.” Written by Icelandic author Andri Snaer Magnason, the inscription is meant to serve as a warning that unless something changes, experiencing Iceland’s awe-inspiring glaciers will be a privilege of past generations. Some locals, though, are hopeful that Icelandic companies offering “sustainable tours” by offsetting their emissions with reforestation, capping the sizes of their groups and adhering to strict recycling rules will be able to help preserve the country’s glaciers."
