In [None]:
import string
from heapq import nlargest
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
import string

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
nlp = spacy.load('en_core_web_sm') 

In [None]:
stopwords = list(STOP_WORDS)

In [None]:
punctuation = string.punctuation + '\n'
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'

Tokenize the Document

In [None]:
def tokenize(doc):
    tokens = [token.text for token in doc]
    return tokens

Create a word frequency dictionary of every word in the document

In [None]:
def wordFrequencies(doc):
    word_frequencies = {}
    for word in doc:
        if word.text.lower() not in stopwords:
            if word.text.lower() not in punctuation:
                if word.text not in word_frequencies.keys():
                    word_frequencies[word.text] = 1
                else:
                    word_frequencies[word.text] += 1
    return word_frequencies

Divides the every word frequency by the max word frequency in the document

In [None]:
def wordFreqPerMax(wordFrequencies):
    max_frequency = max(wordFrequencies.values())
    for word in wordFrequencies.keys():
        wordFrequencies[word] = wordFrequencies[word]/max_frequency
    return wordFrequencies

Tokenize each sentence

In [None]:
def sent_tokenizer(doc):
    sent_tokens = [sent for sent in doc.sents]
    return sent_tokens 

Scores Each Sentence in the document based on the word frequencies of each sentence to show how important it is in the document

In [None]:
def sent_scores(sent_tokens, word_frequencies):
    scores = {}
    for sent in sent_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in scores.keys():
                    scores[sent] = word_frequencies[word.text.lower()]
                else:
                    scores[sent] += word_frequencies[word.text.lower()]
    return scores

In [None]:
def summarize(text):
    text = nlp(text)
    word_frequencies = wordFrequencies(text)
    word_frequencies = wordFreqPerMax(word_frequencies)
    sent_tokens = sent_tokenizer(text)
    scores = sent_scores(sent_tokens, word_frequencies)
    select_length = 4
    #select_length = int(len(sent_tokens)*0.3)
    summary = nlargest(select_length, scores, key = scores.get)
    
    return summary

In [None]:
def concatenate_sentences(summaries):
    summary_strings = []
    for summary in summaries:
        summary_string = ''
        for sentence in summary:
            summary_string += sentence.text + ' '
        summary_strings.append(summary_string.strip())
    return summary_strings

In [None]:
practice = "My name is Nick Calvaresi. I am grad student at Northeastern University. I am studying Data Science with a passion in Sports. In my free time I like doing many other things. I like to hang with my dog, family and friends, play and watch football, go on hikes, skiing, and scuba diving. I am currently 24 years old and living in Boston"

In [None]:
doc = nlp(practice)

In [None]:
prac = summarize(doc)
print(prac)

[I like to hang with my dog, family and friends, play and watch football, go on hikes, skiing, and scuba diving., In my free time I like doing many other things., I am currently 24 years old and living in Boston, I am grad student at Northeastern University.]


In [None]:
concatenate_sentences(prac)

['I like to hang with my dog , family and friends , play and watch football , go on hikes , skiing , and scuba diving .',
 'In my free time I like doing many other things .',
 'I am currently 24 years old and living in Boston',
 'I am grad student at Northeastern University .']

In [None]:
#!pip install datasets

In [None]:
from datasets import load_dataset
import pandas as pd

In [None]:
df = load_dataset("cnn_dailymail", '3.0.0')

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Downloading and preparing dataset cnn_dailymail/3.0.0 to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de...


Downloading data files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/159M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/661k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset cnn_dailymail downloaded and prepared to /root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#pip install rouge

In [None]:
from rouge import Rouge

In [None]:
rouge = Rouge()

In [None]:
train = [dic for dic in df["train"]]
df_train = pd.DataFrame(train)

In [None]:
test = [dic for dic in df["test"]]
df_test = pd.DataFrame(test)

In [None]:
df_train_x = df_train["article"]
df_train_y = df_train["highlights"]

In [None]:
tester = df_train_x.head(3500)
len(tester)

3500

In [None]:
len(df_train_y)

287113

In [None]:
new_train_y = []
for sub in df_train_y:
    new_train_y.append(sub.replace("\n", ""))

In [None]:
len(new_train_y)

287113

In [None]:
print(new_train_y[:10])

["Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .Young actor says he has no plans to fritter his cash away .Radcliffe's earnings from first five Potter films have been held in trust fund .", 'Mentally ill inmates in Miami are housed on the "forgotten floor"Judge Steven Leifman says most are there as a result of "avoidable felonies"While CNN tours facility, patient shouts: "I am the son of the president"Leifman says the system is unjust and he\'s fighting for change .', 'NEW: "I thought I was going to die," driver says .Man says pickup truck was folded in half; he just has cut on face .Driver: "I probably had a 30-, 35-foot free fall"Minnesota bridge collapsed during rush hour Wednesday .', 'Five small polyps found during procedure; "none worrisome," spokesman says .President reclaims powers transferred to vice president .Bush undergoes routine colonoscopy at Camp David .', "NEW: NFL chief, Atlanta Falcons owner critical of Michael Vick's conduct .NFL suspen

In [None]:
summaries = []
references = []
for i in range(len(tester)):
    summ = summarize(tester[i])
    summaries.append(summ)
    references.append(new_train_y[i])

In [None]:
summaries = concatenate_sentences(summaries)

In [None]:
print(len(references))

3500


In [None]:
#for i in summaries:
 # print(i.lower())

In [None]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [None]:
def clean_text(text):
    text = text.lower()
    text = text.split()
    tmp = []
    for word in text:
        if word in contractions:
            tmp.append(contractions[word])
        else:
            tmp.append(word)
    text = ' '.join(tmp)
    return text

In [None]:
clean_summaries = []
for i in summaries:
  clean_summaries.append(clean_text(i))

In [None]:
clean_references = []
for i in references:
  clean_references.append(clean_text(i))

In [None]:
rouge.get_scores(summaries, references, avg=True)

{'rouge-1': {'r': 0.40514556717067396,
  'p': 0.16408271807649508,
  'f': 0.2308189369320439},
 'rouge-2': {'r': 0.1370451202558655,
  'p': 0.04664151271948122,
  'f': 0.0687120275448437},
 'rouge-l': {'r': 0.3770268608048172,
  'p': 0.1527758384764616,
  'f': 0.21488323637365783}}

In [None]:
rouge.get_scores(clean_summaries, clean_references, avg=True)

{'rouge-1': {'r': 0.42939793527874714,
  'p': 0.1755352584020423,
  'f': 0.2461471459209962},
 'rouge-2': {'r': 0.1451701391812121,
  'p': 0.04951051923095803,
  'f': 0.0728939595132959},
 'rouge-l': {'r': 0.3968983131613413,
  'p': 0.16233016913400264,
  'f': 0.22759637727677468}}