In [None]:
import numpy as np
import pandas as pd 
import os

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

from sumy.summarizers.lex_rank import LexRankSummarizer 
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.summarizers.reduction import ReductionSummarizer

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/amazon-fine-food-reviews/database.sqlite
/kaggle/input/amazon-fine-food-reviews/hashes.txt
/kaggle/input/amazon-fine-food-reviews/Reviews.csv


In [30]:
df = pd.read_csv('../input/amazon-fine-food-reviews/Reviews.csv', index_col = 'Id')[:5]
df.head(2)

Unnamed: 0_level_0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...


In [46]:
df.dropna(inplace=True)
# df.isnull().sum()

In [47]:
reviews = pd.DataFrame({
    'Review':df['Text'].values, 
    'Summary':df['Summary'].values})
reviews.head()

Unnamed: 0,Review,Summary
0,I have bought several of the Vitality canned d...,Good Quality Dog Food
1,Product arrived labeled as Jumbo Salted Peanut...,Not as Advertised
2,This is a confection that has been around a fe...,"""Delight"" says it all"
3,If you are looking for the secret ingredient i...,Cough Medicine
4,Great taffy at a great price. There was a wid...,Great taffy


In [48]:
contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}

In [49]:
def clean_text(text, remove_stopwords = True):
    text = text.lower().split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)
    
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'\.+', ".", text)
    text = re.sub(r'[_"\-;%()|+&=*%,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)
        
    return text

In [50]:
clean_summaries = []
for summary in reviews['Summary']:
    clean_summaries.append(clean_text(summary, remove_stopwords=False))
print("Cleaned summaries")

clean_texts = []
for text in reviews['Review']:
    clean_texts.append(clean_text(text))
print("Cleaned Reviews")

Cleaned summaries
Cleaned Reviews


In [51]:
for i in range(5):
    print("Clean Review #",i+1)
    print(clean_texts[i])
    print("Summary")
    print(clean_summaries[i])
    print()

Clean Review # 1
bought several vitality canned dog food products found good quality. product looks like stew processed meat smells better. labrador finicky appreciates product better most.
Summary
good quality dog food

Clean Review # 2
product arrived labeled jumbo salted peanuts.the peanuts actually small sized unsalted. sure error vendor intended represent product jumbo .
Summary
not as advertised

Clean Review # 3
confection around centuries. light pillowy citrus gelatin nuts case filberts. cut tiny squares liberally coated powdered sugar. tiny mouthful heaven. chewy flavorful. highly recommend yummy treat. familiar story c.s. lewis lion witch wardrobe treat seduces edmund selling brother sisters witch.
Summary
 delight  says it all

Clean Review # 4
looking secret ingredient robitussin believe found it. got addition root beer extract ordered good made cherry soda. flavor medicinal.
Summary
cough medicine

Clean Review # 5
great taffy great price. wide assortment yummy taffy. deli

In [52]:
# trainData = reviews.iloc[:int(len(reviews) * 0.8)]
# testData = reviews.iloc[int(len(reviews) * 0.8):]

# trainX = trainData['Review'].values
# trainy = trainData['Summary'].values
# testX = testData['Review'].values
# testy = testData['Summary'].values

In [53]:
language = 'english'
sentence_count = 1
sw = stopwords.words('english') 

In [58]:
def cosine(X, Y):
    X_list = word_tokenize(X)  
    Y_list = word_tokenize(Y)  
    l1 =[]
    l2 =[] 
    X_set = {w for w in X_list if not w in sw}  
    Y_set = {w for w in Y_list if not w in sw} 
    rvector = X_set.union(Y_set)  
    for w in rvector: 
        if w in X_set: 
            l1.append(1)  
        else: 
            l1.append(0) 
        if w in Y_set: 
            l2.append(1) 
        else: 
            l2.append(0)
    c = 0
    for i in range(len(rvector)): 
            c += l1[i] * l2[i] 
    temp = float((sum(l1)*sum(l2))**0.5)
    if float((sum(l1)*sum(l2))**0.5) == 0:
        temp = -float("inf")
    cosine = c / temp
    return cosine

## LexRank

In [55]:
summarizer1 = LexRankSummarizer(Stemmer(language))
summarizer1.stopwords = get_stop_words(language)
lexRankSummary = []
for i in range(len(clean_texts)):
    parser = PlaintextParser(clean_texts[i], Tokenizer(language))
    lexRankSummary.append(summarizer1(parser.document, sentence_count))

In [59]:
lexRankCosine = 0
for i in range(len(clean_summaries)):
    for sentence in lexRankSummary[i]:
        lexRankCosine += cosine(clean_summaries[i], str(sentence))
lexRankCosine = lexRankCosine/len(clean_summaries)

## Luhn

In [60]:
summarizer2 = LuhnSummarizer(Stemmer(language))
summarizer2.stop_words = get_stop_words(language)
luhnSummary = []
for i in range(len(clean_texts)):
    parser = PlaintextParser.from_string(clean_texts[i], Tokenizer(language))
    luhnSummary.append(summarizer2(parser.document, sentence_count))

In [61]:
luhnCosine = 0
for i in range(len(clean_summaries)):
    for sentence in luhnSummary[i]:
        luhnCosine += cosine(clean_summaries[i], str(sentence))
luhnCosine = luhnCosine/len(clean_summaries)

## LSA

In [62]:
summarizer3 = LsaSummarizer(Stemmer(language))
summarizer3.stop_words = get_stop_words(language)
lsaSummary = []
for i in range(len(clean_texts)):
    parser = PlaintextParser.from_string(clean_texts[i], Tokenizer(language))
    lsaSummary.append(summarizer3(parser.document, sentence_count))

  warn(message % (words_count, sentences_count))


In [63]:
lsaCosine = 0
for i in range(len(clean_summaries)):
    for sentence in lsaSummary[i]:
        lsaCosine += cosine(clean_summaries[i], str(sentence))
lsaCosine = lsaCosine/len(clean_summaries)

## TextRankSummarizer

In [64]:
summarizer4 = TextRankSummarizer(Stemmer(language))
summarizer4.stop_words = get_stop_words(language)
textRankSummary = []
for i in range(len(clean_texts)):
    parser = PlaintextParser.from_string(clean_texts[i], Tokenizer(language))
    textRankSummary.append(summarizer4(parser.document, sentence_count))

In [65]:
textRankCosine = 0
for i in range(len(clean_summaries)):
    for sentence in textRankSummary[i]:
        textRankCosine += cosine(clean_summaries[i], str(sentence))
textRankCosine = textRankCosine/len(clean_summaries)

## Edmundson

In [71]:
summarizer5 = EdmundsonSummarizer(Stemmer(language))
summarizer5.stop_words = get_stop_words(language)
edmundsonSummary = []
for i in range(len(clean_texts)):
    summarizer5.bonus_words = ['good'] + clean_texts[i].split()
    summarizer5.stigma_words = ['bad', 'worst']
    summarizer5.null_words = ['zdfgthdvndadv']
    parser = PlaintextParser.from_string(clean_texts[i], Tokenizer(language))
    edmundsonSummary.append(summarizer5(parser.document, sentence_count))

In [72]:
edmundsonCosine = 0
for i in range(len(clean_summaries)):
    for sentence in edmundsonSummary[i]:
        edmundsonCosine += cosine(clean_summaries[i], str(sentence))
edmundsonCosine = edmundsonCosine/len(clean_summaries)

## SumBasic

In [73]:
summarizer6 = SumBasicSummarizer(Stemmer(language))
summarizer6.stop_words = get_stop_words(language)
sumBasicSummary = []
for i in range(len(clean_texts)):
    parser = PlaintextParser.from_string(clean_texts[i], Tokenizer(language))
    sumBasicSummary.append(summarizer6(parser.document, sentence_count))

In [74]:
sumBasicCosine = 0
for i in range(len(clean_summaries)):
    for sentence in sumBasicSummary[i]:
        sumBasicCosine += cosine(clean_summaries[i], str(sentence))
sumBasicCosine = sumBasicCosine/len(clean_summaries)

## KL Summariser

In [75]:
summarizer7 = KLSummarizer(Stemmer(language))
summarizer7.stop_words = get_stop_words(language)
klSummary = []
for i in range(len(clean_texts)):
    parser = PlaintextParser.from_string(clean_texts[i], Tokenizer(language))
    klSummary.append(summarizer7(parser.document, sentence_count))

In [76]:
klCosine = 0
for i in range(len(clean_summaries)):
    for sentence in klSummary[i]:
        klCosine += cosine(clean_summaries[i], str(sentence))
klCosine = klCosine/len(clean_summaries)

## Reduction Summarizer

In [77]:
summarizer8 = KLSummarizer(Stemmer(language))
summarizer8.stop_words = get_stop_words(language)
reductionSummary = []
for i in range(len(clean_texts)):
    parser = PlaintextParser.from_string(clean_texts[i], Tokenizer(language))
    reductionSummary.append(summarizer8(parser.document, sentence_count))

In [78]:
reductionCosine = 0
for i in range(len(clean_summaries)):
    for sentence in reductionSummary[i]:
        reductionCosine += cosine(clean_summaries[i], str(sentence))
reductionCosine = reductionCosine/len(clean_summaries)

## Metrics

In [79]:
print('Similarity between golden summary & LexRank   : %.3f' % lexRankCosine)
print('Similarity between golden summary & Luhn      : %.3f' % luhnCosine)
print('Similarity between golden summary & LSA       : %.3f' % lsaCosine)
print('Similarity between golden summary & TextRank  : %.3f' % textRankCosine)
print('Similarity between golden summary & Edmundson : %.3f' % edmundsonCosine)
print('Similarity between golden summary & sumBasic  : %.3f' % sumBasicCosine)
print('Similarity between golden summary & KL        : %.3f' % klCosine)
print('Similarity between golden summary & Reduction : %.3f' % reductionCosine)

Similarity between golden summary & LexRank   : 0.159
Similarity between golden summary & Luhn      : 0.142
Similarity between golden summary & LSA       : 0.116
Similarity between golden summary & TextRank  : 0.131
Similarity between golden summary & Edmundson : 0.122
Similarity between golden summary & sumBasic  : 0.145
Similarity between golden summary & KL        : 0.118
Similarity between golden summary & Reduction : 0.118
