### Text Summarization


In [1]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize


In [2]:
# Sample diary entry used as the input document for the summarization pipeline
# (it is passed to _create_frequency_table and sent_tokenize further down).
text = '''Feeling very blue today. We found out that Barney is very sick. He’s my beloved Golden Retriever pup. Well, he’s not a pup anymore, he is 12, but for me, he always will be. The vet said he’s developed some kinda kidney disease. I worry if it’s too late to fix it at this point, the vet seemed worried as well. I have been trying to spend as much time with Barney as possible. Poor thing seems to be suffering. I feel super helpless and useless. I just wish we caught it earlier. There is a slight glimpse of hope though. Another vet said he might still recover. Unlikely but it might happen. Having a hard time focusing on my school or anything else. I tend to lose focus fast even when things are normal but especially right now. Everyone is stressing out about Barney and expecting the worst to happen. It’s heartbreaking. My younger brother Peter is very emotional. I mean I’m emotional but he’s taking it to another level. My parents are sad too. He has been part of our lives for so long that it’s hard to imagine otherwise. The one that’s always there for you and loves you unconditionally. I seem to be the only one who believes he’s gonna make it.
'''

In [3]:
def _create_frequency_table(text_string) -> dict:

    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable

In [4]:
def _score_sentences(sentences, freqTable) -> dict:
    sentenceValue = dict()

    for sentence in sentences:
        word_count_in_sentence = (len(word_tokenize(sentence)))
        for wordValue in freqTable:
            if wordValue in sentence.lower():
                if sentence[:10] in sentenceValue:
                    sentenceValue[sentence[:10]] += freqTable[wordValue]
                else:
                    sentenceValue[sentence[:10]] = freqTable[wordValue]

        sentenceValue[sentence[:10]] = sentenceValue[sentence[:10]] // word_count_in_sentence

    return sentenceValue

In [5]:
def _find_average_score(sentenceValue) -> int:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    average = int(sumValues / len(sentenceValue))

    return average

In [6]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:10] in sentenceValue and sentenceValue[sentence[:10]] > (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [7]:
# Count stemmed word frequencies over the whole diary entry.
freq_table = _create_frequency_table(text)

# Split the entry into sentences with NLTK's sentence tokenizer.
sentences = sent_tokenize(text)

# Score every sentence from the frequency table (keyed by sentence[:10]).
sentence_scores = _score_sentences(sentences, freq_table)

# NOTE: despite its name, `threshold` holds the *average* sentence score;
# the actual cut-off is derived from it in the next statement.
threshold = _find_average_score(sentence_scores)

# Keep only sentences scoring above 1.3x the average. The 1.3 factor is
# hard-coded here; raise it for a shorter summary, lower it for a longer one.
summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)

print(summary)

 Feeling very blue today. He’s my beloved Golden Retriever pup. Poor thing seems to be suffering. I feel super helpless and useless. Another vet said he might still recover. Unlikely but it might happen. It’s heartbreaking. My parents are sad too.


### Text Similarity Score

In [8]:
# Two diary entries whose sentence embeddings are compared in the following cells.
# NOTE(review): the first string has several missing spaces (e.g. "possible.Poor",
# "outabout", "Peteris", "sadtoo") and the second contains the HTML entity "&amp;".
# This looks like copy/paste residue -- confirm whether it is intentional, since the
# strings are fed to the model verbatim and may affect the similarity score.
sen = ["Feeling very blue today. We found out that Barney is very sick. He’s my beloved Golden Retriever pup. Well, he’s not a pup anymore, he is 12, but for me he always will be. The vet said he’s developed some kinda kidney disease. I worry if it’s too late to fix it at this point, the vet seemed worried as well. I have been trying to spend as much time with Barney as possible.Poor thing seems to be suffering. I feel super helpless and useless. I just wish we caught it earlier. There is a slight glimpse of hope though. Another vet said he might still recover. Unlikely but it might happen. Having a hard time focusing on my school or anything else. I tend to lose focus fast even when things are normal but especially right now. Everyone is stressing outabout Barney and expecting the worst to happen. It’s heartbreaking. My younger brother Peteris very emotional. I mean I’m emotional but he’s taking it to another level. My parents are sadtoo. He has been part of our lives for so long that it’s hard to imagine otherwise. The one that’salways there for you and loves you unconditionally. I seem to be the only one who believes he’s gonna make it.",
 "Had a huge fight with my mom today. I don’t like fighting and tend to avoid conflict. But my mom has no problem lashing out. She was saying very mean things about my dad. My mom &amp; dad are divorced, and he recently started another family. I am also struggling to accept it. My dad canceled on me AGAIN. I was very looking forward to our dinner. I should also mention that he JUST had another daughter. I understand he’s busy and all, but feeling rather neglected. He promised I would always be his priority, but it doesn’t look like it. Perhaps I shouldn’t have fought with my mom yesterday. Is she right about him? Maybe he is selfish. Decided to take my mind off my family drama. I love working out in my gym and today I saw an ad about marathon training. So I immediately signed up. I already did a half marathon before and loved it. Time to extend my horizons. It’s gonna be intense, but I’m ready. We’ll see how far I’ll go, but I feel determined."]

# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model identified by 'bert-base-nli-mean-tokens'.
model = SentenceTransformer('bert-base-nli-mean-tokens')
# Encoding: produce one embedding per entry in `sen`.
sen_embeddings = model.encode(sen)
# Displayed as the cell output; the recorded run shows (2, 768),
# i.e. two entries with 768 values each.
sen_embeddings.shape

(2, 768)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Compare the first entry's embedding with every remaining embedding.
# Slicing with [:1] keeps the first row two-dimensional, which is the
# shape cosine_similarity expects for its first argument.
sim = cosine_similarity(sen_embeddings[:1], sen_embeddings[1:])

In [10]:
# Show the similarity matrix scaled to a percentage.
print(100 * sim)

[[75.62519]]
