In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
%%writefile "../summarizer_service.py"

from django.http import HttpResponse
from mangorest.mango import webapi
import whisper, hashlib, os, datetime, json, torch
from transformers import pipeline
import keybert
import math

def preprocess(text):
    """
    Strip the timeline column from a transcript.

    Every non-blank line is expected to look like
    ``<timeline> | <SPEAKER>: <SENTENCES>``. The part before the first '|'
    is discarded and the line is re-emitted as ``<SPEAKER>: <SENTENCES>``.

    Args:
        text: the transcript, one utterance per line.

    Returns:
        The cleaned utterances joined with newlines. An empty or
        whitespace-only transcript yields an empty string.

    Raises:
        IndexError: if a non-blank line contains no '|' separator.
    """
    result = []
    for line in text.strip().split('\n'):
        # Blank lines carry no utterance; skip them instead of failing on
        # the missing '|' column.
        if not line.strip():
            continue
        utterance = line.split('|')[1].strip()
        # Split on the FIRST colon only, so colons inside the spoken text
        # (times, ratios, URLs) are kept rather than truncating the content.
        speaker, _, content = utterance.partition(':')
        result.append(f"{speaker}: {content.strip()}")
    return '\n'.join(result)

#-----------------------------models------------------------------------------------------------------------               
# Both models are built once, when this module is imported, and are shared by
# every request handler defined below.
# Summarization pipeline from `transformers`, using the
# "vmarklynn/bart-large-cnn-samsum-icsi-ami" checkpoint.
# NOTE(review): truncation=True presumably makes the pipeline truncate inputs
# that are longer than the model accepts -- confirm against the pipeline docs.
summarizer = pipeline("summarization", "vmarklynn/bart-large-cnn-samsum-icsi-ami", truncation=True)
# KeyBERT keyword extractor, configured with the 'all-mpnet-base-v2' model.
kw_model = keybert.KeyBERT(model='all-mpnet-base-v2')
#-----------------------------------------------------------------------------------------------------               

@webapi("/parrot/summarize_text/")
def summarizeText(request, **kwargs):
    """
    Summarize a transcript and extract keywords from its text.

    POST fields read:
        transcription: transcript, passed through preprocess() before it is
                       summarized.
        text:          text handed to the keyword extractor.
        wordCount:     integer; 10% of it (rounded up) is used as the
                       summarizer's min_length.

    Returns:
        A JSON HttpResponse with the keys 'transcription', 'summary',
        'keywords_list_1', 'keywords_list_2' and 'keywords_list_3' (top-5
        keyphrases of 1, 2 and 3 words). Responds with HTTP 400 and an
        'error' message when a required field is missing, when wordCount is
        not an integer, or when the summarizer rejects the length constraint.
    """
    def bad_request(message):
        # Uniform JSON error payload for client-side mistakes.
        return HttpResponse(json.dumps({'error': message}),
                            content_type='application/json', status=400)

    post_data = request.POST.dict()
    transcription = post_data.get('transcription')
    text = post_data.get('text')
    wordCount = post_data.get('wordCount')

    if not transcription or text is None:
        return bad_request("'transcription' and 'text' are required")
    try:
        # POST values arrive as strings (or None when absent).
        min_length = math.ceil(int(wordCount) * 0.1)
    except (TypeError, ValueError):
        return bad_request("'wordCount' must be an integer")

    input_cleanned_text = preprocess(transcription)
    print("\n\n", input_cleanned_text, "\n\n")
    print( "min_length: ", min_length)
    print("\n\nSummarizing...")
    try:
        summary = summarizer(input_cleanned_text, min_length=min_length)[0]['summary_text']
    except ValueError as err:
        # The pipeline raises ValueError for unfeasible length constraints,
        # e.g. a min_length larger than its max_length.
        return bad_request(str(err))
    print("\n", summary, "\n")

    response = {'transcription': transcription, 'summary': summary}
    # One extraction pass per keyphrase length (1, 2 and 3 words).
    for ngram_size in (1, 2, 3):
        keywords = kw_model.extract_keywords(text,
                                             keyphrase_ngram_range=(ngram_size, ngram_size),
                                             stop_words='english',
                                             highlight=False,
                                             top_n=5)
        # Keep only the phrases; dict() drops the second element of each pair
        # while preserving order.
        keyword_list = list(dict(keywords).keys())
        print(keyword_list)
        response[f'keywords_list_{ngram_size}'] = keyword_list

    return HttpResponse(json.dumps(response), content_type='application/json')

#-----------------------------------------------------------------------------------------------------               
@webapi("/parrot/summarize_summary/")
def summarizeSummary(request, **kwargs):
    """
    Summarize an existing summary again to shorten it further.

    POST fields read:
        summary:         the text to re-summarize.
        wordCount-summ:  integer; 10% of it (rounded up) is used as the
                         summarizer's min_length.

    Returns:
        A JSON HttpResponse with the key 'summary'. Responds with HTTP 400
        and an 'error' message when 'summary' is missing, when
        'wordCount-summ' is not an integer, or when the summarizer rejects
        the length constraint.
    """
    def bad_request(message):
        # Uniform JSON error payload for client-side mistakes.
        return HttpResponse(json.dumps({'error': message}),
                            content_type='application/json', status=400)

    post_data = request.POST.dict()
    summary_input = post_data.get('summary')
    wordCount = post_data.get('wordCount-summ')

    if not summary_input:
        return bad_request("'summary' is required")
    try:
        # POST values arrive as strings (or None when absent).
        word_total = int(wordCount)
    except (TypeError, ValueError):
        return bad_request("'wordCount-summ' must be an integer")

    min_length = math.ceil(word_total * 0.1)
    # The "max" figure is only logged; it is not passed to the summarizer.
    print( "min: ", min_length, "max: ", math.ceil(word_total * 0.2))
    print("\n\nSummarizing again...")
    try:
        summary = summarizer(summary_input, min_length=min_length)[0]['summary_text']
    except ValueError as err:
        # The pipeline raises ValueError for unfeasible length constraints,
        # e.g. a min_length larger than its max_length.
        return bad_request(str(err))
    print("\n", summary, "\n")

    response = {'summary': summary}
    return HttpResponse(json.dumps(response), content_type='application/json')

#-----------------------------------------------------------------------------------------------------               

@webapi("/parrot/summarize_text_v2/")
def summarizeText(request, **kwargs):
    """
    Summarize a transcript with a caller-chosen minimum summary length.

    POST fields read:
        transcription:    transcript, passed through preprocess() before it
                          is summarized.
        text:             text handed to the keyword extractor.
        wordCount-input:  integer used directly as the summarizer's
                          min_length.

    Returns:
        A JSON HttpResponse with the keys 'transcription', 'summary',
        'keywords_list_1', 'keywords_list_2' and 'keywords_list_3' (top-5
        keyphrases of 1, 2 and 3 words). Responds with HTTP 400 and an
        'error' message when a required field is missing, when
        'wordCount-input' is not an integer, or when the summarizer rejects
        the length constraint (e.g. min_length above its max_length).

    NOTE(review): this function reuses the name `summarizeText`, which an
    earlier handler in this module (route /parrot/summarize_text/) also
    uses, so the module attribute ends up bound to this version. Confirm
    that `webapi` registers handlers by URL rather than by function name.
    """
    def bad_request(message):
        # Uniform JSON error payload for client-side mistakes.
        return HttpResponse(json.dumps({'error': message}),
                            content_type='application/json', status=400)

    post_data = request.POST.dict()
    transcription = post_data.get('transcription')
    text = post_data.get('text')
    wordCount_input = post_data.get('wordCount-input')

    if not transcription or text is None:
        return bad_request("'transcription' and 'text' are required")
    try:
        # POST values arrive as strings; the summarizer needs an integer
        # min_length, so convert before use.
        min_length = int(wordCount_input)
    except (TypeError, ValueError):
        return bad_request("'wordCount-input' must be an integer")

    input_cleanned_text = preprocess(transcription)
    print( "word count input length: ", wordCount_input)
    print("\n\nSummarizing...")
    try:
        summary = summarizer(input_cleanned_text, min_length=min_length)[0]['summary_text']
    except ValueError as err:
        # The pipeline raises ValueError for unfeasible length constraints,
        # e.g. a min_length larger than its max_length.
        return bad_request(str(err))
    print("\n", summary, "\n")

    response = {'transcription': transcription, 'summary': summary}
    # One extraction pass per keyphrase length (1, 2 and 3 words).
    for ngram_size in (1, 2, 3):
        keywords = kw_model.extract_keywords(text,
                                             keyphrase_ngram_range=(ngram_size, ngram_size),
                                             stop_words='english',
                                             highlight=False,
                                             top_n=5)
        # Keep only the phrases; dict() drops the second element of each pair
        # while preserving order.
        keyword_list = list(dict(keywords).keys())
        print(keyword_list)
        response[f'keywords_list_{ngram_size}'] = keyword_list

    return HttpResponse(json.dumps(response), content_type='application/json')

Overwriting ../summarizer_service.py


# Testing

In [6]:
from django.http import HttpResponse
from mangorest.mango import webapi
from transformers import pipeline
# Build the summarization pipeline with the same checkpoint and arguments as
# the service module, so the experiments below exercise the same model.
summarizer = pipeline("summarization", "vmarklynn/bart-large-cnn-samsum-icsi-ami", truncation=True)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Sample transcript used as the summarizer input in the experiments below.
# Lines are already in "SPEAKER: sentence" form (no timeline column).
dialogue = """
SPEAKER_00: Yeah, the universal ones.
SPEAKER_01: So presumably that might be an idea too.
SPEAKER_01: But it's also 25, you need a lot of new features.
SPEAKER_01: Yeah, yeah.
SPEAKER_01: I mean what, 25 euros is about?
SPEAKER_01: I don't know, 15 pounds or so?
SPEAKER_01: And that's quite a lot for a more control.
SPEAKER_00: Well, my first thought would be most remote controls are grey or black.
SPEAKER_00: As you said, they come with a TV, so it's normally just your basic grey, black, remote controls as a function.
SPEAKER_00: So maybe we could think about colour.
SPEAKER_00: That might make it a bit different from the rest at least.
SPEAKER_00: And as you say, we need to have some kind of gimmick.
SPEAKER_00: So I thought maybe something like...
"""
# Presumably the word count of `dialogue` -- TODO confirm how it was counted.
# It is not referenced by the cells visible here.
original_word_count = 119

### Letting users set the minimum word count raises an error when it exceeds the model's default max_length (142)

In [20]:
# Request min_length=500 while leaving max_length at the pipeline's default.
# The recorded output shows this raises ValueError, because the minimum (500)
# is larger than the maximum (142).
user_input = 500
summary = summarizer(dialogue, min_length = user_input)[0]['summary_text']
summary

Your min_length=500 must be inferior than your max_length=142.


ValueError: Unfeasible length constraints: the minimum length (500) is larger than the maximum length (142)

### What if we implicitly raise max_length to match? The call then succeeds, but once the user's value exceeds the original word count the model behaves like a text generation model rather than a summarizer

In [21]:
# Same request, but max_length is set to user_input + 1 so that
# min_length <= max_length holds. The recorded output shows the call succeeds
# with a warning that max_length (501) is larger than the input length (255).
user_input = 500
summary = summarizer(dialogue, min_length = user_input, max_length = user_input + 1)[0]['summary_text']
summary

Your max_length is set to 501, but you input_length is only 255. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=127)


'Firstly, the group discussed the price of the remote control, which was 25.5 Euros. Secondly, the team discussed the design of the new remote control. Thirdly, they talked about the colour of the product. Lastly, they decided to design the product with a lot of new features, such as the ability to use the remote as a remote control with a TV. Finally, they agreed to have some kind of gimmick to make the product stand out from other remote controls, like the colour and the shape of the design. The team decided to use a yellow remote control as well as a red one, which would make it more user-friendly and appealing to the younger generation. The meeting ended with a discussion about the budget and the target group. The group decided to keep the price at 25 Euros and the size of the project at 25 million Euros. They also agreed that the product should be made of plastic and the price should not exceed 25 Euros. The final decision was made by the group members, and the team would make a f