In [None]:
%reload_ext autoreload
%autoreload 2

### Establishing our summarization service

In [None]:
#%%writefile "../summarizer_service.py"

from django.http import HttpResponse
from mangorest.mango import webapi
import whisper, hashlib, os, datetime, json, torch
from transformers import pipeline
import keybert
import math

def preprocess(text):
    """
    Remove timelines and return the result in this format:
    {SPEAKER}: {SENTENCES}

    Each input line is expected to look like
    ``0:00:00 - 0:00:06 | SPEAKER_01: Yeah, we had a long talk.``

    Parameters
    ----------
    text : str
        Transcript with one utterance per line.

    Returns
    -------
    str
        One ``{SPEAKER}: {SENTENCES}`` line per non-blank input line, joined
        with newlines. Empty input yields an empty string.

    Notes
    -----
    * Only the FIRST ``|`` separates the timeline from the utterance and only
      the FIRST ``:`` after it separates the speaker from the content, so
      utterances such as "Meet at 3:30" are kept whole.
    * Blank lines are skipped.
    * A line without ``|`` (no timeline) or without a speaker label is kept
      as-is (stripped) rather than raising ``IndexError``.
    """
    result = []
    for line in text.strip().split('\n'):
        line = line.strip()
        if not line:
            # Blank separator lines carry no utterance.
            continue

        _timeline, bar, utterance = line.partition('|')
        if not bar:
            # No timeline prefix on this line: nothing to remove.
            result.append(line)
            continue

        utterance = utterance.strip()
        speaker, colon, content = utterance.partition(':')
        if not colon:
            # No speaker label: keep the utterance text unchanged.
            result.append(utterance)
            continue

        result.append(f"{speaker}: {content.strip()}")
    return '\n'.join(result)

#-----------------------------models------------------------------------------------------------------------               
# Both models are created once, when this cell/module is executed, and are
# shared by the request handlers defined below.
# Summarization pipeline built from the named checkpoint.
# NOTE(review): truncation=True presumably clips over-long inputs to the
# model's maximum input length — confirm against the pipeline docs.
summarizer = pipeline("summarization", "vmarklynn/bart-large-cnn-samsum-acsi-ami-v2", truncation=True)
# KeyBERT keyword extractor using the 'all-mpnet-base-v2' embedding model.
kw_model = keybert.KeyBERT(model='all-mpnet-base-v2')
#-----------------------------------------------------------------------------------------------------               

def _keyword_lists(text, ngram_sizes=(1, 2, 3), top_n=5):
    """Return one list of key phrases per n-gram size, in the order given.

    For each size ``n`` the module-level ``kw_model`` is asked for the
    ``top_n`` phrases of exactly ``n`` words; scores are dropped and only the
    phrases are kept. Each list is printed as it is produced.
    """
    lists = []
    for size in ngram_sizes:
        scored = kw_model.extract_keywords(text,
                                           keyphrase_ngram_range=(size, size),
                                           stop_words='english',
                                           highlight=False,
                                           top_n=top_n)
        phrases = list(dict(scored).keys())
        print(phrases)
        lists.append(phrases)
    return lists


@webapi("/parrot/summarize_text/")
def summarizeText(request, **kwargs):
    """Summarize a posted transcript and extract 1-, 2- and 3-word keywords.

    POST fields
    -----------
    transcription : required; transcript lines in the format accepted by
        ``preprocess``.
    text : optional; text used for keyword extraction. When it is missing or
        empty, the cleaned transcript is used instead.
    wordCount : may be posted by clients but is not used by this handler.

    Returns
    -------
    HttpResponse
        JSON with keys ``transcription``, ``summary``, ``keywords_list_1``,
        ``keywords_list_2`` and ``keywords_list_3``; or, when
        ``transcription`` is missing/blank, HTTP 400 with an ``error`` key.
    """
    post_data = request.POST.dict()
    transcription = post_data.get('transcription')

    # Without a transcript preprocess() would fail on None; answer with a
    # client error instead of an unhandled exception.
    if not transcription or not transcription.strip():
        return HttpResponse(json.dumps({'error': "Missing required field 'transcription'."}),
                            content_type='application/json', status=400)

    input_cleanned_text = preprocess(transcription)
    # Keyword extraction needs a string; fall back to the cleaned transcript
    # when the client did not send 'text'.
    text = post_data.get('text') or input_cleanned_text

    print("\n\n", input_cleanned_text, "\n\n")
    print("\n\nSummarizing...")
    summary = summarizer(input_cleanned_text)[0]['summary_text']
    print("\n", summary, "\n")

    keywords_list_1, keywords_list_2, keywords_list_3 = _keyword_lists(text)

    response = {'transcription': transcription, 'summary': summary,
                'keywords_list_1': keywords_list_1, 'keywords_list_2': keywords_list_2,
                'keywords_list_3': keywords_list_3,}
    return HttpResponse(json.dumps(response), content_type='application/json')

#-----------------------------------------------------------------------------------------------------               
@webapi("/parrot/summarize_summary/")
def summarizeSummary(request, **kwargs):
    """Condense an existing summary to roughly 10%-25% of a given word count.

    POST fields
    -----------
    summary : required; the text to summarize again.
    wordCount-summ : required; positive integer. The summarizer is called
        with ``min_length = ceil(0.10 * wordCount)`` and
        ``max_length = ceil(0.25 * wordCount)``.

    Returns
    -------
    HttpResponse
        JSON ``{"summary": ...}``; or HTTP 400 with an ``error`` key when
        ``summary`` is missing/blank or ``wordCount-summ`` is missing,
        non-numeric or not positive.
    """
    post_data = request.POST.dict()
    summary_input = post_data.get('summary')

    # Parse the count once; int(None) / int("abc") would otherwise surface as
    # an unhandled TypeError / ValueError.
    try:
        word_count = int(post_data.get('wordCount-summ'))
    except (TypeError, ValueError):
        word_count = None

    if not summary_input or not summary_input.strip():
        return HttpResponse(json.dumps({'error': "Missing required field 'summary'."}),
                            content_type='application/json', status=400)
    if word_count is None or word_count <= 0:
        return HttpResponse(json.dumps({'error': "'wordCount-summ' must be a positive integer."}),
                            content_type='application/json', status=400)

    min_length = math.ceil(word_count * 0.1)
    max_length = math.ceil(word_count * 0.25)

    print( "min: ", min_length, "max: ", max_length)
    print("\n\nSummarizing again...")
    summary = summarizer(summary_input, min_length=min_length, max_length=max_length)[0]['summary_text']
    print("\n", summary, "\n")

    response = {'summary': summary}
    return HttpResponse(json.dumps(response), content_type='application/json')

# TEST

In [None]:
import sys, os
# Blank out any proxy / CA-bundle override that is currently set, echoing the
# previous value first so it can be seen in the cell output.
# NOTE(review): presumably done so later downloads bypass the proxy — confirm.
for name in ("http_proxy", "https_proxy", "HTTP_PROXY", "HTTPS_PROXY", "CURL_CA_BUNDLE"):
    previous = os.environ.get(name)
    if previous:
        print(previous)
        os.environ[name] = ''
#sys.path.append("..")


In [4]:
%%writefile "../summarizer.py"

from transformers import pipeline
import os, keybert

"""
Assumes input is as follows:
    text='''
    0:00:00 - 0:00:06 | SPEAKER_01: Yeah, we had a long 
    0:00:06 - 0:00:10 | SPEAKER_01: Morgan wants to make it hard.
    0:00:10 - 0:00:13 | None: The counter is not moving.
    0:00:13 - 0:00:16 | SPEAKER_01: It doesn't.
    0:00:16 - 0:00:18 | SPEAKER_00: I didn't even check yesterday.
    0:00:18 - 0:00:20 | SPEAKER_01: It didn't move
    0:00:20 - 0:00:22 | SPEAKER_01: I don't know if 
    0:00:22 - 0:00:24 | SPEAKER_01: Channel 3?
    '''

Remove timelines and return the result in this format:
{SPEAKER}: {SENTENCES}
"""
def cleanup(text):
    """Strip the timeline prefix from every transcript line.

    Each input line is expected to look like
    ``0:00:00 - 0:00:06 | SPEAKER_01: Yeah, we had a long talk.`` and is
    turned into ``SPEAKER_01: Yeah, we had a long talk.``.

    Parameters
    ----------
    text : str
        Transcript with one utterance per line.

    Returns
    -------
    str
        One ``{SPEAKER}: {SENTENCES}`` line per non-blank input line, joined
        with newlines. Empty input yields an empty string.

    Notes
    -----
    * Only the FIRST ``|`` separates the timeline from the utterance and only
      the FIRST ``:`` after it separates the speaker from the content, so
      utterances such as "Meet at 3:30" are kept whole.
    * Blank lines are skipped.
    * A line without ``|`` (no timeline) or without a speaker label is kept
      as-is (stripped) rather than raising ``IndexError``.
    """
    result = []
    for line in text.strip().split('\n'):
        line = line.strip()
        if not line:
            # Blank separator lines carry no utterance.
            continue

        _timeline, bar, utterance = line.partition('|')
        if not bar:
            # No timeline prefix on this line: nothing to remove.
            result.append(line)
            continue

        utterance = utterance.strip()
        speaker, colon, content = utterance.partition(':')
        if not colon:
            # No speaker label: keep the utterance text unchanged.
            result.append(utterance)
            continue

        result.append(f"{speaker}: {content.strip()}")
    return '\n'.join(result)

#-----------------------------models------------------------------------------------------------------               
# Both models are created once, when this module is imported, and are reused
# by summarizeText() below.
# Summarization pipeline built from the named checkpoint.
# NOTE(review): truncation=True presumably clips over-long inputs to the
# model's maximum input length — confirm against the pipeline docs.
summarizer = pipeline("summarization", "vmarklynn/bart-large-cnn-samsum-acsi-ami-v2", truncation=True)
# KeyBERT keyword extractor using the 'all-mpnet-base-v2' embedding model.
kw_model = keybert.KeyBERT(model='all-mpnet-base-v2')
#-----------------------------------------------------------------------------------------------------               
def summarizeText(transcription, wordCount=1024):
    """Summarize a timeline-annotated transcript and pull out key phrases.

    The transcript is first passed through ``cleanup``; the cleaned text is
    then fed both to the summarization pipeline and to the keyword model.

    Parameters
    ----------
    transcription : str
        Transcript in the format accepted by ``cleanup``.
    wordCount : int, optional
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    dict
        ``summary`` plus ``keywords_list_1`` / ``keywords_list_2`` /
        ``keywords_list_3`` holding up to five phrases of one, two and three
        words respectively. Each phrase list is also printed.
    """
    text = cleanup(transcription)
    ret = {'summary': summarizer(text)[0]['summary_text']}

    # One extraction pass per phrase length (unigrams, bigrams, trigrams).
    for ngram in (1, 2, 3):
        scored_phrases = kw_model.extract_keywords(
            text,
            keyphrase_ngram_range=(ngram, ngram),
            stop_words='english',
            highlight=False,
            top_n=5,
        )
        # Keep only the phrases; the scores are discarded.
        phrases = list(dict(scored_phrases).keys())
        print(phrases)
        ret[f'keywords_list_{ngram}'] = phrases

    return ret


Overwriting ../summarizer.py


In [12]:
# Smoke test: run the packaged summarizer module on a short sample transcript.
# NOTE(review): the alias `summarizer` rebinds the kernel-global `summarizer`
# pipeline created in the service cell above; handlers defined in that cell
# would then see this module instead of the pipeline — consider a distinct alias.
import parrot.summarizer as summarizer

# Sample transcript in the "<start> - <end> | <speaker>: <utterance>" format.
text='''
0:00:00 - 0:00:06 | SPEAKER_01: Yeah, we had a long discussion about how easy we want to make it for people to bleep things out.
0:00:06 - 0:00:10 | SPEAKER_01: Morgan wants to make it hard.
0:00:10 - 0:00:13 | None: The counter is not moving.
0:00:13 - 0:00:16 | SPEAKER_01: It doesn't.
0:00:16 - 0:00:18 | SPEAKER_00: I didn't even check yesterday.
0:00:18 - 0:00:20 | SPEAKER_01: It didn't move yesterday either when I started it.
0:00:20 - 0:00:22 | SPEAKER_01: I don't know if it doesn't look like both.
0:00:22 - 0:00:24 | SPEAKER_01: Channel 3?
'''
# `ret` holds the dict returned by summarizeText; the keyword lists are also
# printed by the call itself.
ret = summarizer.summarizeText(text)
#ct = cleanup(text)
#print(ct)
#summary = summarizer(ct)


2023-08-24 08:38:02,919 sentence_transformers.SentenceTransformer INFO: Load pretrained SentenceTransformer: all-mpnet-base-v2
2023-08-24 08:38:03,824 sentence_transformers.SentenceTransformer INFO: Use pytorch device: cpu
Your max_length is set to 142, but you input_length is only 133. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=66)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
['bleep', 'counter', 'channel', 'speaker_00', 'speaker_01']
['bleep things', 'people bleep', 'counter moving', 'hard counter', 'speaker_01 channel']
['people bleep things', 'counter moving speaker_01', 'make people bleep', 'bleep things speaker_01', 'speaker_01 doesn speaker_00']
