In [1]:
import numpy as np
import openai
import os
from string import Template
import re
import pypandoc

# Location of the file holding the OpenAI API key.  The OPENAI_API_KEY_PATH
# environment variable takes precedence so the notebook is not tied to one
# machine's home directory; the historical path is kept as the fallback.
openai.api_key_path = os.environ.get(
    'OPENAI_API_KEY_PATH',
    '/home/tim/projects/openai/apikey.txt',
)

In [2]:
# Completion models available to this notebook, keyed by a short alias.
# The alias is what callers pass around; the value is the model name.
MODELS = dict(
    something_else='',
    text1="text-davinci-003",
)

In [3]:
# Look at me prompt engineering.

# Prompt asking for a bulleted topic list followed by a per-topic summary.
# The two marker examples show a timestamp line followed by the speaker's
# name; each example's label now matches the name shown in the example.
# $q is the placeholder for the transcript text.
topics = Template("""
I'm going to ask you to summarize a meeting transcript.  The transcript will contain a marker for each speaker's 
utterance, which you can ignore.  

Here's an example of a marker for a speaker named Tim Dolbeare:

0:1:47.940 --> 0:1:48.410
Tim Dolbeare

Here's an example of a marker for a speaker named David Feng:

0:1:26.950 --> 0:1:41.660
David Feng

Please start your summary with a bulleted list of topics, and then a second bulleted list containing a summary of each topic.

Here's the transcript:
    
$q   
""")

# Prompt asking for a plain summary of a transcript in which every utterance
# starts with a blank line followed by the speaker's name.
# $q is the placeholder for the transcript text.
summary = Template("""
I'm going to ask you to summarize a meeting transcript.  In the transcript each speaker's utterance begins with 
two new line characters, and then their name.  

Here's the transcript:
    
$q   
""")

# Prompt that condenses a concatenation of partial summaries from one meeting
# into: a topic list, a one-or-two sentence summary per topic, and a list of
# agreed actions.  $q is the placeholder for the concatenated summaries.
super_sum = Template("""
I'm going to ask you to summarize a collection of meeting summary excerpts. All of the excerpts are 
from the same meeting, and have been concatenated together below.

Please structure your response as--
- a summarized list of important topics discussed
- for each topic, a summary of the discussion in one or two sentences
- Finally, a list of any actions decided upon.

Here's the collection of summaries:
    
$q   
""")

# Prompt that turns a concatenation of partial summaries from one meeting into
# a de-duplicated list of topics (each subject listed once).
# $q is the placeholder for the concatenated summaries.
topic_summary = Template("""
I'm going to ask you to summarize a collection of meeting summary excerpts. All of the excerpts are 
from the same meeting, and have been concatenated together.

Your job is to produce a list of important topics discussed in the meeting.  Please try to remove any 
redundancy from your final list of topics, so that each subject appears only once in the topic list, even if it 
was discussed multiple times.

Here's the collection of summaries:
    
$q   
""")


# Prompt for a free-standing question; it instructs the model to reply
# "Sorry, I don't know" when unsure.  $q is the placeholder for the question.
general_question = Template("""
Answer the question as truthfully as possible, and if you're unsure of the answer, say "Sorry, I don't know".

Q: $q
""")
                   

# Pass-through prompt: the template body is only $q surrounded by newlines,
# so the caller's text is sent without any added instructions.
free = Template("""
$q
""")

# Registry of prompt templates, keyed by the name callers use to select one.
PROMPTS = dict(
    topics=topics,
    summary=summary,
    general=general_question,
    super_sum=super_sum,
    topic_summary=topic_summary,
    free=free,
)

### What Do?
* Convert docx to txt
* Pre-process txt to remove chaff
* chunk and summarize
  * chunk txt into summarizable pieces
  * summarize each chunk
  * combine summaries
  
#### To Do
* hierarchical summarizing for really long transcripts
* estimate cost of summary


In [20]:
def word_to_text(word_file):
    """Convert a Word document to a plain-text file in the same directory.

    The output file has the same base name as ``word_file`` with a ``.txt``
    extension and is written via ``pypandoc.convert_file``.

    Args:
        word_file: Path of the document to convert.

    Returns:
        A ``(text_path, stem)`` tuple: the path of the written text file and
        the input's base name without its extension.
    """
    directory, filename = os.path.split(word_file)
    stem = os.path.splitext(filename)[0]
    text_path = os.path.join(directory, stem + '.txt')

    pypandoc.convert_file(word_file, 'plain', outputfile=text_path)
    return text_path, stem

In [5]:
def load_transcript(file_name, encoding='utf-8'):
    """Read a transcript text file and return its full contents.

    Args:
        file_name: Path of the text file to read.
        encoding: Text encoding used to decode the file.  Defaults to UTF-8
            (the encoding pandoc writes) rather than the platform locale, so
            the result does not depend on the machine's locale settings.

    Returns:
        The file contents as a single string.
    """
    with open(file_name, encoding=encoding) as f:
        return f.read()

In [6]:
def preprocess_transcript(transcript):
    """Strip timestamp marker lines from a transcript.

    A marker line looks like ``0:1:47.940 --> 0:1:48.410`` followed by a
    newline.  The whole line, including its newline, is removed; everything
    else is left untouched.

    Args:
        transcript: Raw transcript text.

    Returns:
        The transcript with all timestamp marker lines removed.
    """
    # Raw string: "\." in a regular string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    timestamp = r"[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{1,3}"
    marker_line = timestamp + r" --> " + timestamp + r"\n"
    return re.sub(marker_line, '', transcript)

In [7]:
def chunk_text(text, chunk_size=200):
    """Split a transcript into consecutive groups of utterances.

    Utterances are the pieces of ``text`` separated by a blank line
    (``'\\n\\n'``).  Every chunk holds ``chunk_size`` utterances except
    possibly the last, which holds the remainder.  No empty chunk is
    produced when the utterance count is an exact multiple of ``chunk_size``.

    Args:
        text: Transcript text.
        chunk_size: Maximum number of utterances per chunk; must be >= 1.

    Returns:
        A list of chunks, each chunk being a list of utterance strings.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    utterances = text.split('\n\n')
    # Stepping by chunk_size gives ceil(len/chunk_size) chunks; the previous
    # "len // chunk_size + 1" count appended an empty chunk on exact multiples.
    return [
        utterances[start:start + chunk_size]
        for start in range(0, len(utterances), chunk_size)
    ]

In [8]:
# Fill the selected prompt template with `q` and return the model's completion.
# The key parameter that we're varying in this notebook is translation_type, which
# selects one of the prompts designed above.

def submit_prompt(q, translation_type, completion_model='text1', temp=0, max_tokens=900):
    """Send a filled-in prompt to the completion API and return the reply text.

    Args:
        q: Value substituted for ``$q`` in the chosen template.
        translation_type: Key into ``PROMPTS`` selecting the template.
        completion_model: Key into ``MODELS`` selecting the model name.
        temp: Sampling temperature passed to the API.
        max_tokens: Completion length limit passed to the API.

    Returns:
        The first choice's text, stripped of surrounding whitespace and of a
        leading ``A:`` label, wrapped in a leading and a trailing newline.

    Raises:
        KeyError: If ``translation_type`` or ``completion_model`` is unknown.
    """
    prompt = PROMPTS[translation_type]

    response = openai.Completion.create(
        prompt=prompt.substitute({'q': q}),
        temperature=temp,
        max_tokens=max_tokens,
        model=MODELS[completion_model]
    )
    r = response["choices"][0]["text"].strip(" \n")

    # Remove only a leading "A:" answer label.  A blanket replace would also
    # delete "A:" inside the text itself (e.g. "Q&A:" or "FAA:").
    if r.startswith('A:'):
        r = r[len('A:'):]
    r = r.strip()

    return f"\n{r}\n"

In [9]:
def summarize_chunks(chunks, prompt='summary', max_tokens=900):
    """Summarize each chunk separately and concatenate the results.

    Args:
        chunks: Iterable of chunks.  A chunk is either a string or a sequence
            of utterance strings (the shape ``chunk_text`` produces).
        prompt: Key into ``PROMPTS`` selecting the template used per chunk.
        max_tokens: Completion length limit for each request.

    Returns:
        The per-chunk summaries concatenated in order as one string.
    """
    summaries = []
    for chunk in chunks:
        # The template stringifies its argument, so passing a list would put
        # the Python list repr (brackets, quotes, escaped "\n") in the prompt.
        # Re-join the utterances with the blank-line separator instead.
        text = chunk if isinstance(chunk, str) else '\n\n'.join(chunk)
        if not text.strip():
            # Nothing to summarize; avoid a pointless request.
            continue
        summaries.append(submit_prompt(text, prompt, max_tokens=max_tokens))

    return ''.join(summaries)

In [29]:
def create_summary(word_file, chunk_size=100, max_tokens=500):
    """Run the full pipeline: convert, clean, chunk, summarize, then combine.

    Args:
        word_file: Path of the Word transcript to summarize.
        chunk_size: Number of utterances per chunk.
        max_tokens: Completion length limit for each per-chunk summary.

    Returns:
        A dict with keys ``"steps"`` (the concatenated per-chunk summaries),
        ``"summary"`` (the combined summary built from those), and
        ``"title"`` (the input file's base name without extension).
    """
    txt_path, title = word_to_text(word_file)
    cleaned = preprocess_transcript(load_transcript(txt_path))
    pieces = chunk_text(cleaned, chunk_size=chunk_size)

    # First pass: one summary per chunk.  Second pass: summarize the summaries.
    partial_summaries = summarize_chunks(pieces, prompt='summary', max_tokens=max_tokens)
    overall = submit_prompt(partial_summaries, 'super_sum', temp=0, max_tokens=1000)

    return dict(steps=partial_summaries, summary=overall, title=title)

In [32]:
# Summarize the transcript at the path below, then print its title and summary.
s = create_summary('/home/tim/work/projects/transcripts/CodeOcean&AWSOPS2023-02-15.docx')
print(f"\nMeeting: {s['title']}\n", s['summary'], sep='\n')


Meeting: CodeOcean&AWSOPS2023-02-15


Important Topics Discussed:
- Features of Code Ocean
- Use of language models to create summaries
- Use of Code Ocean for AIBS scientists
- Hosting Python or R
- Advantages of using Code Ocean
- Cost of using Code Ocean
- Production workflows
- Criteria for determining when to use Code Ocean
- Analysis or position paper to make an informed decision
- Connecting with an AWS solution architect
- Presentation from June of last year by David
- Risk of Code Ocean going under

Summaries of Discussions:
- Shoaib Mufti and Tim Dolbeare discussed the potential of using Code Ocean to manage data science projects, including the features of Code Ocean, such as the ability to create capsules that encapsulate the compute environment, the use of S3 as storage, and the no-code workflow system. 
- Tim Dolbeare discussed how Code Ocean can be used to host Python or R, and how administrators can define what machines are available to users. 
- Rob and Tim discussed t

In [36]:
# Summarize the transcript at the path below, then print its title and summary.
s = create_summary('/home/tim/work/projects/transcripts/AWS_LZ_Fun_Time_2023-02-10.docx')
print(f"\nMeeting: {s['title']}\n", s['summary'], sep='\n')


Meeting: AWS_LZ_Fun_Time_2023-02-10


Summary of Important Topics Discussed:
- Need to create a transcript in the meeting
- Use cases for identity management
- Possibility of using AWS Identity Center and Cognito to manage user groups
- Need to be able to flexibly bring people in or out of a service on top of AWS
- Possibility of using attributes to provide different roles for different projects
- Need to be able to share with collaborators and partition access to resources within a cloud
- Budget in the app and how to access the bucket
- Credentials needed to access plain vanilla S3 buckets
- Solutions that would support B2C tenant and open ID connect
- Challenges of managing user access to different applications
- Possibility of using an identity provider to manage access
- Possibility of federating with larger institutions to manage access
- Possibility of using Azure AD as an abstraction layer to integrate with different applications
- Need for a single identity provider that coul

In [34]:
# Summarize the transcript at the path below, then print its title and summary.
s = create_summary('/home/tim/work/projects/transcripts/Teams-enabledTapeBackupandDataWarehouseDeepDive_2023-02-17.docx')
print(f"\nMeeting: {s['title']}\n", s['summary'], sep='\n')


Meeting: Teams-enabledTapeBackupandDataWarehouseDeepDive _2023-02-17


Important topics discussed:
- Legacy web applications of the Allen Institute
- Containerizing the spinal cord Atlas in a Kubernetes
- Moving the image service to object storage
- Backup and archive system
- Software development project from 2000
- Upgrading the Postgres 9.4 version migration
- Apache configuration
- Two generations of the image service
- Source code for the old one
- Cloud Optimized Geotiff format
- FSX for Lustre
- Data ingestion
- Tape backup system
- Redirecting what is currently going to tape into glacier
- Retrieving content from glacier
- Digital Asset Tracking Service
- Image viewer tool
- Neural clancer
- 3D Slicer image
- Data governance
- Disaster recovery
- Backup
- Deleting data sets
- Next steps for dealing with LIMS

Summary of topics discussed:
- Rob Young and Tim Dolbeare discussed the legacy web applications of the Allen Institute, which have not been migrated to the data warehouse