In [1]:
import numpy as np
import openai
from string import Template
import re
import pypandoc

# Tell the openai module which file holds the API key, so the key itself is not
# embedded in the notebook.
# NOTE(review): this is an absolute path on the author's machine; adjust before running elsewhere.
openai.api_key_path = '/home/tim/projects/openai/apikey.txt'

In [2]:
# Which completion model to use?  

# Short alias -> model identifier; submit_prompt looks up its
# `completion_model` argument in this table.
MODELS = dict(
    something_else='',
    text1="text-davinci-003",
)

In [46]:
# Look at me prompt engineering.

# Prompt: ask for a bulleted list of topics followed by a bulleted per-topic summary.
# It tells the model that speaker markers can be ignored and shows two sample
# markers (a timestamp range followed by the speaker's name).
# `$q` is replaced with the transcript text.
topics = Template("""
I'm going to ask you to summarize a meeting transcript.  The transcript will contain a marker for each speaker's 
utterance, which you can ignore.  

Here's an example of a marker for a speaker named Tim Dolbeare:

0:1:47.940 --> 0:1:48.410
Tim Dolbeare

Here's an example of a marker for a speaker named David Feng:

0:1:26.950 --> 0:1:41.660
David Feng

Please start your summary with a bulleted list of topics, and then a second bulleted list containing a summary of each topic.

Here's the transcript:
    
$q   
""")

# Prompt: plain summary of a transcript (or transcript chunk).
# `$q` is replaced with the text to summarize.
summary = Template("""
I'm going to ask you to summarize a meeting transcript.  In the transcript each speaker's utterance begins with 
two new line characters, and then their name.  

Here's the transcript:
    
$q   
""")

# Prompt: second-pass summary over concatenated per-chunk summaries, asking for
# topics, a short discussion summary per topic, and a list of actions.
# `$q` is replaced with the concatenated summaries.
super_sum = Template("""
I'm going to ask you to summarize a collection of meeting summary excerpts. All of the excerpts are 
from the same meeting, and have been concatenated together below.

Please structure your response as--
- a summarized list of important topics discussed
- for each topic, a summary of the discussion in one or two sentences
- Finally, a list of any actions decided upon.

Here's the collection of summaries:
    
$q   
""")

# Prompt: reduce concatenated per-chunk summaries to a de-duplicated list of topics.
# `$q` is replaced with the concatenated summaries.
topic_summary = Template("""
I'm going to ask you to summarize a collection of meeting summary excerpts. All of the excerpts are 
from the same meeting, and have been concatenated together.

Your job is to produce a list of important topics discussed in the meeting.  Please try to remove any 
redundancy from your final list of topics, so that each subject appears only once in the topic list, even if it 
was discussed multiple times.

Here's the collection of summaries:
    
$q   
""")


# Prompt: general question answering with an explicit "I don't know" escape hatch.
# `$q` is replaced with the question.
general_question = Template("""
Answer the question as truthfully as possible, and if you're unsure of the answer, say "Sorry, I don't know".

Q: $q
""")
                   

# Pass-through prompt: the submitted text is sent with no instructions added,
# wrapped only in a leading and trailing newline.
free = Template("""
$q
""")

# Registry of prompt templates, keyed by the name that submit_prompt receives
# as `translation_type`.
PROMPTS = dict(
    topics=topics,
    summary=summary,
    general=general_question,
    super_sum=super_sum,
    topic_summary=topic_summary,
    free=free,
)

### What Do?
* Convert docx to txt
* Pre-process txt to remove chaff
* chunk and summarize
  * chunk txt into summarizable pieces
  * summarize each chunk
  * combine summaries


In [4]:
def word_to_text(word_file, text_file_name):
    """Convert a document to a plain-text file with pandoc.

    word_file: path of the input document passed to ``pypandoc.convert_file``.
    text_file_name: path given to pandoc as ``outputfile``.

    Returns ``text_file_name`` unchanged, so the result can be passed straight
    to a loader.
    """
    target_format = 'plain'
    pypandoc.convert_file(word_file, target_format, outputfile=text_file_name)
    return text_file_name

In [5]:
def load_transcript(file_name):
    """Read a transcript text file and return its full contents as one string.

    The file is decoded as UTF-8 explicitly (pandoc writes UTF-8) instead of
    relying on the platform's default encoding, so non-ASCII characters such
    as names or typographic quotes are read the same way on every machine.

    file_name: path of the text file to read.
    Raises OSError if the file cannot be opened.
    """
    with open(file_name, encoding='utf-8') as f:
        return f.read()

In [6]:
def preprocess_transcript(transcript):
    """Remove the timestamp line of every speaker marker from a transcript.

    A timestamp line looks like ``0:1:47.940 --> 0:1:48.410`` followed by a
    newline. Only lines that end with a newline are removed; the speaker name
    and the utterance text are left untouched.

    transcript: the raw transcript text.
    Returns the transcript with all timestamp lines deleted.
    """
    # Raw strings keep ``\.`` as a regex escape; in a normal string literal it
    # is an invalid escape sequence that newer Python versions warn about.
    timestamp = r"[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{1,3}"
    marker_line = timestamp + " --> " + timestamp + r"\n"
    return re.sub(marker_line, '', transcript)

In [26]:
def chunk_text(text, chunk_size=200):
    """Split a transcript into chunks of at most ``chunk_size`` utterances.

    Utterances are the pieces of ``text`` separated by a blank line
    (``'\\n\\n'``). Each chunk is a list of utterance strings.

    text: the transcript text.
    chunk_size: maximum number of utterances per chunk; must be positive.

    Returns a list of chunks. No empty trailing chunk is produced when the
    number of utterances is an exact multiple of ``chunk_size``.
    Raises ValueError if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    utterances = text.split('\n\n')
    # Stepping by chunk_size visits exactly the start index of every
    # non-empty chunk, so no separate chunk-count arithmetic is needed.
    return [
        utterances[start:start + chunk_size]
        for start in range(0, len(utterances), chunk_size)
    ]

In [12]:
# Given a prompt name and the text to insert into it, return the model's completion.
# The key parameter that we're varying in this notebook is translation_type, which
# selects one of the prompts designed above.

def submit_prompt(q, translation_type, completion_model='text1', temp=0, max_tokens=900):
    """Fill a prompt template with ``q`` and return the model's completion.

    q: the text to insert into the template. A list or tuple of utterances
        (as produced by the utterance-based chunk_text) is re-joined with
        blank lines first, so the model sees transcript text rather than a
        Python list repr.
    translation_type: key into PROMPTS selecting the template.
    completion_model: key into MODELS selecting the model.
    temp: sampling temperature passed to the API.
    max_tokens: completion length limit passed to the API.

    Returns the completion text, stripped, wrapped in a leading and trailing
    newline. Raises KeyError for an unknown prompt or model key.
    """
    if isinstance(q, (list, tuple)):
        q = '\n\n'.join(str(utterance) for utterance in q)

    prompt = PROMPTS[translation_type]

    response = openai.Completion.create(
        prompt=prompt.substitute({'q': q}),
        temperature=temp,
        max_tokens=max_tokens,
        model=MODELS[completion_model]
    )
    r = response["choices"][0]["text"].strip()

    # Drop only a leading "A:" answer label. Removing every "A:" would also
    # corrupt ordinary text such as "NASA:".
    if r.startswith('A:'):
        r = r[2:].strip()

    return f"\n{r}\n"

In [9]:
def summarize_chunks(chunks, prompt='summary', max_tokens=900):
    """Submit every chunk to the model and concatenate the responses.

    chunks: an iterable of chunks. A bare string is treated as one chunk;
        iterating it directly would submit one request per character.
    prompt: key of the prompt template handed to submit_prompt.
    max_tokens: completion length limit handed to submit_prompt.

    Returns the responses joined in order with no extra separator
    (an empty string when there are no chunks).
    """
    if isinstance(chunks, str):
        chunks = [chunks]

    return ''.join(
        submit_prompt(chunk, prompt, max_tokens=max_tokens) for chunk in chunks
    )

In [16]:
# Inspect the timestamp-stripped transcript text.
# NOTE(review): in the visible notebook `preprocessed` is first assigned in a later cell,
# so on a fresh top-to-bottom run this cell depends on state that does not exist yet.
preprocessed

"Tim Dolbeare\nMind if I?\n\nRob Young\nOoh.\n\nTim Dolbeare\nCreate a transcript in the meeting. Ohh you're already creating.\n\nTim Dolbeare\nThat's great.\n\nDavid Feng\nGuys I didn't.\n\nRob Young\nApparently on what's up, how are you?\n\nDavid Feng\nWord just today offered to summarize a document and put it in executive\nsummary at the top of it. Like wow, that's crazy. Anyway, sorry.\nContinue.\n\nRob Young\nYeah.\n\nRob Young\nWe'll all be out of a job soon enough.\n\nRob Young\nWe can only hope.\n\nRob Young\nOh, look, it's transcribing me right now.\n\nShane Vance\nNice.\n\nRob Young\nThat is, if you guys.\n\nTim Dolbeare\nI I have an ulterior motive for that, Rob. I'm, I'm going to download\nthat and.\n\nTim Dolbeare\nPut it through a.\n\nTim Dolbeare\nGPT 3 model and try to make notes out of it.\n\nRob Young\nDid you did you read about Dan?\n\nTim Dolbeare\nYeah.\n\nRob Young\nThe the alter ego for Chad GPT and then where someone is on slash dot\nwhere they basically gave in

In [37]:
# Run the pipeline on the 2023-02-10 transcript:
# load the text file, strip timestamps, chunk, and summarize each chunk.
infile = '/home/tim/work/projects/transcripts/AWS_LZ_Fun_Time_2023-02-10.docx'
outfile = '/home/tim/work/projects/transcripts/AWS_LZ_Fun_Time_2023-02-10.txt'
# Stray note kept as a comment (as a bare statement it is a syntax error):
# CodeOceanAndAWSOPS_2023-02-15

# The .docx -> .txt conversion is disabled here; the existing text file is read directly.
#transcript_file = word_to_text(infile, outfile)

transcript_file = '/home/tim/work/projects/transcripts/AWS_LZ_Fun_Time_2023-02-10.txt'

raw_transcript = load_transcript(transcript_file)
preprocessed = preprocess_transcript(raw_transcript)
chunks = chunk_text(preprocessed, chunk_size=100)

#len(chunks)

sub_sums = summarize_chunks(chunks, prompt='summary', max_tokens=900)
print(sub_sums)

"\nSummary: David Feng is looking for a way to use AWS Identity Center to push user groups to a third party service, Code Ocean, which allows scientists to do analysis on data in the cloud. Rob Young suggested that Cognito might be an option to enable the pushing of user groups, and Tim Dolbeare mentioned that he needs to be able to share with collaborators and partition access to resources within the cloud. Aiken David suggested that access to Code Ocean doesn't necessarily mean giving access to an S3 bucket, and all access is controlled by the app.\n\nIn this meeting, the participants discussed the budget in the app, the ability to access plain vanilla S3 buckets, the need for an authenticator, and the need for an identity provider. They also discussed the issue of David's screen not being visible and the need to reset it.\n\nDavid Feng and Rob Young discussed the challenge of controlling access to their apps and data. They discussed the possibility of using Azure AD to manage access

In [41]:
# Condense the concatenated per-chunk summaries into one final summary.
# `sub_sums` is a single string and summarize_chunks iterates over its argument,
# so wrap it in a list; the bare string would be submitted one character at a time.
final = summarize_chunks([sub_sums], prompt='super_sum', max_tokens=900)
print(final)

KeyboardInterrupt: 

In [40]:
# Show the concatenated per-chunk summaries.
print(sub_sums)


Summary: David Feng is looking for a way to use AWS Identity Center to push user groups to a third party service, Code Ocean, which allows scientists to do analysis on data in the cloud. Rob Young suggested that Cognito might be an option to enable the pushing of user groups, and Tim Dolbeare mentioned that he needs to be able to share with collaborators and partition access to resources within the cloud. Aiken David suggested that access to Code Ocean doesn't necessarily mean giving access to an S3 bucket, and all access is controlled by the app.

In this meeting, the participants discussed the budget in the app, the ability to access plain vanilla S3 buckets, the need for an authenticator, and the need for an identity provider. They also discussed the issue of David's screen not being visible and the need to reset it.

David Feng and Rob Young discussed the challenge of controlling access to their apps and data. They discussed the possibility of using Azure AD to manage access, as w

In [36]:
# Summarize only the chunk at index 1, to inspect the output for a single chunk.
mm = summarize_chunks([chunks[1]], prompt='summary', max_tokens=900)
print(mm)


In this meeting, the participants discussed the budget in the app, the ability to access plain vanilla S3 buckets, the need for an authenticator, and the need for an identity provider. They also discussed the issue of David's screen being blacked out and the need to reset it.



In [42]:
# Second pass: send the concatenated chunk summaries through the 'super_sum' prompt
# in a single request.
sum_sum = submit_prompt(sub_sums, 'super_sum', temp=0, max_tokens=1000)
print(sum_sum)


Important Topics Discussed:
- Using AWS Identity Center to push user groups to a third party service, Code Ocean
- Using Cognito to enable the pushing of user groups
- Access to Code Ocean and S3 buckets
- Using Azure AD to manage access
- Using Orchid as an identity provider
- Using Azure AD B2C as a lightweight solution
- Using Azure AD for Allen Institute employees
- Using an API to manage access to data stored in an S3 bucket
- Automating the connection between users and projects managed in a database
- Assigning resources or access to S3 buckets
- Using AWS Transfer Family

Topic Summaries:
- David Feng and Rob Young discussed the challenge of controlling access to their apps and data, and the possibility of using Azure AD, Orchid, and federating with larger institutions.
- They discussed the need for a single identity provider that could collaborate across different identity providers, such as Azure AD, COGNITO, and Octa, and the possibility of using Azure AD B2C.
- They discuss

In [47]:
# Run the full pipeline on the 2023-02-15 transcript: convert the .docx to text,
# load it, strip timestamps, chunk it, and summarize each chunk.
infile = '/home/tim/work/projects/transcripts/CodeOceanAndAWSOPS_2023-02-15.docx'
outfile = '/home/tim/work/projects/transcripts/CodeOceanAndAWSOPS_2023-02-15.txt'

# word_to_text returns the output path, which is then loaded.
transcript_file = word_to_text(infile, outfile)

raw_transcript = load_transcript(transcript_file)
preprocessed = preprocess_transcript(raw_transcript)
chunks = chunk_text(preprocessed, chunk_size=100)

print(f"There are {len(chunks)} chunks in this transcript")

# One model request per chunk; the responses are concatenated into one string.
sub_sums = summarize_chunks(chunks, prompt='summary', max_tokens=500)
print("Sub-summaries:")
print(sub_sums)

There are 5 chunks in this transcript
Sub-summaries:

Shoaib Mufti and Tim Dolbeare discussed the possibility of recording staff meetings and using language models to create summaries. They discussed the use of Code Ocean, a platform similar to Jupiter Labs, which allows data scientists to create capsules that encapsulate the compute environment and data. They also discussed the use of S3 as storage and a no-code workflow system. Tyler did not join the meeting.

Tim Dolbeare discussed how Code Ocean can be used to host Python or R, and how administrators can define what machines are available to users. He also mentioned that only one scientist has used the drag and drop feature, and that administrators can set up permissions boundaries. Shoaib Mufti asked questions about the advantages of using Code Ocean, and Michael Wang and Rob Young discussed the cost of using Code Ocean and the need for good engineering practices.

Rob and Tim discuss the use of Code Ocean for production workflows

In [48]:
# Second pass: send the concatenated chunk summaries through the 'super_sum' prompt
# in a single request.
sum_sum = submit_prompt(sub_sums, 'super_sum', temp=0, max_tokens=1000)
print(sum_sum)


Important topics discussed:
- Use of Code Ocean for production workflows
- Advantages and disadvantages of using Code Ocean
- Criteria for determining when to use Code Ocean
- Risk of Code Ocean going under
- Presentation from June of last year by David

Summary of discussion:
- Shoaib Mufti and Tim Dolbeare discussed the possibility of using Code Ocean, a platform similar to Jupiter Labs, for production workflows in AWS. 
- They discussed the pros and cons of using Code Ocean, such as visualizations, search features, and the ability to orchestrate and schedule resources. 
- They also discussed the need to do an analysis or write a position paper to make an informed decision, and the risk of Code Ocean going under as a company. 
- Tyler Mollenkopf provided details about the presentation from June of last year by David, which included a visual of the workflow and ratings of the six jobs that the platform wanted to accomplish.

Actions decided upon:
- Test the theory by running a workfl

In [98]:
# Ask the model to strip unhelpful content from `mm`, using the pass-through
# 'free' prompt so that `p` is sent without any additional instructions.
p = "Please re-write the following text, removing anything that doesn't seem useful for summarization: " + mm
print(f"prompt: {p}")

q = submit_prompt(p, 'free', temp=0, max_tokens=1000)

print(f"Response: {q}")

prompt: Please e-write the following text, removing anything taht doesn't seem usefull for summarization: 
avid
And then you've got the UW data scientists, which is a different
identity provider.

Aiken, David
And then you've got the AWS OPS, which is a different identity provider.

Aiken, David
So the question is, can we use a single identity provider to manage
access to all of these different applications? And if so, what identity
provider should we use?

In this meeting, the participants discussed the need for a single identity provider to manage access to multiple applications. They discussed the possibility of using Azure AD, Orchid, or AWS Cognito as the identity provider. They also discussed the need to ensure that access is revoked when users leave the organization. They discussed the possibility of using Azure AD to manage access and roles, and the possibility of using AWS to manage access. No action was decided upon.

Response: 
In this meeting, the participants discussed the

In [68]:
# Send the concatenated chunk summaries through the 'topics' prompt in one request.
ss = submit_prompt(sub_sums, 'topics', max_tokens=2000)
print(ss)


Actions Decided Upon:
- Have a huddle with Ashmeet and Mike to brainstorm ideas and come back with proposals or designs.



In [69]:
# Chunk the model's response with the default chunk size and count the chunks.
cc = chunk_text(ss)
len(cc)

1

In [70]:
# Show the concatenated per-chunk summaries.
print(sub_sums)


Topics discussed:
- Using GPT 3 models to create notes
- Using AWS Identity Center to manage users and user groups
- Code Ocean requirement for identity management
- Using Cognito in conjunction with Identity Center
- Using GWT Team Bearer for identity management
- Accessing S3 buckets through Code Ocean

Summary:
The group discussed using GPT 3 models to create notes, using AWS Identity Center to manage users and user groups, and the Code Ocean requirement for identity management. They discussed using Cognito in conjunction with Identity Center, using GWT Team Bearer for identity management, and accessing S3 buckets through Code Ocean.

Actions Decided Upon:
- Use GPT 3 models to create notes
- Use AWS Identity Center to manage users and user groups
- Use Cognito in conjunction with Identity Center
- Use GWT Team Bearer for identity management
- Access S3 buckets through Code Ocean

avid
And then you've got the UW data scientists, which is a different
identity provider.

Aiken, David

In [4]:


# Example file:
# Convert the Word transcript to plain text, writing the result to `outfile`.
infile = '/home/tim/work/projects/transcripts/AWS_LZ_Fun_Time_2023-02-10.docx'
outfile = '/home/tim/work/projects/transcripts/AWS_LZ_Fun_Time_2023-02-10.txt'
output = pypandoc.convert_file(infile, 'plain', outputfile=outfile)
# Check that convert_file returned an empty string.
assert output == ""

In [6]:
# Given a prompt name and the text to insert into it, return the model's completion.
# The key parameter that we're varying in this notebook is translation_type, which
# selects one of the prompts designed above.
# NOTE(review): this redefines submit_prompt from an earlier cell; this version defaults
# to max_tokens=300 and returns the text without surrounding newlines.

def submit_prompt(q, translation_type, completion_model='text1', temp=0, max_tokens=300):
    """Fill a prompt template with ``q`` and return the model's completion.

    q: the text to insert into the template. A list or tuple of utterances is
        re-joined with blank lines first, so the model sees transcript text
        rather than a Python list repr.
    translation_type: key into PROMPTS selecting the template.
    completion_model: key into MODELS selecting the model.
    temp: sampling temperature passed to the API.
    max_tokens: completion length limit passed to the API.

    Returns the stripped completion text. Raises KeyError for an unknown
    prompt or model key.
    """
    if isinstance(q, (list, tuple)):
        q = '\n\n'.join(str(utterance) for utterance in q)

    prompt = PROMPTS[translation_type]

    response = openai.Completion.create(
        prompt=prompt.substitute({'q': q}),
        temperature=temp,
        max_tokens=max_tokens,
        model=MODELS[completion_model]
    )
    r = response["choices"][0]["text"].strip()

    # Drop only a leading "A:" answer label. Removing every "A:" would also
    # corrupt ordinary text such as "NASA:".
    if r.startswith('A:'):
        r = r[2:].strip()

    return r

In [7]:
def preprocess_transcript(transcript):
    """Remove the timestamp line of every speaker marker from a transcript.

    A timestamp line looks like ``0:1:47.940 --> 0:1:48.410`` followed by a
    newline. Only lines that end with a newline are removed; the speaker name
    and the utterance text are left untouched.

    transcript: the raw transcript text.
    Returns the transcript with all timestamp lines deleted.
    """
    # Raw strings keep ``\.`` as a regex escape; in a normal string literal it
    # is an invalid escape sequence that newer Python versions warn about.
    timestamp = r"[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{1,3}"
    marker_line = timestamp + " --> " + timestamp + r"\n"
    return re.sub(marker_line, '', transcript)

In [8]:
def load_transcript(file_name):
    """Read a transcript text file and return its full contents as one string.

    The file is decoded as UTF-8 explicitly (pandoc writes UTF-8) instead of
    relying on the platform's default encoding.

    file_name: path of the text file to read.
    Raises OSError if the file cannot be opened.
    """
    with open(file_name, encoding='utf-8') as f:
        return f.read()
        

In [11]:
# Report size statistics for the transcript before and after timestamp removal.
transcript_txt = '/home/tim/work/projects/transcripts/AWS_LZ_Fun_Time_2023-02-10.txt'
part1 = load_transcript(transcript_txt)
print(f"number of chars in transcript: {len(part1)}")
l = part1.count('\n')  # number of newline characters in the raw text
print(f"Number of lines in transcript: {l}")
#part1_1 = part1[:3600]
# From here on `part1` holds the timestamp-stripped text, not the raw file contents.
part1 = preprocess_transcript(part1)
print(f"number of chars after removing timestamps: {len(part1)}")
l = part1.count('\n')  # newline count after stripping
print(f"number of lines after removing timestamps: {l}")
word_count = len(part1.split())  # whitespace-separated tokens
print(f"Number of words after preprocessing: {word_count}")



number of chars in transcript: 72076
Number of lines in transcript: 3023
number of chars after removing timestamps: 54298
number of lines after removing timestamps: 2366
Number of words after preprocessing: 10004


In [32]:
# Split the preprocessed transcript into fixed-size character chunks.
chunk_size = 10000
total = len(part1)
# Ceiling division: `total//chunk_size + 1` would add an empty trailing chunk
# whenever the length is an exact multiple of chunk_size.
chunk_count = -(-total // chunk_size)

chunks = []
for i in range(chunk_count):
    start = i*chunk_size
    end = min(start+chunk_size, total)
    print((start, end))
    chunks.append(part1[start:end])

len(chunks)

(0, 10000)
(10000, 20000)
(20000, 30000)
(30000, 40000)
(40000, 50000)
(50000, 54298)


6

In [33]:
def summarize_chunks(chunks, prompt='summary', max_tokens=900):
    """Submit every chunk to the model and concatenate the responses.

    chunks: an iterable of chunks. A bare string is treated as one chunk;
        iterating it directly would submit one request per character.
    prompt: key of the prompt template handed to submit_prompt.
    max_tokens: completion length limit handed to submit_prompt.

    Returns the responses joined in order with no extra separator
    (an empty string when there are no chunks).
    """
    if isinstance(chunks, str):
        chunks = [chunks]

    return ''.join(
        submit_prompt(chunk, prompt, max_tokens=max_tokens) for chunk in chunks
    )

In [35]:
def chunk_text(text, chunk_size=10000):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    text: the string to split.
    chunk_size: maximum number of characters per chunk; must be positive.

    Returns the list of chunks in order; joining them reproduces ``text``.
    No empty trailing chunk is produced when ``len(text)`` is an exact
    multiple of ``chunk_size``, and empty text yields an empty list.
    Raises ValueError if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Stepping by chunk_size visits exactly the start offset of every
    # non-empty chunk.
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]

In [34]:
# First pass: summarize every chunk with the default 'summary' prompt and
# show the concatenated responses.
r = summarize_chunks(chunks, max_tokens=900)
print(r)

Summary:
- Discussion of creating a transcript of the meeting
- Discussion of using GPT 3 model to make notes
- Discussion of using AWS Identity Center to manage users and user groups
- Discussion of using Cognito in conjunction with AWS Identity Center
- Discussion of using GWT Team Bearer for identity management
- Discussion of using Octa for identity management
- Discussion of access to code Ocean and S3 buckets
- Discussion of merging Rob's platform with code Ocean

Actions:
- Download transcript and put it through GPT 3 model
- Pull the trigger on setting up Octave
- Chisel together a diagramSummary:
- Discussed different identity providers for accessing apps, including Azure AD, Orchid, and AWS
- Discussed the need to control access to apps, and the need to be able to revoke access when someone leaves an organization
- Discussed the possibility of using Azure AD as an identity provider for the platform services maintained by Rob's team
- Discussed the possibility of using Orchid 

In [37]:
# Re-chunk the concatenated first-pass summaries with the default chunk size
# and count the resulting chunks.
t = chunk_text(r)
len(t)

1

In [41]:
# Inspect the first chunk of the combined summaries.
t[0]

"Summary:\n- Discussion of creating a transcript of the meeting\n- Discussion of using GPT 3 model to make notes\n- Discussion of using AWS Identity Center to manage users and user groups\n- Discussion of using Cognito in conjunction with AWS Identity Center\n- Discussion of using GWT Team Bearer for identity management\n- Discussion of using Octa for identity management\n- Discussion of access to code Ocean and S3 buckets\n- Discussion of merging Rob's platform with code Ocean\n\nActions:\n- Download transcript and put it through GPT 3 model\n- Pull the trigger on setting up Octave\n- Chisel together a diagramSummary:\n- Discussed different identity providers for accessing apps, including Azure AD, Orchid, and AWS\n- Discussed the need to control access to apps, and the need to be able to revoke access when someone leaves an organization\n- Discussed the possibility of using Azure AD as an identity provider for the platform services maintained by Rob's team\n- Discussed the possibilit

In [38]:
# Second pass: summarize the re-chunked first-pass summaries with the default 'summary' prompt.
f = summarize_chunks(t, max_tokens=900)
print(f)

- Research Nasuni and AWS Transfer Family to see if they can be used to automate the process of keeping data in sync


In [42]:
# Send the first chunk of the combined summaries through the 'topics' prompt.
# Note that this reassigns `r`, which previously held the first-pass summaries.
r = submit_prompt(t[0], 'topics', max_tokens=900)
print(r)

Summary:
- Discussed ways to manage access to Allen Institute data
- Discussed using Azure AD, Cognito, and Octa as identity providers
- Discussed using AWS Identity Center to manage groups and access to resources
- Discussed the need to control access to apps and the ability to revoke access when someone leaves an organization
- Discussed automating the process of keeping data in sync
- Discussed providing an abstraction layer to make it easier for scientists to use
- Discussed the service Nasuni and AWS Transfer Family

Actions:
- Download transcript and put it through GPT 3 model
- Pull the trigger on setting up Octave
- Chisel together a diagram
- Do due diligence around looking at Cognito and Octa
- Consider using AWS Identity Center to manage groups and access to resources
- Rob Young to do research on pushing groups to Code Ocean
- David Feng to confirm that Azure AD B2C plays nice with Code Ocean
- AWS - Ashmeet Pahwa to look into the use case and do research on it
- Huddle wit

In [None]:
# Print the whole preprocessed transcript text.
print(part1)

In [None]:
import re
# Scratch check of the timestamp pattern (without the trailing newline) on a short excerpt.
# `part1_1` is only defined in a commented-out line earlier in the notebook, so it is
# rebuilt here from that definition to keep this cell runnable.
part1_1 = part1[:3600]
# Raw string: `\.` must reach the regex engine as an escaped dot.
re.sub(r"[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{1,3} --> [0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{1,3}", '', part1_1)

In [None]:

# Concatenate the chunks at indices 2 and 3 and send them through the 'topics' prompt in one request.
r = submit_prompt(chunks[2]+chunks[3], 'topics', max_tokens=2000)
print(r)

In [None]:
# Print a hard-coded text so that its newlines render.
# NOTE(review): presumably a model response pasted in from an earlier run — confirm.
print('Topics:\n- User Management\n- Configuration Steps\n- Autoscaling\n- HPC Setup\n- Workflow Management\n\nSummary:\n- User Management: Code Ocean offers integrations with Okta and Auth0 for identity management, which allows for assigning users to groups and sharing data and capsules with groups.\n- Configuration Steps: Cloud formation template is used to deploy updates, which can take up to half an hour and cause an outage.\n- Autoscaling: There is an auto scaling pool that pulls resources from, and users can request dedicated instances or spot instances.\n- HPC Setup: Not currently supported, but may be in the near future.\n- Workflow Management: Code Ocean uses Nextflow, which has a no-code script generator and allows users to write their own scripts.')

In [None]:
# Send each chunk through the 'topics' prompt and print every response as it arrives.
for chunk in chunks:
    r = submit_prompt(chunk, 'topics', max_tokens=2000)
    print(r)

In [None]:
# NOTE(review): neither `translate_question` nor a 'jerry' prompt is defined anywhere in the
# visible notebook; this looks like a leftover from another notebook — confirm before running.
r = translate_question(r + "\nlol!", 'jerry')
print(r)