In [1]:
from torch import cuda, bfloat16
import transformers
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub; an access token is needed for the
# downloads performed below.
notebook_login()

model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Device string for tensors created by hand later in the notebook and for the
# status message below; falls back to CPU when CUDA is not available.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)

# Fetch the model configuration once and hand it to the loader below, so the
# object is actually used instead of being downloaded and discarded.
model_config = transformers.AutoConfig.from_pretrained(model_id)

# NOTE(review): trust_remote_code=True allows code shipped with the checkpoint
# to run locally — confirm it is really required for this model id.
# NOTE(review): device_map='auto' lets the library pick the placement, so the
# model is not necessarily on `device` — confirm before relying on the message
# printed below.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Text sequences that should end generation as soon as the model emits them.
stop_list = ['\nHuman:', '\n```\n']

# Encode each stop sequence WITHOUT special tokens. With the default settings
# the tokenizer prepends a beginning-of-sequence id, which never occurs in the
# middle of generated text, so a suffix comparison against these ids could not
# match.
# NOTE(review): some sentencepiece tokenizers also emit a leading prefix-space
# token for text that starts with "\n" — inspect the ids and drop it if the
# stop criterion still never fires.
stop_token_ids = [
    tokenizer(x, add_special_tokens=False)['input_ids'] for x in stop_list
]

import torch

# Turn the id lists into tensors on `device` so they can be compared with the
# ids produced during generation.
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]

from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stop generation when the sequence ends with one of the stop sequences.

    The stop sequences are read from the notebook-level ``stop_token_ids``
    list (one 1-D tensor of token ids per stop sequence).
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the first sequence in ``input_ids`` ends with a stop sequence.

        Args:
            input_ids: token ids generated so far; only row 0 is inspected.
            scores: unused, accepted for interface compatibility.

        Returns:
            True when the tail of row 0 equals any entry of ``stop_token_ids``,
            otherwise False.
        """
        generated = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # A sequence shorter than the stop pattern cannot end with it, and
            # comparing tensors of different lengths would raise; an empty
            # pattern is skipped because it carries no stop signal.
            if stop_len == 0 or generated.shape[0] < stop_len:
                continue
            # Compare on the device of the generated ids so a placement
            # mismatch between the two tensors does not raise.
            if torch.eq(generated[-stop_len:], stop_ids.to(generated.device)).all():
                return True
        return False

# Wrap the custom criterion in the list container passed to the pipeline.
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# Text-generation pipeline around the loaded model and tokenizer.
# NOTE(review): `do_sample` is not set here — confirm that `temperature`
# actually takes effect with the default decoding strategy.
generate_text = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    # generation parameters are passed here as well
    max_new_tokens=512,  # upper bound on the number of generated tokens
    temperature=0.1,  # 'randomness' of outputs; kept low
    repetition_penalty=1.1,  # without this output begins repeating
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded on cuda:0


In [37]:
# Read the whole meeting transcript into memory. The encoding is passed
# explicitly so the result does not depend on the platform default.
# NOTE(review): assumes the transcript file is UTF-8 — confirm.
with open("../transcripts/meeting002.txt", encoding="utf-8") as f:
    transcript_str = f.read()

print(transcript_str)

"\nMeeting Transcript\n\nDate: 2023-10-13\n\nTime: 11:27 PST\n\nAttendees:\n\nJohn Doe, Design Lead\nJane Smith, Designer\nMary Green, Designer\nJohn Doe: Alright, everyone, let's get started. Today, we're going to be discussing the green building design concept that Jane has developed.\n\nJane Smith: Thank you, John. Here's a summary of my concept:\n\nThe building will be designed to maximize natural light and ventilation.\nWe will use renewable energy sources, such as solar panels and wind turbines.\nWe will use sustainable materials, such as recycled wood and bamboo.\nWe will incorporate water conservation features, such as rainwater harvesting and low-flow toilets.\nJohn Doe: That sounds great, Jane. I'm really impressed with the concept.\n\nMary Green: Me too. I think it's a very innovative and sustainable design.\n\nJohn Doe: However, I have one concern: cost. I'm worried that the cost of implementing all of these green features will be too high.\n\nJane Smith: I understand your 

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.llms import OpenAI


# Fields the LLM is asked to extract from the transcript; each schema names
# one key of the parsed result.
meeting_date = ResponseSchema(
    name="meeting_date",
    description="date of the meeting stored in datetime format DD/MM/YYYY.",
)
attendees_list = ResponseSchema(
    name="attendees_list",
    description="Full name of everyone present in the meeting, each stored as a string in the list.",
)
start_time = ResponseSchema(
    name="start_time",
    description="time the meeting started in 24 hour HH:mm format in datetime",
)
end_time = ResponseSchema(
    name="end_time",
    description="time the meeting ended in 24 hour HH:mm format in datetime",
)

output_parser = StructuredOutputParser.from_response_schemas(
    [meeting_date, attendees_list, start_time, end_time]
)

# Formatting instructions generated from the schemas; computed once and
# reused when the prompt is filled in below.
response_format = output_parser.get_format_instructions()

prompt = ChatPromptTemplate.from_template("You are a helpful formatting assistant. Return the meeting_date, attendees_list, start_time, end_time separately. '''{meeting_info}''' \n {format_instructions}")

# NOTE(review): OPENAI_API_KEY is only written to the environment in a later
# cell of this notebook; presumably OpenAI() needs it — confirm this cell
# works on a fresh kernel.
llm_openai = OpenAI()
formated_prompt = prompt.format(
    meeting_info=transcript_str,
    format_instructions=response_format,
)
response_openai = llm_openai(formated_prompt)

# Parse the raw LLM text into a dict keyed by the schema names above.
meeting_info_dict = output_parser.parse(response_openai)

In [36]:
# Display the dict parsed from the LLM response for the meeting metadata.
meeting_info_dict

{'meeting_date': '2023-10-13',
 'attendees_list': ['John Doe', 'Jane Smith', 'Mary Green'],
 'start_time': '11:27 PST',
 'end_time': '14:27 PST'}

In [2]:
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import TextLoader

# Split the transcript into overlapping chunks for retrieval.
# NOTE(review): this path is relative to the working directory, while the cell
# that fills `transcript_str` reads "../transcripts/meeting002.txt" — confirm
# which location is correct.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
text_chunks = text_splitter.split_documents(TextLoader("transcripts/meeting002.txt").load())

# Candidate sentence-embedding models; the entry at index 1 is the one used.
model_name = ["sentence-transformers/all-mpnet-base-v2", 'sentence-transformers/all-MiniLM-L6-v2', "sentence-transformers/paraphrase-MiniLM-L6-v2"]
embeddings = HuggingFaceEmbeddings(model_name=model_name[1], model_kwargs={'device': 'cuda'})
vectorstore = FAISS.from_documents(text_chunks, embeddings)

# NOTE(review): the pipeline was created with temperature=0.1 while
# model_kwargs here says 0 — confirm which value is actually applied.
llm = HuggingFacePipeline(pipeline=generate_text, model_kwargs={'temperature': 0})
chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", return_source_documents=False, retriever=vectorstore.as_retriever())
query_list = [ "you are a helpful AI meeting minute writer. Help me to write a meeting summary in prose, detailing all the important tasks mentioned, especially the people in charge of each task and the respective deadlines if any. Your minute should be ordered based on the topics discussed and not on chronological order."]
chat_history = []
for query in query_list:
    # Run the chain exactly once per query: every call triggers a full LLM
    # generation, so invoking it twice and discarding the first result only
    # doubles the runtime. `chat_history` is always empty at this point.
    result = chain({"query": query, "chat_history": chat_history}, return_only_outputs=True)
    print(result['result'])

In [41]:
# Display the text returned by the retrieval chain for the last query.
result['result']

' Sure! Based on the transcript provided, here is a meeting summary in prose:\n\nThe meeting began with a discussion of the green building design concept developed by Jane Smith. John Doe expressed his appreciation for the concept, but also raised concerns about the cost of implementing the green features. To address this concern, Jane suggested doing further research on the cost of the green features and updating the team in two weeks.\n\nJohn Doe then assigned tasks to the team members. He asked Jane and Mary to research the cost of implementing the green features and update the team in two weeks. He also asked Jane to research vendors who could provide the green features and update the team by October 16th.\n\nIn conclusion, the meeting assigned specific tasks to team members and set deadlines for their completion. Jane was responsible for researching the cost of the green features, while John was responsible for researching vendors who could provide them. Both were given deadlines 

In [5]:
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain.llms import OpenAI

import os

# Load the OpenAI API key from a local file rather than hard-coding it.
# Lines are stripped so the stored key carries no trailing newline, and blank
# lines are ignored. As before, the last line containing text wins; if the
# file holds no text the environment is left untouched.
with open(r'archive_note.txt', 'r') as fp:
    key_lines = [line.strip() for line in fp if line.strip()]

if key_lines:
    os.environ['OPENAI_API_KEY'] = key_lines[-1]

# Fields the LLM is asked to extract from the meeting summary. Each field is
# requested as a list, with entries in the same order as task_description.
main_topics = ResponseSchema(
    name="task_description",
    description="Describe each task in detail and store it as a string in the list.",
)
deadline = ResponseSchema(
    name="deadline",
    description="The deadline of the corresponding task_description, stored as a unique string in the list, in the same order as it appears in task_description.",
)
person_in_charge = ResponseSchema(
    name="person_in_charge",
    description="default str is NA. The people in involved in each task, who needs to perform follow-up actions after the meeting or had presented this task in the meeting, stored as a string in the list, in the same order as it appears in task_description.",
)
days_left = ResponseSchema(
    name="days_left",
    description="Optional parameter, default value is int 0. If the deadline given is relative from the date of the meeting, store as an int (the number of days given) in the list, in the same order as it appears in task_description.",
)

output_parser = StructuredOutputParser.from_response_schemas(
    [main_topics, deadline, person_in_charge, days_left]
)

# Formatting instructions generated from the schemas; computed once and
# reused when the prompt is filled in below.
response_format = output_parser.get_format_instructions()
print(response_format)

# The prompt text now describes the delimiter that is actually used around
# the transcript (three single quotes), not backticks.
prompt = ChatPromptTemplate.from_template("You are a helpful formatting assistant. The meeting transcript is delimited with triple quotes. It is already a summarised version, hence you don't need to summarise it further when providing task_description. A task can be what team members have presented in the meeting, or follow-up actions required after this meeting. Return the task_description, and their respective deadline, person_in_charge and days_left separately. '''{meeting_transcript}''' \n {format_instructions}")


llm_openai = OpenAI()

# Summary text produced by the retrieval chain in an earlier cell.
meeting_transcript = result['result']

formated_prompt = prompt.format(
    meeting_transcript=meeting_transcript,
    format_instructions=response_format,
)
response_openai = llm_openai(formated_prompt)
print(response_openai)

# startswith() is used instead of indexing so an empty response does not
# raise IndexError before the parser gets a chance to report the problem.
if response_openai.startswith('['):
    print('error in', response_openai)
    # Drops the first character and the last two.
    # NOTE(review): presumably meant to strip a surrounding list wrapper —
    # confirm against a real failing response.
    response_openai = response_openai[1:-2]
    print('re-attempt with', response_openai)
print('printing response')
formatted_output = output_parser.parse(response_openai)

In [49]:
# Display the raw text returned by the OpenAI LLM.
# NOTE(review): the saved output of this cell contains meeting_date /
# attendees_list fields, i.e. it belongs to the meeting-metadata request and
# not to the task-extraction request — the cells appear to have been run out
# of order.
(response_openai)


'\n\n```json\n{\n\t"meeting_date": "2023-10-13",\n\t"attendees_list": ["John Doe", "Jane Smith", "Mary Green"],\n\t"start_time": "11:27 PST",\n\t"end_time": "14:27 PST"\n}\n```'

In [14]:
# Display the dict parsed from the task-extraction response.
(formatted_output)

{'task_description': ['Research the cost of implementing the green features and update the team in two weeks',
  'Research vendors who could provide the green features and update the team by October 16th'],
 'deadline': ['Two weeks', 'October 16th'],
 'person_in_charge': ['Jane and Mary', 'Jane'],
 'days_left': [14, 16]}

In [20]:
# One row per task: the description itself plus a two-item list holding the
# deadline and the person in charge found at the same position.
table_content = [
    {
        'task_description': task,
        'additional info': [
            formatted_output['deadline'][row],
            formatted_output['person_in_charge'][row],
        ],
    }
    for row, task in enumerate(formatted_output['task_description'])
]

In [21]:
# Display the list of table rows built from the parsed task fields.
table_content

[{'task_description': 'Research the cost of implementing the green features and update the team in two weeks',
  'additional info': ['Two weeks', 'Jane and Mary']},
 {'task_description': 'Research vendors who could provide the green features and update the team by October 16th',
  'additional info': ['October 16th', 'Jane']}]

In [40]:
# Assemble the template context: start from a shallow copy of the meeting
# metadata, flatten the attendee list into one newline-separated string, and
# attach the table column labels and rows.
context = dict(meeting_info_dict)
context.update(
    attendees_list='\n'.join(meeting_info_dict['attendees_list']),
    col_labels=['Deadline', 'Person-In-Charge'],
    tbl_contents=table_content,
)
print(context)

{'meeting_date': '2023-10-13',
 'attendees_list': 'John Doe\nJane Smith\nMary Green',
 'start_time': '11:27 PST',
 'end_time': '14:27 PST',
 'col_labels': ['Deadline', 'Person-In-Charge'],
 'tbl_contents': [{'task_description': 'Research the cost of implementing the green features and update the team in two weeks',
   'additional info': ['Two weeks', 'Jane and Mary']},
  {'task_description': 'Research vendors who could provide the green features and update the team by October 16th',
   'additional info': ['October 16th', 'Jane']}]}