# Meeting minutes generator with WhisperX and Mistral 7B

## Install Dependencies

In [None]:
!pip install --q git+https://github.com/m-bain/whisperx.git
!pip install git+https://github.com/huggingface/transformers -U
!pip install accelerate
!pip install pandas
!pip install bitsandbytes
!pip install torch

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.7/208.7 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.7/36.7 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m73.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m11.3 MB/s[0m eta [

## Import libraries

In [None]:
import whisperx
import gc

  torchaudio.set_audio_backend("soundfile")


## Set configuration

In [None]:
device = "cuda" # run the models on the GPU
batch_size = 4 # reduce if low on GPU mem
compute_type = "int8" # low-memory setting (may reduce accuracy). NOTE(review): the previous hint 'change to "int8"' implies a higher-precision value (e.g. "float16") was the intended default — confirm

In [None]:
audio_file = "conv.mp3"  # recording to process; relative path, passed to whisperx.load_audio

In [None]:
# Load the recording once; `audio` is reused for transcription, alignment and diarization.
audio = whisperx.load_audio(audio_file)

In [None]:
# Print the version of the `python` found on the shell PATH, for reference.
!python --version

Python 3.10.12


## Load the WhisperX model

In [None]:
# Load the "tiny" Whisper checkpoint on `device` with the configured compute type.
# No language is passed, so it is detected per audio file (see the log line below).
# NOTE: the name `model` is rebound to the Mistral LLM later in the notebook.
model = whisperx.load_model("tiny", device, compute_type=compute_type)


config.json:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

model.bin:   0%|          | 0.00/75.5M [00:00<?, ?B/s]

No language specified, language will be first be detected for each audio file (increases inference time).


100%|█████████████████████████████████████| 16.9M/16.9M [00:07<00:00, 2.35MiB/s]
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.0+cu121. Bad things might happen unless you revert torch to 1.x.


## Transcribe and save result

In [None]:
# 1. Transcribe the audio in batches
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# Optional: free the ASR model here if GPU memory is tight. Delete the reference
# first, then collect. NOTE: a later cleanup cell also runs `del model`.
# import gc; del model; gc.collect(); torch.cuda.empty_cache()

# 2. Align whisper output
# The alignment model is chosen from the language detected during transcription.
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
# `result` is rebound to the aligned output (word-level timings; char alignments disabled).
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)


Detected language: en (0.99) in first 30s of audio...
[{'text': " Hi everyone, welcome back to another mmmEnglish lesson. I'm in London at the moment and I decided to contact another English teacher who has a channel on YouTube. You might know her, it's Lucy from English with Lucy. We are going to meet together on camera so that you can learn a little bit more about how to introduce yourself in English. Hello! Hi! So nice to meet you!", 'start': 0.316, 'end': 30.094}, {'text': " Lovely to finally meet you. How are you? I'm well, how are you? I'm good, thank you. How are you finding mother? It's so far, it has been incredible. I've been working really hard but also having lots of fun as well. It's just the weather that's not", 'start': 30.469, 'end': 47.108}, {'text': " White, my cup of tea. It's not great, it's a bit grey. It's a bit grey and a little bit drizzly, a little bit cold for me. I've spent a few days up in Scotland as well, which was even colder. So I guess now I like in Aus

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth
100%|██████████| 360M/360M [00:03<00:00, 119MB/s]


## Display Sentence segments

In [None]:
# Aligned segments: each entry now carries a `words` list with per-word start/end/score.
result['segments']

[{'start': 0.336,
  'end': 4.358,
  'text': ' Hi everyone, welcome back to another mmmEnglish lesson.',
  'words': [{'word': 'Hi', 'start': 0.336, 'end': 0.576, 'score': 0.915},
   {'word': 'everyone,', 'start': 0.676, 'end': 1.237, 'score': 0.353},
   {'word': 'welcome', 'start': 1.537, 'end': 1.897, 'score': 0.908},
   {'word': 'back', 'start': 1.917, 'end': 2.117, 'score': 0.9},
   {'word': 'to', 'start': 2.157, 'end': 2.237, 'score': 0.82},
   {'word': 'another', 'start': 2.297, 'end': 2.577, 'score': 0.829},
   {'word': 'mmmEnglish', 'start': 2.597, 'end': 3.698, 'score': 0.656},
   {'word': 'lesson.', 'start': 3.938, 'end': 4.358, 'score': 0.893}]},
 {'start': 5.099,
  'end': 12.623,
  'text': "I'm in London at the moment and I decided to contact another English teacher who has a channel on YouTube.",
  'words': [{'word': "I'm", 'start': 5.099, 'end': 5.319, 'score': 0.519},
   {'word': 'in', 'start': 5.439, 'end': 5.519, 'score': 0.982},
   {'word': 'London', 'start': 5.599, 'en

## Load diarization model

In [None]:
import os
from getpass import getpass

# Never hard-code the Hugging Face access token in the notebook: read it from the
# HF_TOKEN environment variable, or ask for it interactively when it is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Build the speaker-diarization pipeline on the configured device.
diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_token,
                                             device=device)

config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

In [None]:
# Run speaker diarization on the loaded audio. The speaker count is pinned to
# exactly two (min == max == 2); adjust both bounds for other recordings.
diarize_segments = diarize_model(audio, min_speakers=2, max_speakers=2)

## Diarization result

In [None]:
# One row per speech turn: segment interval, label, speaker id, start/end in seconds.
diarize_segments

Unnamed: 0,segment,label,speaker,start,end
0,[ 00:00:00.246 --> 00:00:04.422],A,SPEAKER_00,0.246180,4.422750
1,[ 00:00:04.915 --> 00:00:24.983],B,SPEAKER_00,4.915110,24.983022
2,[ 00:00:27.037 --> 00:00:27.071],C,SPEAKER_01,27.037351,27.071307
3,[ 00:00:27.071 --> 00:00:29.006],D,SPEAKER_00,27.071307,29.006791
4,[ 00:00:29.006 --> 00:00:30.093],E,SPEAKER_01,29.006791,30.093379
...,...,...,...,...,...
117,[ 00:07:46.358 --> 00:07:51.230],DN,SPEAKER_00,466.358234,471.230900
118,[ 00:07:52.775 --> 00:07:56.477],DO,SPEAKER_00,472.775891,476.477080
119,[ 00:07:56.833 --> 00:08:01.112],DP,SPEAKER_00,476.833616,481.112054
120,[ 00:08:02.453 --> 00:08:07.376],DQ,SPEAKER_00,482.453311,487.376910


In [None]:
# Distinct speaker ids found by the diarization step.
diarize_segments.speaker.unique()

array(['SPEAKER_00', 'SPEAKER_01'], dtype=object)

In [None]:
# Merge the diarization turns into the aligned transcript; `result` is rebound.
result = whisperx.assign_word_speakers(diarize_segments, result)
# NOTE(review): the frame printed here shows extra `intersection`/`union` columns
# compared with the earlier display, so the call above appears to modify
# diarize_segments in place — confirm before reusing it.
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

                               segment label     speaker       start  \
0    [ 00:00:00.246 -->  00:00:04.422]     A  SPEAKER_00    0.246180   
1    [ 00:00:04.915 -->  00:00:24.983]     B  SPEAKER_00    4.915110   
2    [ 00:00:27.037 -->  00:00:27.071]     C  SPEAKER_01   27.037351   
3    [ 00:00:27.071 -->  00:00:29.006]     D  SPEAKER_00   27.071307   
4    [ 00:00:29.006 -->  00:00:30.093]     E  SPEAKER_01   29.006791   
..                                 ...   ...         ...         ...   
117  [ 00:07:46.358 -->  00:07:51.230]    DN  SPEAKER_00  466.358234   
118  [ 00:07:52.775 -->  00:07:56.477]    DO  SPEAKER_00  472.775891   
119  [ 00:07:56.833 -->  00:08:01.112]    DP  SPEAKER_00  476.833616   
120  [ 00:08:02.453 -->  00:08:07.376]    DQ  SPEAKER_00  482.453311   
121  [ 00:08:08.073 -->  00:08:08.803]    DR  SPEAKER_00  488.073005   

            end  intersection       union  
0      4.422750   -484.039250  488.355820  
1     24.983022   -463.478978  483.686890  
2  

## Format results into prompt

In [None]:
import pandas as pd
# Flatten the list of segment dicts into a table, one row per segment.
df = pd.json_normalize(result['segments'])

In [None]:
# The per-word timing column is not needed for the prompt.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the missing column.
df = df.drop(columns='words', errors='ignore')

In [None]:
# Build one "SPEAKER: text\n" line per segment.
# A segment without a speaker value (NaN) would turn its whole line into NaN and make
# the later ''.join(...) raise TypeError, so missing speakers get an explicit label.
# NOTE(review): presumably this happens when a segment overlaps no diarization
# turn — confirm against whisperx.assign_word_speakers.
conversation_df = df['speaker'].fillna('UNKNOWN') + ': ' + df['text'] + '\n'

In [None]:
# Concatenate the per-segment lines (each already ends with a newline) into one transcript string.
conversation = ''.join(conversation_df)

In [None]:
# Plain instruction followed by the transcript.
# NOTE(review): in the cells visible here `input_text` is only displayed; the
# generation cell builds its own prompt from `conversation` — confirm whether this is still needed.
input_text = f'There is given conversation of a meeting from different speakers in chat format. Please make a meeting minutes and meeting summary from it. conversation: \n {conversation}'

In [None]:
# Preview the assembled instruction + transcript text.
input_text

"There is given conversation of a meeting from different speakers in chat format. Please make a meeting minutes and meeting summary from it. conversation: \n SPEAKER_00:  Hi everyone, welcome back to another mmmEnglish lesson.\nSPEAKER_00: I'm in London at the moment and I decided to contact another English teacher who has a channel on YouTube.\nSPEAKER_00: You might know her, it's Lucy from English with Lucy.\nSPEAKER_00: We are going to meet together on camera so that you can learn a little bit more about how to introduce yourself in English.\nSPEAKER_00: Hello!\nSPEAKER_00: Hi!\nSPEAKER_01: So nice to meet you!\nSPEAKER_01:  Lovely to finally meet you.\nSPEAKER_00: How are you?\nSPEAKER_01: I'm well, how are you?\nSPEAKER_01: I'm good, thank you.\nSPEAKER_00: How are you finding mother?\nSPEAKER_00: It's so far, it has been incredible.\nSPEAKER_00: I've been working really hard but also having lots of fun as well.\nSPEAKER_00: It's just the weather that's not\nSPEAKER_00:  White, my

## Clear previous models and cache to save memory

In [None]:
import gc
import torch

# Drop the references first: garbage collection and emptying the CUDA cache can only
# release the models' memory once nothing points at them any more.
del diarize_model, model
gc.collect()
torch.cuda.empty_cache()
# NOTE(review): the alignment model `model_a` is still referenced and therefore stays
# in memory — delete it here as well if it is no longer needed.

In [None]:
# Re-display the prompt text after the cleanup cell (same value as the earlier preview).
input_text

"There is given conversation of a meeting from different speakers in chat format. Please make a meeting minutes and meeting summary from it. conversation: \n SPEAKER_00:  Hi everyone, welcome back to another mmmEnglish lesson.\nSPEAKER_00: I'm in London at the moment and I decided to contact another English teacher who has a channel on YouTube.\nSPEAKER_00: You might know her, it's Lucy from English with Lucy.\nSPEAKER_00: We are going to meet together on camera so that you can learn a little bit more about how to introduce yourself in English.\nSPEAKER_00: Hello!\nSPEAKER_00: Hi!\nSPEAKER_01: So nice to meet you!\nSPEAKER_01:  Lovely to finally meet you.\nSPEAKER_00: How are you?\nSPEAKER_01: I'm well, how are you?\nSPEAKER_01: I'm good, thank you.\nSPEAKER_00: How are you finding mother?\nSPEAKER_00: It's so far, it has been incredible.\nSPEAKER_00: I've been working really hard but also having lots of fun as well.\nSPEAKER_00: It's just the weather that's not\nSPEAKER_00:  White, my

In [None]:
from huggingface_hub import notebook_login
# Opens the interactive Hugging Face login widget.
# NOTE(review): presumably required to download the Mistral weights below — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load Mistral

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Model id passed to both the model loader and the tokenizer loader below.
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

def load_quantized_model(model_name: str):
    """
    Load a causal language model with a 4-bit bitsandbytes quantization config.

    :param model_name: Name or path of the model to be loaded.
    :return: Loaded quantized model.
    """
    # 4-bit "nf4" quantization with double quantization; compute runs in bfloat16.
    quantization_settings = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_settings,
        torch_dtype=torch.bfloat16,
    )

def initialize_tokenizer(model_name: str):
    """
    Create the tokenizer belonging to ``model_name``.

    :param model_name: Name or path of the model for tokenizer initialization.
    :return: Initialized tokenizer.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Pin the beginning-of-sentence token id to 1.
    loaded_tokenizer.bos_token_id = 1
    return loaded_tokenizer


# `model` now refers to the quantized LLM (the name was previously used for the
# WhisperX model, which the cleanup cell deleted).
model = load_quantized_model(model_name)

tokenizer = initialize_tokenizer(model_name)

# Define stop token ids
# NOTE(review): `stop_token_ids` is not referenced by the generation cell visible
# below — confirm whether it is still needed and whether id 0 is the intended stop token.
stop_token_ids = [0]


config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

## Pass the prompt to Mistral and generate meeting minutes

In [None]:
# Instructions describing the task and the expected layout of the minutes.
system_message = '''
There is given conversation of a meeting from different speakers in chat format. Please make a meeting minutes and meeting summary from it in the format below:\r\nMeeting Minutes\r\nTopic: Introduction\r\nConversation:\r\nDiscussion:\r\nCall to Action:\r\nKey Takeaways:\r\n
'''
user_message = conversation

# Mistral-7B-Instruct expects "<s>[INST] instruction [/INST]". The Llama-2 style
# <<SYS>> markers are not part of that format, so the instructions are placed in
# front of the transcript inside the single [INST] block. Building the prompt on
# one line also keeps source-code indentation out of the text sent to the model.
prompt = f"<s>[INST] {system_message.strip()}\n\n{user_message} [/INST]"

# The prompt text already starts with <s>, hence add_special_tokens=False.
encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
# Put the input tensors on the same device as the model weights.
model_input = encoded.to(model.device)
# pad_token_id is passed explicitly; it is the value generate() falls back to anyway
# (see the "Setting `pad_token_id` to `eos_token_id`" log), so this only removes the warning.
# NOTE(review): max_new_tokens=200 may cut the minutes short for long meetings.
generated_ids = model.generate(
    **model_input,
    max_new_tokens=200,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
# Full decode (prompt + continuation), kept under the original name.
decoded = tokenizer.batch_decode(generated_ids)

# generate() returns the prompt followed by the new tokens; show only the new
# tokens so the minutes are not buried under the echoed transcript.
prompt_length = model_input["input_ids"].shape[1]
minutes = tokenizer.batch_decode(generated_ids[:, prompt_length:], skip_special_tokens=True)[0]
print(minutes)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] <<SYS>>
            
There is given conversation of a meeting from different speakers in chat format. Please make a meeting minutes and meeting summary from it in the format below:
Meeting Minutes
Topic: Introduction
Conversation:
Discussion:
Call to Action:
Key Takeaways:


            <</SYS>>
            SPEAKER_00:  Hi everyone, welcome back to another mmmEnglish lesson.
SPEAKER_00: I'm in London at the moment and I decided to contact another English teacher who has a channel on YouTube.
SPEAKER_00: You might know her, it's Lucy from English with Lucy.
SPEAKER_00: We are going to meet together on camera so that you can learn a little bit more about how to introduce yourself in English.
SPEAKER_00: Hello!
SPEAKER_00: Hi!
SPEAKER_01: So nice to meet you!
SPEAKER_01:  Lovely to finally meet you.
SPEAKER_00: How are you?
SPEAKER_01: I'm well, how are you?
SPEAKER_01: I'm good, thank you.
SPEAKER_00: How are you finding mother?
SPEAKER_00: It's so far, it has been incr