### Load enviornment variables

In [None]:
!pip install pathlib
!pip install python-dotenv

In [141]:
# set environment variables
import os
from pathlib import Path
from dotenv import load_dotenv

env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

True

## 3. Start building meeting report json

In [None]:
!pip install pyjwt
!pip install webvtt-py

In [2]:
import jwt
import requests
import json
from time import time
import os
import webvtt

In [3]:
# generate a token using the pyjwt library
token = jwt.encode(

    # Create a payload of the token containing
    # API Key & expiration time
    {'iss': os.getenv('ZOOM_API_KEY'), 'exp': time() + 5000},

    # Secret used to generate token signature
    os.getenv('ZOOM_API_SEC'),

    # Specify the hashing alg
    algorithm='HS256'
)

In [4]:
# input required from user
# For Sp23, use meeting_id being the main input and Dummy User Input for parsing purpose
meeting_id = 85760048922

## DUMMY USER INPUT
meeting_title = "23min_demo_meeting"
meeting_date = '2023-03-03T19:59:40Z'
meeting_participants_count = 3
agenda_list = ["Duke and Duchess of Sussex asked to vacate UK home Frogmore Cottage", 
               "Alex Muradugh guilty",
               "Data Center Trends"]

In [13]:
# Meeting participants
# as if start_time is 2023-03-03T19:59:40Z
from datetime import datetime as dt
parsed_meeting_date = dt.strptime(meeting_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%Y-%m-%d %H:%M:%S')
headers = {'authorization': 'Bearer %s' % token, 'content-type': 'application/json'}
part = requests.get(f'https://api.zoom.us/v2/past_meetings/'+str(meeting_id)+'/participants', headers=headers)
part_info = {}
if part.status_code == 200:
    response = json.loads(part.text)
    part_arr = response['participants']
    
    for p in part_arr:
        part_info[p['name']] = {'late': 'late' if dt.strptime(p['join_time'], '%Y-%m-%dT%H:%M:%SZ') > dt.strptime(meeting_date, '%Y-%m-%dT%H:%M:%SZ') else 'on-time'}

###### WILL NOT GET INCLUDED IN FINAL OUTPUT ######        
# change dazhi's name in p_detail as it's in email :/
part_info['DAZHI PENG'] = part_info.pop('dazhip@andrew.cmu.edu')

In [16]:
# start setting up meeting_report_json
from datetime import datetime as dt
meeting_report_json = {}

meeting_report_json['meeting_title'] = meeting_title

meeting_report_json['meeting_date'] = parsed_meeting_date

meeting_report_json['participants'] = part_info

In [22]:
meeting_report_json

{'meeting_title': '23min_demo_meeting',
 'meeting_date': '2023-03-03 19:59:40',
 'participants': {'Yooni Choi': {'late': 'on-time'},
  'Jiahe Feng': {'late': 'late'},
  'DAZHI PENG': {'late': 'late'}}}

## 4. ASR Work

In [None]:
# requirements
# ffmpeg

!pip install azure-storage-blob
!pip install openai

In [23]:
# convert to supported format
# have to think about how we are going to use ffmpeg
# currently, I've downloaded ffmpeg on my local machine
import os
zoom_m4a = "85760048922_M4A.m4a"
out_wav = "out.wav"
os.system("ffmpeg -i {0} -acodec pcm_s16le -ac 1 -ar 16000 {1}".format(zoom_m4a, out_wav))

32512

In [143]:
# upload .wav file to azure blob
from azure.storage.blob import BlobServiceClient

storage_account_key = os.getenv('AUDIO_STORAGE_ACCT_KEY')
storage_account_name = os.getenv('AUDIO_STORAGE_ACCT_NAME')
connection_string = os.getenv('CONNECTION_STRING')
container_name = os.getenv('AUDIO_CONTAINER_NAME')

def uploadToBlobStorage(file_path,file_name):
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)
    with open(file_path, "rb") as data:
        blob_client.upload_blob(data)
        print(f"Uploaded {file_name}.")

# calling a function to perform upload
audio_path = "/Users/yooni/Desktop/vca_data/" + out_wav
audio_name = out_wav
uploadToBlobStorage(audio_path, audio_name)

04/21/2023 11:13:58 AM EDT Request URL: 'https://vcademo.blob.core.windows.net/vcademo/out.wav'
Request method: 'PUT'
Request headers:
    'Content-Length': '44458062'
    'x-ms-blob-type': 'REDACTED'
    'If-None-Match': '*'
    'x-ms-version': 'REDACTED'
    'Content-Type': 'application/octet-stream'
    'Accept': 'application/xml'
    'User-Agent': 'azsdk-python-storage-blob/12.15.0 Python/3.8.8 (macOS-10.16-x86_64-i386-64bit)'
    'x-ms-date': 'REDACTED'
    'x-ms-client-request-id': '23f31318-e057-11ed-bcee-acde48001122'
    'Authorization': 'REDACTED'
A body is sent with the request
04/21/2023 11:14:03 AM EDT Response status: 201
Response headers:
    'Content-Length': '0'
    'Content-MD5': 'REDACTED'
    'Last-Modified': 'Fri, 21 Apr 2023 15:14:06 GMT'
    'ETag': '"0x8DB427B0D01FB50"'
    'Server': 'Windows-Azure-Blob/1.0 Microsoft-HTTPAPI/2.0'
    'x-ms-request-id': '6f10e22a-001e-0081-1763-74be6c000000'
    'x-ms-client-request-id': '23f31318-e057-11ed-bcee-acde48001122'
   

In [144]:
# THIS CELL IS NOT WORKING IDK WHY ##
# get blob SAS url
from datetime import datetime, timedelta
from azure.storage.blob import BlobClient, generate_blob_sas, BlobSasPermissions


blob_name = audio_name

def get_blob_sas(storage_account_name, storage_account_key, container_name, blob_name):
    sas_blob = generate_blob_sas(account_name=storage_account_name, 
                                container_name=container_name,
                                blob_name=blob_name,
                                account_key=storage_account_key,
                                permission=BlobSasPermissions(read=True),
                                expiry=datetime.utcnow() + timedelta(hours=1))
    return sas_blob

blob_token = get_blob_sas(storage_account_name, storage_account_key, container_name, blob_name)
blob_sas_url = 'https://'+storage_account_name+'.blob.core.windows.net/'+container_name+'/'+blob_name+'?'+blob_token
blob_sas_url

'https://vcademo.blob.core.windows.net/vcademo/out.wav?se=2023-04-21T16%3A14%3A08Z&sp=r&sv=2021-12-02&sr=b&sig=iNr35mtO914DyP4Y3tQ4BUujLb1uLMaDJH6vh3KoYJ8%3D'

In [145]:
# THIS CELL IS NOT WORKING IDK WHY ##
# set up transcription result container sas
transcription_stroage_account_key = os.getenv('TRANSCRIPTION_STORAGE_ACCT_KEY')
transcription_stroage_account_name = os.getenv('TRANSCRIPTION_STORAGE_ACCT_NAME')
transcription_container_name = os.getenv('TRANSCRIPTION_CONTAINER_NAME')
from azure.storage.blob import generate_container_sas, ContainerSasPermissions
def get_container_sas(storage_account_name, storage_account_key, container_name):
    container_sas = generate_container_sas(
       account_name=storage_account_name,
       container_name=container_name,
       account_key=storage_account_key,
       permission=ContainerSasPermissions(write=True, read=True, list=True),
       start = datetime.utcnow(),
       expiry=datetime.utcnow() + timedelta(hours=1))
    return container_sas
container_token = get_container_sas(storage_account_name, storage_account_key, transcription_container_name)
container_sas_url = 'https://'+transcription_stroage_account_name+'.blob.core.windows.net/'+transcription_container_name+'?'+container_token
container_sas_url

'https://vcatranscriptions.blob.core.windows.net/transcription-results?se=2023-04-21T16%3A14%3A13Z&sp=rwl&sv=2021-12-02&sr=c&sig=d5Io6V6/W05ma/WD0NKJtUHMwx0sRCL%2BKla4Ks65esU%3D'

In [25]:
# ###### Not going to be in final code ######
# # placeholder SAS for now
# # sas url with 1yr expiry -- transcription container should be different container from audio blob storage since it requires all external traffic allowed
# blob_sas_url = os.getenv('TEMP_BLOB_SAS_URL')
# container_sas_url = os.getenv('TEMP_CONTAINER_SAS_URL')

In [5]:
# setting up Swagger
!pip install /Users/yooni/Desktop/CMU/Capstone/Video-Conference-Agent/notebook_playground/ASRwork/Azure/python-client

Processing ./python-client
Building wheels for collected packages: swagger-client
  Building wheel for swagger-client (setup.py) ... [?25ldone
[?25h  Created wheel for swagger-client: filename=swagger_client-1.0.0-py3-none-any.whl size=221566 sha256=605a6868f631db5eb1eaea3953124cda5c8caee00ed1f5a346acc7eb548fc712
  Stored in directory: /Users/yooni/Library/Caches/pip/wheels/f5/1f/0a/fa0cd5a2204b8564b5bca2e85cbb0efd152c039fc2213d07a9
Successfully built swagger-client
Installing collected packages: swagger-client
  Attempting uninstall: swagger-client
    Found existing installation: swagger-client 1.0.0
    Uninstalling swagger-client-1.0.0:
      Successfully uninstalled swagger-client-1.0.0
Successfully installed swagger-client-1.0.0


In [6]:
!python -c "import swagger_client"
!pip install requests



In [146]:
# Your subscription key and region for the speech service
SUBSCRIPTION_KEY = os.getenv('SPEECH_SUBSCRIPTION_KEY')   # use vca-speech-resource since it doesnt support Free tier
SERVICE_REGION = "eastus"
NAME = meeting_title
DESCRIPTION = parsed_meeting_date + ' '+ meeting_title + ' transcription'
LOCALE = "en-US"
RECORDINGS_BLOB_URI = blob_sas_url
PARTICIPANT_COUNT = meeting_participants_count


In [147]:
import logging
import sys
import requests
import time
import swagger_client

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
        format="%(asctime)s %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p %Z")


# # Provide the uri of a container with audio files for transcribing all of them
# # with a single request. At least 'read' and 'list' (rl) permissions are required.
# RECORDINGS_CONTAINER_URI = "<Your SAS Uri to a container of audio files>"


def transcribe_from_single_blob(uri, properties):
    """
    Transcribe a single audio file located at `uri` using the settings specified in `properties`
    using the base model for the specified locale.
    """
    transcription_definition = swagger_client.Transcription(
        display_name=NAME,
        description=DESCRIPTION,
        locale=LOCALE,
        content_urls=[uri],
        properties=properties
    )

    return transcription_definition

def _paginate(api, paginated_object):
    """
    The autogenerated client does not support pagination. This function returns a generator over
    all items of the array that the paginated object `paginated_object` is part of.
    """
    yield from paginated_object.values
    typename = type(paginated_object).__name__
    auth_settings = ["api_key"]
    while paginated_object.next_link:
        link = paginated_object.next_link[len(api.api_client.configuration.host):]
        paginated_object, status, headers = api.api_client.call_api(link, "GET",
            response_type=typename, auth_settings=auth_settings)

        if status == 200:
            yield from paginated_object.values
        else:
            raise Exception(f"could not receive paginated data: status {status}")

def transcribe():
    logging.info("Starting transcription client...")

    # configure API key authorization: subscription_key
    configuration = swagger_client.Configuration()
    configuration.api_key["Ocp-Apim-Subscription-Key"] = SUBSCRIPTION_KEY
    configuration.host = f"https://{SERVICE_REGION}.api.cognitive.microsoft.com/speechtotext/v3.1"

    # create the client object and authenticate
    client = swagger_client.ApiClient(configuration)

    # create an instance of the transcription api class
    api = swagger_client.CustomSpeechTranscriptionsApi(api_client=client)

    # Specify transcription properties by passing a dict to the properties parameter. See
    # https://learn.microsoft.com/azure/cognitive-services/speech-service/batch-transcription-create?pivots=rest-api#request-configuration-options
    # for supported parameters.
    properties = swagger_client.TranscriptionProperties()
    # properties.word_level_timestamps_enabled = True
    properties.display_form_word_level_timestamps_enabled = True
    properties.punctuation_mode = "DictatedAndAutomatic"
    # properties.profanity_filter_mode = "Masked"
    properties.destination_container_url = container_sas_url

    # uncomment the following block to enable and configure speaker separation
    properties.diarization_enabled = True
    properties.diarization = swagger_client.DiarizationProperties(
        swagger_client.DiarizationSpeakersProperties(min_count=1, max_count=PARTICIPANT_COUNT))

    # properties.language_identification = swagger_client.LanguageIdentificationProperties(["en-US", "ja-JP"])

    # Use base models for transcription. Comment this block if you are using a custom model.
    transcription_definition = transcribe_from_single_blob(RECORDINGS_BLOB_URI, properties)


    created_transcription, status, headers = api.transcriptions_create_with_http_info(transcription=transcription_definition)

    # get the transcription Id from the location URI
    transcription_id = headers["location"].split("/")[-1]

    # Log information about the created transcription. If you should ask for support, please
    # include this information.
    logging.info(f"Created new transcription with id '{transcription_id}' in region {SERVICE_REGION}")

    logging.info("Checking status.")

    completed = False

    while not completed:
        # wait for 5 seconds before refreshing the transcription status
        time.sleep(5)

        transcription = api.transcriptions_get(transcription_id)
        logging.info(f"Transcriptions status: {transcription.status}")

        if transcription.status in ("Failed", "Succeeded"):
            completed = True

        if transcription.status == "Succeeded":
            pag_files = api.transcriptions_list_files(transcription_id)
            for file_data in _paginate(api, pag_files):
                if file_data.kind != "Transcription":
                    continue

                audiofilename = file_data.name
                results_url = file_data.links.content_url
                results = requests.get(results_url)
                logging.info(f"Results for {audiofilename}:\n{results.content.decode('utf-8')}")
                return results
        elif transcription.status == "Failed":
            logging.info(f"Transcription failed: {transcription.properties.error.message}")

In [148]:
# run transcription job
results = transcribe()
content = results.content.decode('utf-8')

04/21/2023 11:14:35 AM EDT Starting transcription client...
04/21/2023 11:14:36 AM EDT Created new transcription with id '6c33e888-9ec3-4d31-bb8a-4be5d553d49f' in region eastus
04/21/2023 11:14:36 AM EDT Checking status.
04/21/2023 11:14:41 AM EDT Transcriptions status: Running
04/21/2023 11:14:46 AM EDT Transcriptions status: Running
04/21/2023 11:14:52 AM EDT Transcriptions status: Running
04/21/2023 11:14:57 AM EDT Transcriptions status: Running
04/21/2023 11:15:02 AM EDT Transcriptions status: Running
04/21/2023 11:15:07 AM EDT Transcriptions status: Running
04/21/2023 11:15:12 AM EDT Transcriptions status: Running
04/21/2023 11:15:17 AM EDT Transcriptions status: Running
04/21/2023 11:15:22 AM EDT Transcriptions status: Running
04/21/2023 11:15:28 AM EDT Transcriptions status: Running
04/21/2023 11:15:33 AM EDT Transcriptions status: Running
04/21/2023 11:15:38 AM EDT Transcriptions status: Running
04/21/2023 11:15:43 AM EDT Transcriptions status: Running
04/21/2023 11:15:48 AM ED

AttributeError: 'NoneType' object has no attribute 'content'

In [71]:
# Parse transcription results
import json
import datetime as dt
result = json.loads(content)

start_date = dt.datetime.strptime(result["timestamp"], '%Y-%m-%dT%H:%M:%SZ')
transcription = ''
# transcription_timestamp = ''
duration = {'total':0}
for segment in result["recognizedPhrases"]:
    timestamp = start_date + dt.timedelta(seconds =segment["offsetInTicks"]/10000000)
    # transcription_timestamp += str(timestamp) + ' Speaker ' + str(segment["speaker"]) + '\n' + segment["nBest"][0]["display"] + '\n\n'
    transcription += ' Speaker ' + str(segment["speaker"]) + ': ' + segment["nBest"][0]["display"] + '\n\n'
    speaker_duration = segment['durationInTicks']/10000000
    if 'Speaker '+ str(segment["speaker"]) not in duration:
        duration['Speaker '+ str(segment["speaker"])] = speaker_duration
    else:
        duration['Speaker '+ str(segment["speaker"])] += speaker_duration

In [72]:
# write transcription.txt files
with open('transcription.txt', 'w') as f:
    f.write(transcription)

# with open('transcription_timestamp.txt', 'w') as f:
#     f.write(transcription_timestamp)

In [73]:
# participant matching
import os
import requests
import openai

zoom_transript_path = "85760048922_TRANSCRIPT.vtt"
asr_transcription_path = "transcription.txt"

In [74]:
def parse_transcript(transcript_file, num_participant, num_line_threshold = 3):
    """Parse a transcript file and extract conversation text from each participant.

    Args:
        transcript_file (str): Path to the transcript file.
        num_participant (int): Number of participants in the conversation.
        num_line_threshold (int, optional): Number of conversation lines from each participant
            used for further matching. Defaults to 3.

    Returns:
        str: A concatenated string of the extracted conversation text from each participant.

    Raises:
        FileNotFoundError: If the transcript file path is invalid.

    """

    def end_transcript_loop(ppt_length):
        return len(ppt_length) == 3 and all([v > 100 for v in ppt_length.values()])
    
    with open(transcript_file) as file:
        lines = file.readlines()

    ppt_text = {}
    ppt_length = {}

    for line in lines:
        if ": " not in line:
            continue
        line_split = line.split(": ")
        speaker, text = line_split[0], line_split[1]
        if speaker in ppt_length and ppt_length[speaker] > num_line_threshold:
            continue
        if speaker not in ppt_text:
            ppt_text[speaker] = text
        else:
            ppt_text[speaker] += text
        if speaker not in ppt_length:
            ppt_length[speaker] = 1
        else:
            ppt_length[speaker] += 1
        if end_transcript_loop(ppt_length):
            break
    result_text = ""
    for k, v in ppt_text.items():
        result_text += k + ": \n"
        result_text += v + "\n"
    return result_text

In [75]:
zoom_result_text = parse_transcript(zoom_transript_path, 3)
azure_result_text = parse_transcript(asr_transcription_path, 3)

In [76]:
API_KEY = os.getenv('OPENAI_API_KEY')
url = 'https://api.openai.com/v1/chat/completions'
headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + API_KEY}

In [77]:
def ppt_matching_api_call(ASR_transcript, Zoom_transcript):
    
    # combine two transcript
    combined_text = "Transcript 1: \n" + ASR_transcript + "\nTranscript 2: \n" + Zoom_transcript
    
    prompt = "Given the following two transcript with different speaker representation, generate a JSON mapping, where each key is the participant in the first transcript, and the value is the corresponding participant name in the second transcript. Return the JSON only."
    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "Perform participant matching and return a JSON"},
            {"role": "assistant", "content": combined_text},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0
    }

    response = requests.post(url, headers=headers, json=data)
    if response.status_code == 200:
        content = response.json()
        mapping = eval(content['choices'][0]['message']['content'])
        if isinstance(mapping, dict):
            return mapping
        else:
            return ValueError("Error in GPT participant matching. Must return a dictionary mapping in its response.")
    else:
        raise ValueError("Fail to get GPT API response.")

In [78]:
# write new transcription.txt files with speaker names
import re
ppt_match = ppt_matching_api_call(zoom_result_text, azure_result_text)

for k,i in ppt_match.items():
    transcription = re.sub(i,k, transcription)

# match name to duration dictionary
duration = {y:x for x,y in {duration.get(k, k): v for v, k in ppt_match.items()}.items()}
duration['total'] = sum(duration.values())
    

with open('transcription.txt', 'w') as f:
    f.write(transcription)


In [79]:
duration

{'Yooni Choi': 424.4,
 'Jiahe Feng': 458.4400000000001,
 'DAZHI PENG': 405.6800000000001,
 'total': 1288.5200000000002}

In [90]:
for participant in meeting_report_json['participants']:
    if participant in duration:
        meeting_report_json['participants'][participant]['duration'] = str(round(duration[participant],2))+'s (' + str(round(duration[participant]/duration['total']*100, 2))+'%)'
    else:
        meeting_report_json['participants'][participant]['duration'] = 0

In [91]:
meeting_report_json

{'meeting_title': '23min_demo_meeting',
 'meeting_date': '2023-03-03 19:59:40',
 'participants': {'Yooni Choi': {'late': 'on-time',
   'duration': '424.4s (32.94%)'},
  'Jiahe Feng': {'late': 'late', 'duration': '458.44s (35.58%)'},
  'DAZHI PENG': {'late': 'late', 'duration': '405.68s (31.48%)'}}}