In [None]:
%reload_ext autoreload
%autoreload 2

from configData import configVars

# Build the configuration object that is passed to the transcript and question helpers.
config = configVars()
# The video name can also be set manually after initialization, so that the same
# config values can be reused for different videos, e.g.:
# config.videoToUse = "Sakai"
# NOTE(review): presumably populates the config from environment variables — confirm in configData.
config.setFromEnv()

In [None]:
from transcriptLoader import retrieveTranscript
from questionGenerator import retrieveQuestions

# Retrieve the transcript data for the video selected in `config`
# (presumably loaded from the Captions folder — confirm in transcriptLoader).
videoData = retrieveTranscript(config)
# Generate the questions for that video from the retrieved transcript data.
questionData = retrieveQuestions(config, videoData=videoData)

# Additional code for Evaluation of Questions

The following code block generates an Excel file of the generated questions, to be used for evaluating them.
It is not yet complete and is currently intended for prototyping and testing only.

You will need to install the `tqdm`, `xlsxwriter` and possibly `openpyxl` Python packages via pip for the code to work.

In [None]:
# Install required libraries

!pip install tqdm
!pip install xlsxwriter
!pip install openpyxl

In [None]:
%reload_ext autoreload
%autoreload 2

from configData import configVars, captionsFolder, outputFolder
from topicExtractor import retrieveTopics
from transcriptLoader import retrieveTranscript
from questionGenerator import retrieveQuestions
from questionGenerator import processCaptions
import pandas as pd
import glob
import os
import shutil
from tqdm import tqdm
from datetime import datetime
from pandas.io.formats import excel
# Clear pandas' built-in header cell style for every Excel export in this session.
# NOTE(review): presumably done so the column formats applied in makeExcelComparer
# are not overridden by the default header styling — confirm.
excel.ExcelFormatter.header_style = None

# Videos for which the questions are to be generated. Each entry is the name of a
# sub-folder of the Captions folder (makeExcelComparer looks for its "*.srt" file there).
# To generate questions for all videos in the Captions folder, set fileList = os.listdir(captionsFolder)
fileList = [
    "IMSE 514 Presentation",
    "Kalpana-GenAI-Das",
    "Kalpana-maizey2-das",
    "Kalpana-U-M MaizeyDas",
    "New Google Assignments in Canvas",
    "New Quizzes - Basics",
    "New Quizzes Video",
    "Piazza Introduction Workshop",
    # "Rearrange Playlist video",
    "Sakai",
]

# Excel column letters that makeExcelComparer widens to 50 and text-wraps.
# The keys only label which DataFrame column is expected to land in that Excel column.
# The letters are hardcoded, based on the known column each would be assigned to in the Excel file;
# this was not made dynamic as it is a one-time use case.
wrappedCols  = {'Question': 'C', 'Answers':'G', 'Reason':'I'}

def makeExcelComparer(DFData, fileName, transcript):
    """
    Generate an Excel workbook for manually evaluating the generated questions.

    The question DataFrames of both generation models are stacked into a single
    sheet, every question row is annotated with the transcript text that falls
    inside its time range, and the video's SRT file is copied next to the
    workbook for reference.

    Args:
        DFData (dict): Maps the generation model name ('LangChain' and
            'BERTopic') to the pandas.DataFrame of questions generated by that
            model. Each DataFrame must provide 'Start' and 'End' columns
            holding '%H:%M:%S' strings.
        fileName (str): Name of the video being analyzed. It is used as the
            sub-folder name inside the captions and output folders, and
            (truncated to 31 characters) as the sheet name.
        transcript (pandas.DataFrame): Transcript data with 'Start', 'End' and
            'Combined Lines' columns. 'Start' and 'End' must be comparable
            with datetime objects.

    Returns:
        None

    Raises:
        FileNotFoundError: If no ".srt" file exists for the video in the
            captions folder.
    """

    # Make sure the per-video output folder exists before anything is written to it.
    videoOutputFolder = os.path.join(outputFolder, fileName)
    os.makedirs(videoOutputFolder, exist_ok=True)

    # Find the path to save the Excel file.
    dfSavePath = os.path.join(
        videoOutputFolder,
        f"Questions Analysis - {fileName}.xlsx",
    )

    # The SRT file is copied to the output folder for reference.
    # The folder part is escaped so that glob metacharacters in a video name are
    # matched literally, and the matches are sorted so the chosen file is deterministic.
    captionsDir = os.path.join(captionsFolder, fileName)
    srtMatches = sorted(glob.glob(os.path.join(glob.escape(captionsDir), "*.srt")))
    if not srtMatches:
        raise FileNotFoundError(f"No .srt file found in '{captionsDir}'")
    srtFile = srtMatches[0]

    # Find the path to save the SRT file.
    srtCopy = os.path.join(
        videoOutputFolder,
        f"Transcript - {fileName}.srt",
    )
    # Copy the SRT file to the output folder.
    shutil.copyfile(srtFile, srtCopy)

    # Stack the questions of both models; the original index is kept as a column.
    mergedDF = pd.concat([DFData['LangChain'], DFData['BERTopic']]).reset_index(drop=False)

    # Collect the transcript text for each question, then assign the column once
    # instead of mutating the DataFrame while iterating over it.
    transcriptTexts = []
    for _, row in mergedDF.iterrows():
        start = datetime.strptime(row['Start'], '%H:%M:%S')
        end = datetime.strptime(row['End'], '%H:%M:%S')
        transcriptSlice = transcript[(transcript['Start'] >= start) & (transcript['End'] <= end)]
        if transcriptSlice.empty:
            # No transcript row lies fully inside the range: fall back to the
            # first row that starts at or after the question's start time.
            transcriptSlice = transcript[transcript['Start'] >= start].head(1)
        transcriptTexts.append(" ".join(transcriptSlice['Combined Lines'].tolist()))
    mergedDF['Transcript'] = transcriptTexts

    # Write the data to an Excel file.
    # The formatting below is a manual effort to make the Excel file readable
    # without having to format it by hand afterwards.
    sheetName = fileName[:31]  # Excel limits sheet names to 31 characters.
    with pd.ExcelWriter(dfSavePath, engine="xlsxwriter") as writer:
        mergedDF.to_excel(writer, sheet_name=sheetName)
        workbook = writer.book
        worksheet = writer.sheets[sheetName]

        wrap_format = workbook.add_format({'text_wrap': True})
        header_format = workbook.add_format({'bold': True, 'text_wrap': True, 'align': 'center'})

        # Positions start at 1 because to_excel writes the DataFrame index into column 0.
        for position, col in enumerate(mergedDF.columns, start=1):
            if col.startswith('Is this question:'):
                worksheet.set_column(position, position, 15, header_format)
            else:
                worksheet.set_column(position, position, None, wrap_format)

        # Widen the long-text columns listed in wrappedCols.
        for columnLetter in wrappedCols.values():
            worksheet.set_column(f"{columnLetter}:{columnLetter}", 50, wrap_format)

        # NOTE(review): 'M' is presumably the 'Transcript' column — confirm against the sheet layout.
        worksheet.set_column('M:M', 200, wrap_format)

        # No explicit writer.close() here: leaving the `with` block saves and
        # closes the workbook, and closing it a second time is redundant.

# Build one comparison workbook for every video named in fileList.
for fileName in tqdm(fileList):

    # Question DataFrames for the current video, keyed by generation model name.
    DFData = {}
    for generationModel in ('LangChain', 'BERTopic'):

        print(f"Processing {fileName} with {generationModel}...")

        # A fresh configuration is created for every (video, model) combination.
        config = configVars()
        config.setFromEnv()
        config.videoToUse = fileName
        config.generationModel = generationModel
        if config.generationModel == "LangChain":
            config.windowSize = 120

        videoData = retrieveTranscript(config)

        # Only the BERTopic model needs a topic modeller; every other model gets None.
        if config.generationModel != "BERTopic":
            topicModeller = None
        else:
            print(f"\t--> Retrieving Topics for {config.videoToUse}...")
            topicModeller = retrieveTopics(config, videoData)

        generatedData = retrieveQuestions(
            config,
            videoData=videoData,
            topicModeller=topicModeller,
        )
        DFData[generationModel] = generatedData.makeDF()

    # Write the Excel file for this video, using the transcript of the last model processed.
    makeExcelComparer(DFData, fileName, videoData.combinedTranscript)