In [4]:
from datetime import datetime
import glob
import pandas as pd


def processSrtFile(srtFile, encoding="utf-8-sig"):
    """Parse a SubRip (.srt) caption file into a DataFrame of captions.

    Parameters
    ----------
    srtFile : str or list/tuple of str
        Path to the .srt file. When a list/tuple is given (e.g. the result of
        ``glob.glob``), only its first element is read.
    encoding : str, default "utf-8-sig"
        Text encoding used to read the file. The default transparently drops
        a UTF-8 byte-order mark, which would otherwise stick to the first cue
        index and leak into the first caption's text.

    Returns
    -------
    pandas.DataFrame
        One row per caption with columns ``Line`` (caption text, multi-line
        captions joined by single spaces), ``Start`` and ``End`` (``datetime``
        values parsed with ``%H:%M:%S,%f``, so they carry the default
        1900-01-01 date). The three columns are present even when the file
        holds no captions.

    Raises
    ------
    IndexError
        If ``srtFile`` is an empty list/tuple.
    ValueError
        If a timing line cannot be parsed.
    """
    if isinstance(srtFile, (list, tuple)):
        if not srtFile:
            raise IndexError("processSrtFile received an empty list of .srt paths")
        srtFile = srtFile[0]

    with open(srtFile, "r", encoding=encoding) as f:
        lines = f.read().splitlines()

    timeFormat = "%H:%M:%S,%f"
    transcript = []

    textParts = []
    start_time = None
    end_time = None
    inCue = False  # True between a timing line and the blank line closing the cue

    # The trailing "" acts as a sentinel blank line so the final cue is
    # flushed even when the file does not end with an empty line.
    for line in lines + [""]:
        line = line.strip()
        if not line:
            # Blank line closes the current cue. Repeated blank lines and
            # cues without any text produce no row.
            if inCue and textParts:
                transcript.append(
                    {"Line": " ".join(textParts), "Start": start_time, "End": end_time}
                )
            textParts = []
            inCue = False
        elif "-->" in line:
            startRaw, endRaw = line.split("-->", 1)
            start_time = datetime.strptime(startRaw.strip(), timeFormat)
            end_time = datetime.strptime(endRaw.strip(), timeFormat)
            inCue = True
        elif inCue:
            # Everything after the timing line is caption text, including
            # lines made only of digits (e.g. "2023").
            textParts.append(line)
        # Otherwise: a cue index (or stray text) outside a cue -> ignored.

    return pd.DataFrame(transcript, columns=["Line", "Start", "End"])


def lineCombiner(transcript, windowSize=20):
    """Merge consecutive captions into chunks spanning roughly ``windowSize`` seconds.

    Captions are ordered by ``Start``. The first window is anchored at the
    first caption's start; each following window is anchored at the ``End``
    of the previous chunk. A caption joins the current chunk while its start
    lies less than ``windowSize`` seconds after the anchor. If the next
    caption starts ``windowSize`` seconds or more after the anchor (a long
    silence), the anchor jumps to that caption so the chunk is never empty.
    Every caption ends up in exactly one chunk, including overlapping
    captions and the final one.

    Parameters
    ----------
    transcript : pandas.DataFrame
        Frame with columns ``Line``, ``Start`` and ``End`` where ``Start`` /
        ``End`` are datetime-like values.
    windowSize : int or float, default 20
        Window length in seconds.

    Returns
    -------
    pandas.DataFrame
        Columns ``Combined Lines`` (chunk text joined by spaces), ``Start``
        (start of the chunk's first caption) and ``End`` (end of its last
        caption). Empty, but with these columns, for an empty transcript.
    """
    columns = ["Combined Lines", "Start", "End"]
    if len(transcript) == 0:
        return pd.DataFrame(columns=columns)

    # Stable sort keeps captions that share a start time in their file order.
    transcript = transcript.sort_values(by="Start", kind="stable")
    lines = transcript["Line"].tolist()
    starts = transcript["Start"].tolist()
    ends = transcript["End"].tolist()
    window = pd.Timedelta(seconds=windowSize)
    total = len(lines)

    combinedTranscript = []
    anchor = starts[0]
    first = 0
    while first < total:
        # A gap of a full window or more would leave this chunk empty:
        # re-anchor on the next caption instead.
        if starts[first] - anchor >= window:
            anchor = starts[first]

        # Walk by position rather than filtering on time, so captions that
        # start before the previous chunk's end are not skipped.
        last = first + 1
        while last < total and starts[last] - anchor < window:
            last += 1

        combinedTranscript.append(
            {
                "Combined Lines": " ".join(lines[first:last]),
                "Start": starts[first],
                "End": ends[last - 1],
            }
        )

        anchor = ends[last - 1]
        first = last  # always advances, so the loop terminates

    return pd.DataFrame(combinedTranscript, columns=columns)


# Caption files are looked up as <captionsFolder>/<video name>/*.srt.
captionsFolder = "Captions"
videoNames = ["New Quizzes Video", "Rearrange Playlist video"]

# Per-video results, all keyed by the video name.
srtFiles = {}             # sorted list of matching .srt paths
transcripts = {}          # DataFrame of individual captions
sentences = {}            # the whole transcript as one string
combinedTranscripts = {}  # captions merged into 30-second windows
for video in videoNames:
    # glob returns matches in arbitrary order; sorting makes the file that
    # processSrtFile picks (the first one) reproducible across runs.
    srtFiles[video] = sorted(glob.glob(f"{captionsFolder}/{video}/*.srt"))
    if not srtFiles[video]:
        # Fail here with the offending folder instead of an opaque
        # IndexError from inside processSrtFile.
        raise FileNotFoundError(f"No .srt file found in {captionsFolder}/{video}")
    transcripts[video] = processSrtFile(srtFiles[video])
    sentences[video] = " ".join(transcripts[video]["Line"].tolist())
    combinedTranscripts[video] = lineCombiner(transcripts[video], windowSize=30)

In [5]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# Model the first video: one document per combined caption window, with the
# window's start time as that document's timestamp.
video = videoNames[0]
timestamps = combinedTranscripts[video]["Start"].tolist()
docs = combinedTranscripts[video]["Combined Lines"].tolist()

# Fit BERTopic using the KeyBERTInspired topic representation.
representation_model = KeyBERTInspired()
topic_model = BERTopic(representation_model=representation_model)
topics, probs = topic_model.fit_transform(docs)

# Disabled: hierarchical topic view.
# hierarchical_topics = topic_model.hierarchical_topics(docs)
# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

# Topic frequency across the video timeline, bucketed into 20 bins.
# Kept as the cell's last expression so the figure is displayed.
topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)
topic_model.visualize_topics_over_time(topics_over_time)

In [None]:
from helper import *
import tiktoken
from bertopic.representation import OpenAI

# Same pipeline as the KeyBERT cell, but topic labels are produced by an
# OpenAI chat model through BERTopic's OpenAI representation.
video = videoNames[0]
timestamps = combinedTranscripts[video]["Start"].tolist()
docs = combinedTranscripts[video]["Combined Lines"].tolist()

# Config / OpenAIBot presumably come from the `helper` star import - verify.
config = Config()
config.setFromEnv()

# NOTE(review): this rebinds the name `OpenAIBot` from the callable to the
# object it returns; it is only callable again after `from helper import *`
# is re-executed. Consider a distinct instance name.
OpenAIBot = OpenAIBot(config)
tokenizer = tiktoken.encoding_for_model(OpenAIBot.model)

# nr_docs / doc_length / tokenizer are passed through to BERTopic's OpenAI
# representation; presumably they cap how much text is sent per topic - TODO confirm.
representation_model = OpenAI(
    OpenAIBot.client,
    model=OpenAIBot.model,
    delay_in_seconds=2,
    chat=True,
    nr_docs=4,
    doc_length=100,
    tokenizer=tokenizer,
)
topic_model = BERTopic(representation_model=representation_model)
topics, probs = topic_model.fit_transform(docs)

# Disabled: hierarchical topic view.
# hierarchical_topics = topic_model.hierarchical_topics(docs)
# topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

# Topic frequency across the video timeline, bucketed into 20 bins.
# Kept as the cell's last expression so the figure is displayed.
topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=20)
topic_model.visualize_topics_over_time(topics_over_time)

In [None]:
from helper import *

# Build a fresh Config and populate it via setFromEnv()
# (presumably reads settings from environment variables - verify in helper).
config = Config()
config.setFromEnv()

# NOTE(review): this rebinds the name `LangChainBot` from the callable
# (presumably a class exported by `helper`) to the object it returns, so the
# callable is only reachable again after `from helper import *` is re-run.
LangChainBot = LangChainBot(config)

# Expose the bot's `chain` attribute under a short module-level name.
chain = LangChainBot.chain