In [None]:
! pip install deepgram-sdk requests matplotlib youtube_dl ffmpeg-python

In [None]:
# Step 1: download the audio of each listed YouTube video with youtube_dl.
import youtube_dl

# URLs handed to ydl.download() below; change this variable to change what you download
vids = ['https://www.youtube.com/watch?v=VBPTFlpv31k&list=PLrAXtmErZgOdP_8GztsuKi9nrraNbKKp4&index=1']

# Options dict passed to the youtube_dl.YoutubeDL constructor.
ydl_opts = {
    'format': 'bestaudio/best',
    'postprocessors': [{
        # run the FFmpegExtractAudio post-processor, asking for mp3 at quality '192'
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
    # change this to change where you download it to
    # (./lex/audio is the directory the transcription step lists later on)
    # NOTE(review): the template fixes the ".mp3" suffix itself rather than using the
    # %(ext)s field — confirm the file names produced after post-processing are as expected.
    'outtmpl': './lex/audio/%(title)s.mp3',
}
# The downloader is used as a context manager; download() receives the whole URL list.
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(vids)

In [None]:
# next we use deepgram to get a transcription
from deepgram import Deepgram
import asyncio, json, os

from config import dg_key
# Deepgram client built from the key imported from the local `config` module.
dg = Deepgram(dg_key)

# Request options passed unchanged to every dg.transcription.prerecorded() call in main().
# NOTE(review): later cells read a "paragraphs" section and a per-paragraph "speaker"
# field from the saved JSON; presumably "paragraphs" and "diarize" request those — confirm
# against the Deepgram API documentation.
options = {
    "diarize": True,
    "punctuate": True,
    "paragraphs": True,
    "model": 'general',
    "tier": 'enhanced'
}

async def main():
    """Transcribe every file in ``lex/audio`` and save the raw responses as JSON.

    Each file (except those whose name contains "Bishop Robert Barron") is sent
    to ``dg.transcription.prerecorded`` with the module-level ``options`` and
    the awaited response is dumped to ``lex/transcripts/<name>.json``, where
    ``<name>`` is the audio file name without its extension.

    The output directory is created up front so that a missing folder cannot
    make the write fail after the transcription request has already been made.

    Returns:
        None.
    """
    audio_dir = "lex/audio"
    transcript_dir = "lex/transcripts"
    # Create the destination before any request is issued.
    os.makedirs(transcript_dir, exist_ok=True)
    for podcast in os.listdir(audio_dir):
        if "Bishop Robert Barron" in podcast:
            continue
        print(podcast)
        with open(f"{audio_dir}/{podcast}", "rb") as audio:
            source = {"buffer": audio, "mimetype":'audio/mp3'}
            res = await dg.transcription.prerecorded(source, options)
        # splitext drops whatever extension is present instead of assuming
        # it is exactly four characters long.
        stem = os.path.splitext(podcast)[0]
        with open(f"{transcript_dir}/{stem}.json", "w") as transcript:
            json.dump(res, transcript)

# Run the transcription coroutine to completion.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already running
# in the current thread; if this cell is executed inside a notebook kernel that already
# runs a loop, `await main()` is the usual equivalent — confirm how this cell is run.
asyncio.run(main())

In [None]:
# pretty print transcripts into a script and separate speakers
import json
import os

# create transcripts
def create_transcripts():
    """Write the paragraph-formatted transcript of every JSON file to a text file.

    For each file in ``lex/transcripts_unenhanced`` the ``paragraphs`` ->
    ``transcript`` value of the first alternative of the first channel is
    printed and then written to ``lex/pretty_scripts/<name>.txt``, where
    ``<name>`` is the file name with its last five characters removed.
    """
    for filename in os.listdir("lex/transcripts_unenhanced"):
        with open(f"lex/transcripts_unenhanced/{filename}", "r") as raw:
            payload = json.load(raw)
        best_alternative = payload["results"]["channels"][0]["alternatives"][0]
        script = best_alternative["paragraphs"]['transcript']
        print(script)
        with open(f"lex/pretty_scripts/{filename[:-5]}.txt", "w") as out:
            # writelines consumes the value piece by piece, exactly like a
            # write() call per iterated element.
            out.writelines(script)
# create_transcripts()

# separate transcripts by speaker
# label speakers by printing first lines by the speaker
# coalesce them into one file
def assign_speakers():
    """Interactively replace ``Speaker N`` labels with names in every script.

    For each file in ``lex/pretty_scripts`` the first line of every label not
    yet named is printed and the user is asked who is speaking. An answer of
    at most one character skips that line, so the same label is asked about
    again on its next line. Every occurrence of a named label is then
    replaced with the name and the file is rewritten in place.

    The lines returned by ``readlines()`` already end with their newline, so
    they are joined with an empty string; joining with a newline added an
    extra blank line between all lines every time the function ran.

    Limitation: the label is the first nine characters of the line
    ("Speaker " plus one character), so labels with two-digit numbers are
    not told apart.
    """
    for filename in os.listdir("lex/pretty_scripts"):
        print(f"Current File: {filename}")
        with open(f"lex/pretty_scripts/{filename}", "r") as f:
            lines = f.readlines()
        spoken = []
        names = []
        for line in lines:
            if not line.startswith("Speaker "):
                continue
            label = line[:9]
            if label in spoken:
                continue
            print(line)
            name = input("Who is the Speaker?")
            if len(name) <= 1:
                # Too short to be a name: leave this label unassigned for now.
                continue
            spoken.append(label)
            names.append(name)
        print(spoken)
        print(names)
        # Lines keep their own line endings; do not add another separator.
        filedata = "".join(lines)
        print(filedata)
        for speaker, name in zip(spoken, names):
            filedata = filedata.replace(speaker, name)
        with open(f"lex/pretty_scripts/{filename}", "w") as f:
            f.write(filedata)

# assign_speakers()

In [None]:
# visualization
import json
import os

def divide_times():
    """Total the speaking time per named speaker for every transcript.

    Reads each JSON file in ``lex/transcripts_unenhanced`` and walks the
    paragraph list of the first alternative of the first channel. The first
    time a ``speaker`` value is seen within a file, the paragraph is printed
    and the user is asked for a name. Every paragraph's ``end - start`` is
    added to that name's total. The mapping ``{filename: {name: total}}`` is
    written to ``./lex/time_speaking.json``.
    """
    totals_by_file = {}
    for filename in os.listdir("lex/transcripts_unenhanced"):
        print(f"Current filename: {filename}")
        with open(f"lex/transcripts_unenhanced/{filename}", "r") as handle:
            data = json.load(handle)
        first_alternative = data["results"]["channels"][0]["alternatives"][0]
        blocks = first_alternative["paragraphs"]["paragraphs"]
        totals = {}
        name_for_id = {}
        for block in blocks:
            duration = block["end"] - block["start"]
            speaker_id = block["speaker"]
            if speaker_id not in name_for_id:
                # First appearance of this id in the file: ask who it is.
                print(block)
                name_for_id[speaker_id] = input("Who is the speaker?")
            who = name_for_id[speaker_id]
            totals[who] = totals[who] + duration if who in totals else duration
        totals_by_file[filename] = totals
    with open("./lex/time_speaking.json", "w") as out:
        json.dump(totals_by_file, out)

# Runs when the cell executes: prompts via input() for each new speaker id in every
# transcript and rewrites ./lex/time_speaking.json.
divide_times()

def words_said():
    """Count the words spoken by each speaker in every pretty script.

    Every line of every file in ``lex/pretty_scripts`` that contains a colon
    is split at the FIRST colon into a speaker label and the spoken text.
    The number of whitespace-separated words in the text is added to that
    speaker's total. The mapping ``{filename: {speaker: words}}`` is written
    to ``./lex/word_split.json``.

    Two things are fixed here: the totals are words rather than characters,
    and text after a second colon (for example a time such as "10:30") is no
    longer dropped.
    """
    word_split = {}
    for filename in os.listdir("lex/pretty_scripts"):
        print(f"Current filename: {filename}")
        with open(f"lex/pretty_scripts/{filename}", "r") as file:
            lines = file.readlines()
        file_word_split = {}
        for line in lines:
            if ":" not in line:
                continue
            # maxsplit=1 keeps any later colons inside the spoken text.
            speaker, spoken = line.split(":", 1)
            file_word_split[speaker] = file_word_split.get(speaker, 0) + len(spoken.split())
        word_split[filename] = file_word_split
    with open("./lex/word_split.json", "w") as f:
        json.dump(word_split, f)

# Runs when the cell executes: rewrites ./lex/word_split.json, the file vis_words() reads.
words_said()

import matplotlib.pyplot as plt
import json

# plot speaking times bar charts
def vis_time():
    """Plot a stacked bar chart of speaking time, one bar per episode.

    Reads ``./lex/time_speaking.json`` (``{filename: {speaker: total}}``).
    For each episode, every speaker whose name contains "Lex" counts towards
    the "Lex" segment and all other speakers count towards the "Guest"
    segment. The bar is labelled with the guest names joined by " & ", or
    with the file name when the episode has no guest entry.

    Exactly one label, one Lex value and one guest value are appended per
    episode, so the three lists always have equal length. Previously an
    episode with several guests, or without a Lex entry, left them
    misaligned and the bar call failed.

    The figure is saved to ``./lex/time_speaking.png`` and shown.
    """
    with open("./lex/time_speaking.json", "r") as f:
        time_dict = json.load(f)
    labels = []
    lex = []
    guest = []
    for filename, podcast in time_dict.items():
        # Substring test, as before: any name containing "Lex" is the host.
        guests = {name: value for name, value in podcast.items() if "Lex" not in name}
        lex.append(sum(value for name, value in podcast.items() if "Lex" in name))
        guest.append(sum(guests.values()))
        labels.append(" & ".join(guests) if guests else filename)
    print(labels)
    print(lex)
    print(guest)
    width = 0.3
    fig, ax = plt.subplots()
    ax.bar(labels, lex, width, label="Lex")
    # Guest segment is stacked on top of the Lex segment.
    ax.bar(labels, guest, width, bottom=lex, label="Guest")
    ax.set_ylabel("Time Spent Speaking")
    ax.set_title("Lex vs Guests Speaking Time")
    ax.legend()
    plt.xticks(rotation=45, ha="right")
    fig.tight_layout()
    plt.savefig("./lex/time_speaking.png", pad_inches=1)
    plt.show()

# Runs when the cell executes: draws the chart from ./lex/time_speaking.json and saves the PNG.
vis_time()

# plot graph for words said
def vis_words():
    """Plot a stacked bar chart of words said, one bar per episode.

    Reads ``./lex/word_split.json`` (``{filename: {speaker: count}}``).
    For each episode, every speaker whose name contains "Lex" counts towards
    the "Lex" segment and all other speakers count towards the "Guest"
    segment. The bar is labelled with the guest names joined by " & ", or
    with the file name when the episode has no guest entry.

    Exactly one label, one Lex value and one guest value are appended per
    episode, so the three lists always have equal length. Previously an
    episode with several guests, or without a Lex entry, left them
    misaligned and the bar call failed.

    The figure is saved to ``./lex/words_said.png`` and shown.
    """
    with open("./lex/word_split.json", "r") as f:
        word_dict = json.load(f)
    labels = []
    lex = []
    guest = []
    for filename, podcast in word_dict.items():
        # Substring test, as before: any name containing "Lex" is the host.
        guests = {name: count for name, count in podcast.items() if "Lex" not in name}
        lex.append(sum(count for name, count in podcast.items() if "Lex" in name))
        guest.append(sum(guests.values()))
        labels.append(" & ".join(guests) if guests else filename)
    print(labels)
    print(lex)
    print(guest)
    width = 0.3
    fig, ax = plt.subplots()
    ax.bar(labels, lex, width, label="Lex")
    # Guest segment is stacked on top of the Lex segment.
    ax.bar(labels, guest, width, bottom=lex, label="Guest")
    ax.set_ylabel("Words Said")
    ax.set_title("Lex vs Guests Number of Words Said")
    ax.legend()
    plt.xticks(rotation=45, ha="right")
    fig.tight_layout()
    plt.savefig("./lex/words_said.png", pad_inches=1)
    plt.show()

# Runs when the cell executes: draws the chart from ./lex/word_split.json and saves the PNG.
vis_words()

In [None]:
# mcp/ner/summaries
import os
import requests
import json
import time
from config import textapi_key

# HTTP headers sent with every request to app.thetextapi.com in this cell:
# the API key imported from the local `config` module, plus a JSON content type.
headers = {
    "apikey": textapi_key,
    "Content-Type": "application/json"
}

def mcp(text: str, filename: str, max_sentences: int = 1500):
    """Fetch the most common phrases of ``text`` and write them to a file.

    The text is split on "." and sent to the ``most_common_phrases`` endpoint
    in chunks of at most ``max_sentences`` pieces, asking for 5 phrases per
    chunk. Chunks are rebuilt WITH their periods, so concatenating all chunks
    gives back ``text`` exactly; previously the periods were replaced by
    spaces and the service received text without sentence boundaries.

    Args:
        text: Full transcript text.
        filename: Base name of the output file (no extension).
        max_sentences: Maximum number of "."-separated pieces per request.
            Must be positive. Defaults to 1500.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        KeyError: If the response has no "most common phrases" entry.

    Side effects:
        Prints size and timing information and writes one phrase per line to
        ``lex/most_common_phrases/<filename>.txt``, creating the directory
        if needed.
    """
    print(f"Number of Characters: {len(text)}")
    sentences = text.split(".")
    print(f"Number of Sentences: {len(sentences)}")
    words = len(text.split(" "))
    print(f"Number of Words: {words}")
    chunks = []
    for first in range(0, len(sentences), max_sentences):
        chunk = ".".join(sentences[first:first + max_sentences])
        if first + max_sentences < len(sentences):
            # Restore the period that separated this chunk from the next one.
            chunk += "."
        chunks.append(chunk)
    phrase_lists = []
    for chunk in chunks:
        body = {
            "text": chunk,
            "num_phrases": 5
        }
        start = time.time()
        res = requests.post(url="https://app.thetextapi.com/text/most_common_phrases", headers=headers, json=body)
        print(f"Time elapsed: {time.time() - start} seconds")
        # Fail with the HTTP error itself rather than a KeyError on the body.
        res.raise_for_status()
        phrase_lists.append(json.loads(res.text)["most common phrases"])
    print(phrase_lists)
    os.makedirs("lex/most_common_phrases", exist_ok=True)
    with open(f"lex/most_common_phrases/{filename}.txt", "w") as file:
        for phrases in phrase_lists:
            for phrase in phrases:
                file.write(phrase+"\n")

def ner(text: str, filename: str, max_sentences: int = 1500):
    """Run named-entity recognition on ``text`` and write the results to a file.

    The text is split on "." and sent to the ``ner`` endpoint in chunks of at
    most ``max_sentences`` pieces. Chunks are rebuilt WITH their periods, so
    concatenating all chunks gives back ``text`` exactly; previously the
    periods were replaced by spaces and the service received text without
    sentence boundaries.

    Args:
        text: Full transcript text.
        filename: Base name of the output file (no extension).
        max_sentences: Maximum number of "."-separated pieces per request.
            Must be positive. Defaults to 1500.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        KeyError: If the response has no "ner" entry.

    Side effects:
        Prints size and timing information and writes one line per returned
        entry (its items separated by spaces) to ``lex/ner/<filename>.txt``,
        creating the directory if needed.
    """
    print(f"Number of Characters: {len(text)}")
    sentences = text.split(".")
    print(f"Number of Sentences: {len(sentences)}")
    words = len(text.split(" "))
    print(f"Number of Words: {words}")
    pieces = [
        sentences[first:first + max_sentences]
        for first in range(0, len(sentences), max_sentences)
    ]
    entity_lists = []
    for index, piece in enumerate(pieces):
        chunk = ".".join(piece)
        if index < len(pieces) - 1:
            # Restore the period that separated this chunk from the next one.
            chunk += "."
        body = {
            "text": chunk
        }
        chunk_words = len(chunk.split(" "))
        # Report the real size of this chunk (the last one is usually shorter).
        print(f"Processing {len(piece)} Sentences, {chunk_words} Words")
        start = time.time()
        res = requests.post(url="https://app.thetextapi.com/text/ner", headers=headers, json=body)
        print(f"Time elapsed: {time.time() - start} seconds")
        # Fail with the HTTP error itself rather than a KeyError on the body.
        res.raise_for_status()
        entity_lists.append(json.loads(res.text)["ner"])
    os.makedirs("lex/ner", exist_ok=True)
    with open(f"lex/ner/{filename}.txt", "w") as file:
        for entities in entity_lists:
            for phrase in entities:
                for word in phrase:
                    file.write(word+" ")
                file.write("\n")

def summarize(text: str, filename: str, max_sentences: int = 1500):
    """Summarize ``text`` chunk by chunk and write the summaries to a file.

    The text is split on "." and sent to the ``summarize`` endpoint in chunks
    of at most ``max_sentences`` pieces. Chunks are rebuilt WITH their
    periods, so concatenating all chunks gives back ``text`` exactly;
    previously the periods were replaced by spaces and the service received
    text without sentence boundaries.

    Args:
        text: Full transcript text.
        filename: Title to print and base name of the output file.
        max_sentences: Maximum number of "."-separated pieces per request.
            Must be positive. Defaults to 1500.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        KeyError: If the response has no "summary" entry.

    Side effects:
        Prints size and timing information and writes the chunk summaries
        back to back to ``lex/summarize/<filename>.txt``, creating the
        directory if needed.
    """
    print(f"Title: {filename}")
    print(f"Number of Characters: {len(text)}")
    sentences = text.split(".")
    print(f"Number of Sentences: {len(sentences)}")
    words = len(text.split(" "))
    print(f"Number of Words: {words}")
    chunks = []
    for first in range(0, len(sentences), max_sentences):
        chunk = ".".join(sentences[first:first + max_sentences])
        if first + max_sentences < len(sentences):
            # Restore the period that separated this chunk from the next one.
            chunk += "."
        chunks.append(chunk)
    summaries = []
    for chunk in chunks:
        body = {
            "text": chunk
        }
        start = time.time()
        res = requests.post(url="https://app.thetextapi.com/text/summarize", headers=headers, json=body)
        print(f"Time elapsed: {time.time() - start} seconds")
        # Fail with the HTTP error itself rather than a KeyError on the body.
        res.raise_for_status()
        summaries.append(json.loads(res.text)["summary"])
    os.makedirs("lex/summarize", exist_ok=True)
    with open(f"lex/summarize/{filename}.txt", "w") as file:
        for summary in summaries:
            file.write(summary)

def main():
    """Summarize the full transcript text of every stored transcript.

    For each JSON file in ``lex/transcripts_unenhanced`` the ``transcript``
    string of the first alternative of the first channel is passed to
    ``summarize`` together with the file name minus its last five characters.
    """
    for filename in os.listdir("lex/transcripts_unenhanced"):
        with open(f"lex/transcripts_unenhanced/{filename}", "r") as handle:
            payload = json.load(handle)
        first_alternative = payload["results"]["channels"][0]["alternatives"][0]
        summarize(first_alternative["transcript"], filename[:-5])

# Runs when the cell executes: summarizes every transcript in lex/transcripts_unenhanced.
# NOTE(review): the `from text_analysis import headers` line in the next cell says this
# call must be commented out before that import — presumably this cell is also kept as a module.
main()

In [None]:
# separate most common phrases
import requests
import json
import os

from text_analysis import headers # gotta comment out main() in text_analytics to do this

# Re-run MCP on both
def nlp():
    """Fetch the most common phrases per speaker for every pretty script.

    Every line of every file in ``lex/pretty_scripts`` that contains a colon
    is split at the FIRST colon into a speaker label and the spoken text, and
    the text is appended to what that speaker has said so far. Splitting only
    once keeps text after a later colon (for example a time such as "10:30"),
    which used to be dropped. Each speaker's combined text is then sent to
    the ``most_common_phrases`` endpoint (5 phrases) and the result is
    written, one phrase per line, to
    ``lex/most_common_phrases/<speaker> in <filename>``.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        KeyError: If the response has no "most common phrases" entry.
    """
    for filename in os.listdir("lex/pretty_scripts"):
        print(f"Current File: {filename}")
        with open(f"lex/pretty_scripts/{filename}", "r") as f:
            lines = f.readlines()
        separated_speakers = dict()
        for line in lines:
            if ":" not in line:
                continue
            label, spoken_text = line.split(":", 1)
            # NOTE(review): the first character of the label is discarded, which
            # presumably expects a one-character prefix (e.g. a leading space)
            # before the name; words_said() keys on the unsliced label — confirm
            # against the actual pretty_scripts format.
            speaker = label[1:]
            separated_speakers[speaker] = separated_speakers.get(speaker, "") + spoken_text
        # Make sure the destination exists before any request is made.
        os.makedirs("lex/most_common_phrases", exist_ok=True)
        for speaker, spoken in separated_speakers.items():
            body = {
                "text": spoken,
                "num_phrases": 5
            }
            res = requests.post("https://app.thetextapi.com/text/most_common_phrases", headers=headers, json=body)
            # Fail with the HTTP error itself rather than a KeyError on the body.
            res.raise_for_status()
            phrases = json.loads(res.text)["most common phrases"]
            with open(f"lex/most_common_phrases/{speaker} in {filename}", "w") as out:
                for entry in phrases:
                    out.write(f"{entry}\n")

# Runs when the cell executes: one most_common_phrases request per speaker per script.
nlp()