In [8]:
import os
import re
import pandas as pd
from collections import Counter

# Directory containing VTT files
folder_path = "/workspaces/Marquette_teaching_finance/recording_transcripts/"

# Dictionary to store speaker counts per file
all_counts = {}

# Loop through VTT files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".vtt") and filename != "Final_presentation.vtt":
        filepath = os.path.join(folder_path, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # Extract speaker names
        speakers = re.findall(r"<v\s+([^>]+)>", content)
        speaker_counts = Counter(speakers)

        # Use filename (without .vtt) as column name
        file_key = filename.replace(".vtt", "")
        all_counts[file_key] = speaker_counts

# Combine all counts into a DataFrame
df = pd.DataFrame(all_counts).fillna(0).astype(int)

# Add 'speakers' column from index
df["speakers"] = df.index

# Add 'total' column
df["total"] = df.drop(columns=["speakers"]).sum(axis=1)

# Reorder columns: speakers first, then totals, then the rest
cols = ["speakers", "total"] + [col for col in df.columns if col not in ["speakers", "total"]]
df = df[cols]

# Sort by total, descending
df = df.sort_values(by="total", ascending=False).reset_index(drop=True)

# Display the result
print(df)


               speakers  total  Week_0217  Week_0414  Week_0407  Week_0210  \
0             Hou, Eric  18460       1874         55         73       2216   
1          Sam Iosevich    952          0          0        952          0   
2                Fei Li    910          0        910          0          0   
3        Haroon, Sufyan    208         28          0          0         51   
4        Benbenek, MATT    118         21         15          0         19   
5       Teske, Benjamin     91         11          0          1          2   
6       Mertens, Aidric     90         24          5          1          0   
7         Severin, Noah     70          8          0          1          8   
8   Hollenbach, Patrick     68          0         21          0          0   
9        Capozzoli, Jay     32          2          0          0          0   
10        Matt Benbenek     26          0          0          0          0   
11    Benbenek, Matthew     24          0          0          0 

In [9]:
df.head()

Unnamed: 0,speakers,total,Week_0217,Week_0414,Week_0407,Week_0210,Week_0127,Week_0324,Week_0303,Week_0317,Week_0414_2,Week_0113_2,Week_0113,Week_0224,Week_0407_2
0,"Hou, Eric",18460,1874,55,73,2216,1825,1942,2062,1886,1214,1074,904,2193,1142
1,Sam Iosevich,952,0,0,952,0,0,0,0,0,0,0,0,0,0
2,Fei Li,910,0,910,0,0,0,0,0,0,0,0,0,0,0
3,"Haroon, Sufyan",208,28,0,0,51,6,3,19,5,10,5,18,7,56
4,"Benbenek, MATT",118,21,15,0,19,18,7,17,18,3,0,0,0,0


In [10]:
df.to_csv('class_frequency.csv', index = False)

In [1]:
import os
import re
import pandas as pd
from collections import Counter

# Directory containing VTT files
folder_path = "/workspaces/Marquette_teaching_finance/recording_transcripts/"

# Dictionary to store speaker counts per file
all_counts = {}

# Loop through VTT files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".vtt") and filename == "Final_presentation.vtt":
        filepath = os.path.join(folder_path, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # Extract speaker names
        speakers = re.findall(r"<v\s+([^>]+)>", content)
        speaker_counts = Counter(speakers)

        # Use filename (without .vtt) as column name
        file_key = filename.replace(".vtt", "")
        all_counts[file_key] = speaker_counts

# Combine all counts into a DataFrame
df = pd.DataFrame(all_counts).fillna(0).astype(int)

# Add 'speakers' column from index
df["speakers"] = df.index

# Add 'total' column
df["total"] = df.drop(columns=["speakers"]).sum(axis=1)

# Reorder columns: speakers first, then totals, then the rest
cols = ["speakers", "total"] + [col for col in df.columns if col not in ["speakers", "total"]]
df = df[cols]

# Sort by total, descending
df = df.sort_values(by="total", ascending=False).reset_index(drop=True)

# Display the result
print(df)

               speakers  total  Final_presentation
0             Hou, Eric    482                 482
1      Mizwicki, Andrew    243                 243
2   Hollenbach, Patrick    157                 157
3       Teske, Benjamin    139                 139
4       Galligan, Willy    122                 122
5       Mertens, Aidric    110                 110
6        Haroon, Sufyan    110                 110
7       Schembari, John    109                 109
8          Carone, Matt    106                 106
9         Myers, Nathan    101                 101
10       Benbenek, MATT     93                  93
11        Halm, William     85                  85
12        Severin, Noah     84                  84
13      Weidner, Declan     60                  60
14          Barbel, Sam     59                  59
15        Haque, Aleema     43                  43
16     Brozynski, Jaden     42                  42
17     Vargas, Cristian     28                  28
18     Huebner, Kaitlyn     24 

In [2]:
df.to_csv('final_frequency.csv', index = False)