In [62]:
"""
Preamble for most code and jupyter notebooks
@author: tobinsouth
@notebook date: Dec 5, 2021
"""

import numpy as np, pandas as pd, matplotlib.pyplot as plt, matplotlib as mpl, seaborn as sns
import math, string, re, pickle, json, os, sys, datetime, itertools
from collections import Counter
from tqdm import tqdm

# Set pandas display options: cap how many rows/columns are rendered for a frame.
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 120)

# Better graphics: render inline figures at retina (2x) resolution.
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina')

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names, so prefer the new name when this
# Matplotlib ships it and fall back to the legacy name on older versions.
poster_style = (
    'seaborn-v0_8-poster'
    if 'seaborn-v0_8-poster' in plt.style.available
    else 'seaborn-poster'
)
plt.style.use(poster_style)

In [63]:
import spacy

# spaCy's small Spanish pipeline; the model package must already be installed
# in the kernel's environment, otherwise spacy.load raises.
# `nlp` is used further down for entity labels, POS tags and stop-word flags.
spacy_model_name = "es_core_news_sm"
nlp = spacy.load(spacy_model_name)

In [64]:
from glob import glob

# Directory holding one JSON transcript per recording (relative to the notebook).
transcript_path = "../data/transcriptions"

# glob() yields matches in arbitrary, filesystem-dependent order; sorting makes
# the processing order (and therefore the row order of any derived table)
# reproducible across machines and runs.
transcript_files = sorted(glob(transcript_path + "/*.json"))

In [65]:
# Parse every transcript file once, keeping both the raw JSON object
# (`transcripts`) and a tabular view of it (`transcripts_df`), keyed by file name.
transcripts, transcripts_df = {}, {}

for file in transcript_files:
    # Key = base file name up to its first dot. os.path.basename copes with
    # whichever path separator glob produced on this platform.
    filename = os.path.basename(file).split(".")[0]

    # Read explicitly as UTF-8 rather than the platform's default encoding,
    # which would mangle accented characters on some systems.
    with open(file, "r", encoding="utf-8") as f:
        transcript = json.load(f)

    transcripts[filename] = transcript
    # One row per top-level value of the JSON object.
    # NOTE(review): presumably each value is a segment record carrying at least
    # `speaker_id` and `transcription` (the columns used downstream) - confirm.
    transcript_df = pd.DataFrame(transcript.values())
    transcripts_df[filename] = transcript_df
    

In [66]:
# Build one feature row per sufficiently long transcript, describing how
# speech is distributed across speakers and the vocabulary of the dominant
# ("tutor") speaker.
MIN_SEGMENTS = 100  # transcripts with this many rows or fewer are skipped
TOP_K_WORDS = 100   # size of the "most common words" set behind common_ratio


def word_sum(pairs):
    """Return the total of the counts in an iterable of (token, count) pairs."""
    return sum(count for _, count in pairs)


results = []
for file, transcript_df in transcripts_df.items():
    if len(transcript_df) <= MIN_SEGMENTS:
        continue

    # Concatenate everything each speaker said into a single string.
    # Selecting the column before aggregating avoids the deprecated
    # "apply over the whole group frame" code path in pandas.
    speaker_grouped = transcript_df.groupby('speaker_id')['transcription'].agg(" ".join)
    number_of_speakers = len(speaker_grouped)

    # Whitespace-delimited word count per speaker.
    word_counts = speaker_grouped.str.split().apply(len)
    # Share of all words spoken by the most talkative speaker.
    speaking_inequality = word_counts.max() / word_counts.sum()

    # Limitation: there's no guarantee that the tutor is the one with the most words
    tutor_words = speaker_grouped.iloc[word_counts.argmax()]
    tutor_doc = nlp(tutor_words)

    person_entity_counts = sum(1 for ent in tutor_doc.ents if ent.label_ == 'PER')
    # NOTE(review): this counts PROPN (proper nouns) as well as PRON although
    # the feature is named "pronoun" - confirm that is intended.
    pronoun_counts = sum(1 for w in tutor_doc if w.pos_ in ('PRON', 'PROPN'))
    numeric_counts = sum(1 for t in tutor_doc.text.split() if t.isnumeric())

    # Frequency of every token that is neither a stop word nor punctuation.
    non_stop_counts = Counter(
        t.text for t in tutor_doc if not t.is_stop and not t.is_punct
    )
    total_non_stop = word_sum(non_stop_counts.most_common())
    # Fraction of non-stop tokens covered by the TOP_K_WORDS most frequent ones.
    # An empty vocabulary (e.g. only stop words) yields NaN instead of raising
    # ZeroDivisionError and aborting the whole loop.
    if total_non_stop:
        common_ratio = word_sum(non_stop_counts.most_common(TOP_K_WORDS)) / total_non_stop
    else:
        common_ratio = float("nan")

    results.append([
        file,
        number_of_speakers,
        speaking_inequality,
        person_entity_counts,
        pronoun_counts,
        numeric_counts,
        common_ratio,
    ])

In [60]:
# Column labels, in the same order as the values appended to each row of `results`.
result_columns = [
    'file',
    'number_of_speakers',
    'speaking_inequality',
    'tutor_person_entity_counts',
    'tutor_pronoun_counts',
    'tutor_numeric_counts',
    'tutor_common_ratio',
]

# Assemble the per-transcript rows into a table and write it out without the index.
diarized_results = pd.DataFrame(results, columns=result_columns)
diarized_results.to_csv("../data/diarized_results.csv", index=False)