In [None]:
import os
import csv
from tqdm import tqdm
from collections import defaultdict

def group_and_sort_by_speaker(input_csv, output_csv):
    """Groups data by speaker, calculates total duration, and sorts by duration.

    Args:
        input_csv (str): Path to the input CSV file.
        output_csv (str): Path to the output CSV file.
    """
    speaker_durations = defaultdict(float)

    with open(input_csv, 'r', newline='') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            speaker = row['speaker']
            duration = float(row['duration'])
            speaker_durations[speaker] += duration

    # Sort by duration in descending order
    sorted_durations = sorted(speaker_durations.items(), key=lambda item: item[1], reverse=True)

    with open(output_csv, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['speaker', 'total_duration'])
        writer.writerows(sorted_durations)

if __name__ == "__main__":
    region_dir = './VIN27'
    
    # Process each region file individually
    for region in ['north', 'south', 'center']:
        input_csv = os.path.join(region_dir, f"{region}.csv")
        output_csv = os.path.join(region_dir, f"sorted_{region}.csv")
        group_and_sort_by_speaker(input_csv, output_csv)

In [None]:
import os
import csv
from tqdm import tqdm
from collections import defaultdict

def get_top_speakers_for_duration(input_csv, target_duration_hours=50):
    """Gets the top speakers needed to exceed the target duration.

    Args:
        input_csv (str): Path to the input CSV file.
        target_duration_hours (float): Target duration in hours.

    Returns:
        list: List of top speaker names, or None if target duration is not reached.
    """
    speaker_durations = defaultdict(float)

    with open(input_csv, 'r', newline='') as infile:
        reader = csv.DictReader(infile)
        for row in reader:
            speaker = row['speaker']
            duration = float(row['total_duration'])
            speaker_durations[speaker] += duration

    sorted_durations = sorted(speaker_durations.items(), key=lambda item: item[1], reverse=True)

    total_duration_seconds = 0
    top_speakers = []
    target_duration_seconds = target_duration_hours * 3600

    for speaker, duration in sorted_durations:
        total_duration_seconds += duration
        top_speakers.append(speaker)
        if total_duration_seconds >= target_duration_seconds:
            return top_speakers

    return None  # Target duration not reached

if __name__ == "__main__":
    region_dir = './VIN27'

    for region in ['north', 'south', 'center']:
        input_csv = os.path.join(region_dir, f"sorted_{region}.csv")
        top_speakers = get_top_speakers_for_duration(input_csv, target_duration_hours=40)

        if top_speakers:
            output_file = os.path.join(region_dir, f"top_speakers_{region}.txt")
            with open(output_file, 'w') as f:
                for speaker in top_speakers:
                    f.write(f"{speaker}\n")
            print(f"Region: {region}, Top speaker names saved to: {output_file}")
        else:
            print(f"Region: {region}, Target duration not reached with all speakers.")

In [None]:
import os
import csv
import pandas as pd

def get_top_speakers(input_csv, top_n=150):
    """Gets the top N speakers from the sorted CSV.

    Args:
        input_csv (str): Path to the input CSV file.
        top_n (int): Number of top speakers to retrieve.

    Returns:
        list: List of top speaker names.
    """
    df = pd.read_csv(input_csv)
    top_speakers = df['speaker'].head(top_n).tolist()
    return top_speakers

if __name__ == "__main__":
    region_dir = './VIN27'

    for region in ['north', 'south', 'center']:
        input_csv = os.path.join(region_dir, f"sorted_{region}.csv")
        top_speakers = get_top_speakers(input_csv, top_n=150)

        output_file = os.path.join(region_dir, f"top_150_speakers_{region}.txt")
        with open(output_file, 'w') as f:
            for speaker in top_speakers:
                f.write(f"{speaker}\n")
        print(f"Region: {region}, Top 150 speaker names saved to: {output_file}")

In [None]:
import pandas as pd

df = pd.read_csv('./VIN27/sorted_north.csv')
df['total_duration'].head(150).sum() / 3600

In [None]:
import json

with open("VIN27/transcription.json", encoding='utf-8') as f:
    transcript = json.load(f)

transcript.keys()

In [None]:
transcript['vinh-long'].keys()

In [None]:
transcript['vinh-long']['3598207'].keys()

In [None]:
transcript['vinh-long']['3644732']

In [None]:
import pandas as pd
df = pd.read_csv("VIN27/updated_metadata.csv", sep='|')

print(df[df['gender'] == 'MALE']['speaker'].nunique())
print(df[df['gender'] == 'FEMALE']['speaker'].nunique())


In [None]:
df[df['gender'] == 'MALE']['duration'].sum() / 3600

In [None]:
df[df['gender'] == 'FEMALE']['duration'].sum() / 3600

In [None]:
df[df['region'] == 'south']['duration'].sum() / 3600

### how many hours are there in the dataset?

In [9]:
import pandas as pd

df = pd.read_csv("VIN27/audio_data.csv")
df.head()
df.duration.sum() / 3600

# --> audio_data.csv does not contains all audio paths


11.567956319444445

In [11]:
import json
with open("VIN27/transcription.json", encoding='utf-8') as f:
    transcript = json.load(f)


In [12]:
transcript

KeyboardInterrupt: 

## explore short audio 