# Extracting speaker and utterance IDs

In [7]:
import os
import pandas as pd

def _parse_speaker_utt(path_value):
    """Return ``(speaker_id, utt_id)`` parsed from one first-column value.

    The value must be a string starting with '/'. Its basename is split on
    '_' and the third and fourth fields are returned. Anything else (NaN,
    non-path text, a basename with fewer than four fields) yields None.
    """
    if not isinstance(path_value, str) or not path_value.startswith('/'):
        return None
    parts = os.path.basename(path_value).split("_")
    if len(parts) < 4:
        return None
    return parts[2], parts[3]


def extract_ids_from_files(folder_path, max_files=None,
                           output_path='unique_speaker_utt_ids.tsv'):
    """Collect unique (speaker ID, utterance ID) pairs and save them as a TSV.

    Every file under ``folder_path`` (searched recursively) is read as a
    tab-separated file. For each row whose first column is a string starting
    with '/', the basename of that path is split on '_' and fields 3 and 4
    are taken as the speaker ID and utterance ID. The unique pairs are
    written, sorted, to ``output_path`` with columns ``SpeakerID`` and
    ``UttID``.

    Args:
        folder_path: Root directory to walk.
        max_files: Process at most this many files, taken in ``os.walk``
            order. A falsy value (None or 0) means no limit.
        output_path: Destination of the resulting TSV file.

    Files that cannot be read or parsed are reported on stdout and skipped;
    pairs collected from a file before the failure are kept.
    """
    file_paths = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(folder_path)
        for name in names
    ]

    # Limit the number of files to process
    if max_files:
        file_paths = file_paths[:max_files]

    speaker_utt_pairs = set()
    total = len(file_paths)

    for idx, file_path in enumerate(file_paths, start=1):
        file_name = os.path.basename(file_path)
        print(f"Processing file {idx}/{total}: {file_name}")

        try:
            # header=None: treat the first line as data. With default header
            # inference the first line of every file became column names and
            # its path was never examined.
            # usecols=[0] / dtype=str: only the first column is needed, and
            # it is only ever inspected as text.
            # The context manager closes the file handle even if parsing
            # fails part-way through.
            with pd.read_csv(file_path, sep='\t', header=None, usecols=[0],
                             dtype=str, chunksize=1000) as reader:
                for chunk in reader:
                    # Positional access to the first column; iterating the
                    # column avoids building a Series object per row.
                    for value in chunk.iloc[:, 0]:
                        pair = _parse_speaker_utt(value)
                        if pair is not None:
                            speaker_utt_pairs.add(pair)
        except Exception as e:
            # Deliberately best-effort: the folder may contain files that
            # are not TSV at all, and one bad file must not abort the run.
            print(f"Skipping file {file_name}: {e}")

    # Sort so the output file is identical from run to run (set iteration
    # order is not stable across interpreter sessions).
    result_df = pd.DataFrame(sorted(speaker_utt_pairs),
                             columns=['SpeakerID', 'UttID'])

    # Save the DataFrame to a new TSV file
    result_df.to_csv(output_path, sep='\t', index=False)
    print(f"Completed. The unique speaker and utterance IDs have been saved to '{output_path}'.")

# Root folder that is searched (including subfolders) for the input TSV files
folder_path = r'/data/Root_content/Vaani/Response_to_questions/MegaPrecheckList/phase1'

# Process every file found (max_files=None means no limit);
# pass a small number such as max_files=5 for a quick test run
extract_ids_from_files(folder_path, max_files=None)


Processing file 1/1546: 12-12-2023-24
Processing file 2/1546: redelivery-11-07-2023-11
Processing file 3/1546: 23-02-2024-05
Processing file 4/1546: redelivery-11-01-2024-02-03
Processing file 5/1546: 11-01-2024-15
Processing file 6/1546: 12-12-2023-58
Processing file 7/1546: redelivery-05-01-2024-42-03
Processing file 8/1546: redelivery-06-08-2023-09-03
Processing file 9/1546: 31-03-2024-02
Processing file 10/1546: redelivery-06-07-2023-06-02
Processing file 11/1546: 20-12-2023-46
Processing file 12/1546: redelivery-02-08-2023-02-03
Processing file 13/1546: 08-07-2023-04
Processing file 14/1546: 31-03-2024-27
Processing file 15/1546: 25-05-2024-02
Processing file 16/1546: redelivery-05-01-2024-43-03
Processing file 17/1546: 18-10-2023-33
Processing file 18/1546: redelivery-10-07-2023-13
Processing file 19/1546: 12-12-2023-61
Processing file 20/1546: 18-10-2023-47
Processing file 21/1546: redelivery-05-01-2024-52-03
Processing file 22/1546: 19-01-2024-17
Processing file 23/1546: 12-12-

KeyboardInterrupt: 