# Build a CSV of unique speaker IDs with their district, state, number of files, and total duration


In [1]:
import os
import csv
import wave
from collections import defaultdict

# Parse the speaker ID, district, and state out of a file name
def extract_info(filename):
    """Return ``(speaker_id, district, state)`` parsed from ``filename``.

    The name is split on underscores: the first field is taken as the state,
    the second as the district and the third as the speaker ID. An
    ``IndexError`` is raised when the name has fewer than three fields.
    """
    fields = filename.split('_')
    return fields[2], fields[1], fields[0]

# Compute the playback length of a .wav file
def get_wav_duration(file_path):
    """Return the duration, in seconds, of the WAV file at ``file_path``.

    The duration is the frame count divided by the frame rate, both as
    reported by the ``wave`` module for the opened file.
    """
    with wave.open(file_path, 'rb') as handle:
        return handle.getnframes() / float(handle.getframerate())

# Directory holding the .wav recordings to summarise
folder_path = '/home/vaibh/Vaani/Speaker_ID/Dataset/megdap'

# Per-speaker accumulator: last seen district/state, summed duration (seconds) and file count
speaker_info = defaultdict(lambda: {'district': '', 'state': '', 'duration': 0, 'count': 0})

# Fold every .wav file of the directory into its speaker's record
for filename in os.listdir(folder_path):
    if not filename.endswith('.wav'):
        continue
    speaker_id, district, state = extract_info(filename)
    audio_path = os.path.join(folder_path, filename)
    # Measure the file before touching the accumulator so an unreadable file
    # does not leave a half-filled record behind.
    duration = get_wav_duration(audio_path)
    record = speaker_info[speaker_id]
    record.update(district=district, state=state)
    record['duration'] += duration
    record['count'] += 1

# Emit one CSV row per speaker (file is created in the current working directory)
output_csv_path = 'speaker_info_with_duration.csv'
with open(output_csv_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Speaker ID', 'District', 'State', 'Total Duration (s)', 'Number of Audio Files'])
    writer.writerows(
        [speaker_id, info['district'], info['state'], info['duration'], info['count']]
        for speaker_id, info in speaker_info.items()
    )

print("CSV file created successfully.")


CSV file created successfully.


# Add the gender information to the CSV file created above

In [2]:
import csv

# CSV with per-speaker totals (no gender column yet)
input_csv_path = '/home/vaibh/Vaani/Speaker_ID/speaker_info_with_duration.csv'

# Destination CSV: the same rows with a trailing 'Gender' column
output_csv_path = 'speaker_info_with_gender.csv'

# Metadata CSV whose first two columns are read as speaker ID and gender
gender_csv_path = '/home/vaibh/Vaani/Speaker_ID/genderMeta_megdap_13Dec.csv'

# Build the speaker ID -> gender lookup
gender_info = {}
with open(gender_csv_path, 'r', newline='') as gender_file:
    for row in csv.reader(gender_file):
        # Blank or truncated lines (e.g. a trailing newline) used to abort the
        # two-value unpacking with a ValueError; skip them instead.
        if len(row) < 2:
            continue
        speaker_id, gender = row[0], row[1]
        # Keys are stored without surrounding whitespace
        # (presumably the metadata pads IDs, e.g. ' 80475 ' -- TODO confirm).
        gender_info[speaker_id.strip()] = gender

# Copy the input rows to the output, appending the looked-up gender
with open(input_csv_path, 'r', newline='') as input_file, open(output_csv_path, 'w', newline='') as output_file:
    reader = csv.reader(input_file)
    writer = csv.writer(output_file)

    # Extend the header with the new column
    header = next(reader)
    header.append('Gender')
    writer.writerow(header)

    for row in reader:
        # csv.reader yields [] for blank lines; there is no ID to look up
        if not row:
            continue
        speaker_id = row[0]
        # Strip the ID the same way the lookup keys were stripped so padded
        # IDs still match; speakers without metadata get an empty gender.
        gender = gender_info.get(speaker_id.strip(), '')
        row.append(gender)
        writer.writerow(row)

print("CSV file created successfully with gender information added.")


CSV file created successfully with gender information added.


# Build the CSV that aggregates all of the information district-wise

In [3]:
import csv
from collections import defaultdict

# Per-speaker CSV (with gender) that feeds the district roll-up
input_csv_path = '/home/vaibh/Vaani/Speaker_ID/speaker_info_with_gender.csv'

# District-level summary, created in the current working directory
output_csv_path = 'aggregated_data_district_wise.csv'

# (district, state) -> running totals for that district
aggregated_data = defaultdict(lambda: {'number_of_speakers': 0, 'number_of_files': 0, 'total_duration': 0, 'number_of_males': 0, 'number_of_females': 0})

# Which counter each recognised gender label feeds; other labels are not counted
gender_counter = {'Male': 'number_of_males', 'Female': 'number_of_females'}

# Each input row describes one speaker; fold it into its district's totals
with open(input_csv_path, 'r') as input_file:
    for record in csv.DictReader(input_file):
        district, state = record['District'], record['State']
        file_count = int(record['Number of Audio Files'])
        seconds = float(record['Total Duration (s)'])
        gender = record['Gender']

        totals = aggregated_data[(district, state)]
        totals['number_of_speakers'] += 1
        totals['number_of_files'] += file_count
        # Durations arrive in seconds and are accumulated in hours
        totals['total_duration'] += (seconds/3600)

        counter_key = gender_counter.get(gender.strip())
        if counter_key is not None:
            totals[counter_key] += 1

# Emit one row per (district, state) pair
with open(output_csv_path, 'w', newline='') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(['District', 'State', 'Number of Speakers', 'Number of Files', 'Total Duration (hr)', 'Number of Males', 'Number of Females'])
    writer.writerows(
        [district, state, data['number_of_speakers'], data['number_of_files'], data['total_duration'], data['number_of_males'], data['number_of_females']]
        for (district, state), data in aggregated_data.items()
    )

print("Aggregated data written to CSV successfully.")


Aggregated data written to CSV successfully.


# Build the CSV that aggregates all of the information state-wise

In [2]:
import csv
from collections import defaultdict

# District-level summary CSV that feeds the state roll-up
# NOTE(review): this path/name differs from the 'aggregated_data_district_wise.csv'
# written by the district-wise cell -- confirm it is the intended input.
input_csv_path = '/home/vaibh/Vaani/Speaker_ID/CSVs/megdap/aggregated_data_district_wise_megdap.csv'

# State-level summary, created in the current working directory
output_csv_path = 'aggregated_data_statewise.csv'

# state -> running totals for that state
aggregated_data_statewise = defaultdict(lambda: {'number_of_speakers': 0, 'number_of_files': 0, 'total_duration_hours': 0, 'number_of_males': 0, 'number_of_females': 0, 'district_count': 0})

# state -> names of the districts already counted for that state
districts_seen = defaultdict(set)

# Each input row describes one district; fold it into its state's totals
with open(input_csv_path, 'r', newline='') as input_file:
    reader = csv.DictReader(input_file)
    for row in reader:
        district = row['District']
        state = row['State']
        num_speakers = int(row['Number of Speakers'])
        num_files = int(row['Number of Files'])
        total_duration_hours = float(row['Total Duration (hr)'])
        num_males = int(row['Number of Males'])
        num_females = int(row['Number of Females'])

        totals = aggregated_data_statewise[state]
        totals['number_of_speakers'] += num_speakers
        totals['number_of_files'] += num_files
        totals['total_duration_hours'] += total_duration_hours
        totals['number_of_males'] += num_males
        totals['number_of_females'] += num_females

        # Count each district once per state. The previous check compared the
        # district name against the *keys of the totals dict*, which is
        # (practically) always a miss, so it counted rows rather than unique
        # districts.
        districts_seen[state].add(district)
        totals['district_count'] = len(districts_seen[state])

# Emit one row per state
with open(output_csv_path, 'w', newline='') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(['State', 'Number of Speakers', 'Number of Files', 'Total Duration (hr)', 'Number of Males', 'Number of Females', 'District Count'])
    for state, data in aggregated_data_statewise.items():
        writer.writerow([state, data['number_of_speakers'], data['number_of_files'], data['total_duration_hours'], data['number_of_males'], data['number_of_females'], data['district_count']])

print("Aggregated data written to CSV successfully.")


Aggregated data written to CSV successfully.


# Collect the unique speaker IDs in the data

In [None]:
import os
import csv

def get_unique_speaker_ids(folder_path):
    """Collect the distinct speaker IDs among the .wav files of a folder.

    Every entry of ``folder_path`` whose name ends in ``.wav`` contributes the
    sixth underscore-separated field of its name; other entries are ignored.
    A ``.wav`` name with fewer than six fields raises ``IndexError``.

    Returns:
        set: the speaker-ID strings found.
    """
    return {
        entry.split('_')[5]
        for entry in os.listdir(folder_path)
        if entry.endswith('.wav')
    }

def write_unique_speaker_ids_to_csv(unique_speaker_ids, output_csv_file):
    """Write each speaker ID as its own single-column row of a CSV file.

    Args:
        unique_speaker_ids: iterable of speaker IDs; rows follow its iteration order.
        output_csv_file: path of the CSV file to create (overwritten if present).
    """
    with open(output_csv_file, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows([speaker_id] for speaker_id in unique_speaker_ids)

# Parent folder; the .wav files live two directory levels below it
parent_folder = '/raid/scratch/Vaibhav/Dataset/Audio_language_specific_part2'

# Output CSV file to store the combined unique speaker IDs
combined_csv_file = '/raid/scratch/Vaibhav/Dataset/combined_unique_speaker_ids.csv'

# Union of the speaker IDs found in every second-level subfolder
all_unique_speaker_ids = set()

for folder in os.listdir(parent_folder):
    folder_path = os.path.join(parent_folder, folder)
    # A stray file next to the folders would make os.listdir() raise
    # NotADirectoryError, so only descend into real directories.
    if not os.path.isdir(folder_path):
        continue
    print(folder)

    for subfolder in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        print(subfolder)

        # Merge this subfolder's speaker IDs into the overall set
        unique_speaker_ids = get_unique_speaker_ids(subfolder_path)
        all_unique_speaker_ids.update(unique_speaker_ids)

# Sets iterate in hash order, which can differ between runs; write the IDs
# sorted so the CSV is reproducible.
write_unique_speaker_ids_to_csv(sorted(all_unique_speaker_ids), combined_csv_file)


# Count the languages each speaker can speak

In [29]:
import os
import csv
from tqdm import tqdm

def get_speaker_id_from_filename(filename):
    """Return the speaker ID embedded in a .wav file name, or ``None``.

    For names ending in ``.wav`` the ID is the sixth underscore-separated
    field (``IndexError`` if the name has fewer fields); any other name
    yields ``None``.
    """
    return filename.split('_')[5] if filename.endswith('.wav') else None

def count_folders_with_speaker_id(parent_folder, speaker_id):
    """Find the second-level subfolders that hold a file of ``speaker_id``.

    ``parent_folder`` is expected to contain folders which in turn contain
    subfolders of audio files. A subfolder matches as soon as one of its file
    names yields ``speaker_id`` via ``get_speaker_id_from_filename``. Entries
    that are not directories are skipped at both levels.

    Each call walks the whole tree again, so calling it once per speaker costs
    (number of speakers) x (number of files).

    Args:
        parent_folder: root directory to scan.
        speaker_id: speaker ID string to look for.

    Returns:
        tuple: ``(count, folders)`` where ``folders`` lists the matching
        subfolder names in scan order and ``count`` is its length.
    """
    folders = []

    for folder in os.listdir(parent_folder):
        folder_path = os.path.join(parent_folder, folder)
        # Same guard as for the subfolders below: a plain file at this level
        # would otherwise make os.listdir() raise NotADirectoryError.
        if not os.path.isdir(folder_path):
            continue

        for subfolder in os.listdir(folder_path):
            subfolder_path = os.path.join(folder_path, subfolder)
            if not os.path.isdir(subfolder_path):
                continue

            # any() stops at the first matching file, like the original break
            if any(
                get_speaker_id_from_filename(file_name) == speaker_id
                for file_name in os.listdir(subfolder_path)
            ):
                folders.append(subfolder)
                print(folders, len(folders))

    return len(folders), folders

# Parent folder; the .wav files live two directory levels below it
parent_folder = '/raid/scratch/Vaibhav/Dataset/Audio_language_specific_part2'

# CSV file containing all unique speaker IDs (one ID per row)
unique_speaker_ids_csv = '/raid/scratch/Vaibhav/Dataset/combined_unique_speaker_ids.csv'

# Output CSV file to store speaker IDs, their counts, and corresponding folder names
output_csv_file = '/raid/scratch/Vaibhav/Dataset/speaker_id_counts_with_lanuages.csv'

# Only this many speaker IDs are processed; every ID triggers a full scan of
# the dataset tree, so the complete list takes hours.
MAX_SPEAKERS = 10

# Read the CSV file containing all unique speaker IDs
all_unique_speaker_ids = []
with open(unique_speaker_ids_csv, 'r', newline='') as csvfile:
    for row in csv.reader(csvfile):
        # csv.reader yields [] for blank lines; row[0] would raise IndexError
        if row:
            all_unique_speaker_ids.append(row[0])

# Rows of [speaker ID, number of folders, comma-separated folder names]
speaker_id_counts_with_folders = []

# Progress bar over all unique speaker IDs
pbar = tqdm(total=len(all_unique_speaker_ids), desc="Processing Speaker IDs", unit="ID")

count_speaker = 0

for speaker_id in all_unique_speaker_ids:
    count_speaker += 1

    count, folders = count_folders_with_speaker_id(parent_folder, speaker_id)
    speaker_id_counts_with_folders.append([speaker_id, count, ', '.join(folders)])

    # Advance the bar before testing the limit; updating after the break left
    # the bar one short of the number of IDs actually processed.
    pbar.update(1)

    if count_speaker >= MAX_SPEAKERS:
        break

pbar.close()

# Write speaker ID counts, number of folders, and folder names to the output CSV file
with open(output_csv_file, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Speaker ID', 'Number of Folders', 'Folders'])
    csv_writer.writerows(speaker_id_counts_with_folders)

print("Speaker ID counts, number of folders, and corresponding folder names have been written to:", output_csv_file)


Processing Speaker IDs:   0%|          | 0/14128 [00:00<?, ?ID/s]

['HINDI'] 1


Processing Speaker IDs:   0%|          | 1/14128 [00:01<5:53:27,  1.50s/ID]

['MAITHILI'] 1


Processing Speaker IDs:   0%|          | 2/14128 [00:03<6:51:03,  1.75s/ID]

['HINDI'] 1


Processing Speaker IDs:   0%|          | 3/14128 [00:04<6:19:45,  1.61s/ID]

['HINDI'] 1


Processing Speaker IDs:   0%|          | 4/14128 [00:06<6:06:49,  1.56s/ID]

['HINDI'] 1


Processing Speaker IDs:   0%|          | 5/14128 [00:07<5:59:00,  1.53s/ID]

['HINDI'] 1


Processing Speaker IDs:   0%|          | 6/14128 [00:09<5:54:52,  1.51s/ID]

['MARATHI'] 1


Processing Speaker IDs:   0%|          | 7/14128 [00:11<6:26:11,  1.64s/ID]

['HINDI'] 1


Processing Speaker IDs:   0%|          | 8/14128 [00:12<6:12:36,  1.58s/ID]

['RAJASTHANI'] 1


Processing Speaker IDs:   0%|          | 9/14128 [00:14<6:37:52,  1.69s/ID]

['HINDI'] 1


Processing Speaker IDs:   0%|          | 9/14128 [00:16<6:59:08,  1.78s/ID]

Speaker ID counts, number of folders, and corresponding folder names have been written to: /raid/scratch/Vaibhav/Dataset/speaker_id_counts_with_lanuages.csv





# Figure out in how many cases the same speaker's files get detected as different languages, and to what extent


In [10]:
import csv
import os

# Input folder; the prediction CSV files live one directory level below it
input_folder = "/raid/scratch/Vaibhav/csv_predicted_facebook_equal(S,M)_part2/"

# Output CSV file, created in the current working directory
output_csv_file = "speakerid_true_pred_different1.csv"

# speaker ID -> vendors seen, per-language row counts (asserted / predicted),
# and the district/state taken from the first row seen for that speaker
speaker_data = {}

for folder in os.listdir(input_folder):
    folder_path = os.path.join(input_folder, folder)
    # A stray file at this level would make os.listdir() raise NotADirectoryError
    if not os.path.isdir(folder_path):
        continue

    for filename in os.listdir(folder_path):
        if not filename.endswith(".csv"):
            continue
        input_csv_file = os.path.join(folder_path, filename)

        with open(input_csv_file, mode='r', newline='') as file:
            reader = csv.DictReader(file)
            for row in reader:
                # Split the audio file name once; fields 3, 4 and 5 are read
                # as state, district and speaker ID respectively.
                name_parts = os.path.basename(row['Filename']).split('_')
                speaker_id = name_parts[5]
                district = name_parts[4]
                state = name_parts[3]

                if speaker_id not in speaker_data:
                    speaker_data[speaker_id] = {
                        'Vendors': set(),
                        'Languages': {
                            'Asserted': {},
                            'Predicted': {}
                        },
                        'District': district,
                        'State': state
                    }
                entry = speaker_data[speaker_id]

                entry['Vendors'].add(row['Vendor'])

                # Tally how many rows carry each asserted / predicted language
                for kind, column in (('Asserted', 'Asserted Language'), ('Predicted', 'Detected Language')):
                    language = row[column]
                    tally = entry['Languages'][kind]
                    tally[language] = tally.get(language, 0) + 1

# Write aggregated data to output CSV file
with open(output_csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['SpeakerID', 'Vendors', 'Asserted_Languages', 'Asserted_Languages_Count', 'Predicted_Languages', 'Predicted_Languages_Count', 'District', 'State'])
    for speaker_id, data in speaker_data.items():
        # Sets iterate in hash order, which can differ between runs; sort so
        # the vendor column is reproducible.
        vendors = ', '.join(sorted(data['Vendors']))
        # The language and count columns are parallel lists: the i-th count
        # belongs to the i-th language (dicts preserve insertion order).
        asserted_languages = ', '.join(data['Languages']['Asserted'].keys())
        asserted_languages_count = ', '.join(str(count) for count in data['Languages']['Asserted'].values())
        predicted_languages = ', '.join(data['Languages']['Predicted'].keys())
        predicted_languages_count = ', '.join(str(count) for count in data['Languages']['Predicted'].values())
        writer.writerow([speaker_id, vendors, asserted_languages, asserted_languages_count, predicted_languages, predicted_languages_count, data['District'], data['State']])

print("Aggregated data has been written to", output_csv_file)


Aggregated data has been written to speakerid_true_pred_different1.csv


In [4]:
import csv

# Input CSV file: one row per speaker with a ', '-separated 'Predicted_Languages' column
input_csv_file = "/raid/scratch/Vaibhav/speakerid_true_pred_different.csv"

# number of predicted labels -> number of speakers with that many labels.
# Pre-seeded for 1..20; any other count gets its own key when first seen.
label_counts = {i: 0 for i in range(1, 21)}

# Largest number of predicted labels seen for any speaker
max_length = 0

# Read data from input CSV file
with open(input_csv_file, mode='r', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        # ''.split(', ') returns [''], which would count an empty cell as one
        # language; drop blank entries so such a row counts as zero labels.
        predicted_languages = [
            language
            for language in row['Predicted_Languages'].split(', ')
            if language.strip()
        ]
        num_predicted_languages = len(predicted_languages)

        # Count every length. Previously lengths outside 1..20 were silently
        # dropped from the histogram while still being able to set max_length.
        label_counts[num_predicted_languages] = label_counts.get(num_predicted_languages, 0) + 1

        # Track (and show) each new record holder
        if num_predicted_languages > max_length:
            ans_row = row
            print(num_predicted_languages, row)
            max_length = num_predicted_languages

# Print the non-zero counts in ascending order of label count
for length, count in sorted(label_counts.items()):
    if count != 0:
        print(f"Number of occurrences with {length} predicted labels for a speaker: {count}")

# Print the maximum length of predicted labels
print(f"Maximum no. of predicted labels: {max_length}")


1 {'SpeakerID': '61551', 'Vendors': 'S', 'Asserted_Languages': 'Khari boli', 'Predicted_Languages': 'Hindi', 'District': 'JyotibaPhuleNagar', 'State': 'Uttarpradesh'}
4 {'SpeakerID': '61701', 'Vendors': 'S', 'Asserted_Languages': 'Khari boli', 'Predicted_Languages': 'Hindi, Urdu, Gujarati, Sindhi', 'District': 'JyotibaPhuleNagar', 'State': 'Uttarpradesh'}
5 {'SpeakerID': '142462', 'Vendors': 'S', 'Asserted_Languages': 'Khortha', 'Predicted_Languages': 'Hindi, Marathi, Sindhi, Urdu, Nepali (individual language)', 'District': 'Madhepura', 'State': 'Bihar'}
7 {'SpeakerID': '152995', 'Vendors': 'S', 'Asserted_Languages': 'Malvani', 'Predicted_Languages': 'Hindi, Marathi, Sindhi, Sinhala, Pushto, Urdu, Telugu', 'District': 'Solapur', 'State': 'Maharashtra'}
10 {'SpeakerID': '61195', 'Vendors': 'S', 'Asserted_Languages': 'Jaipuri', 'Predicted_Languages': 'Southern Balochi, Assamese, Hindi, Kannada, Gujarati, Kashmiri, Pushto, Urdu, Panjabi, Telugu', 'District': 'Churu', 'State': 'Rajasthan'}