# unique speaker in data

In [None]:
import os
import csv

def get_unique_speaker_ids(folder_path):
    """Collect the distinct speaker IDs found among the .wav files of a folder.

    The speaker ID is read as the sixth underscore-separated field (index 5)
    of each file name ending in ``.wav``. Names with fewer than six fields
    are skipped so that one oddly named file does not abort the whole scan
    with an IndexError.

    Args:
        folder_path: Directory whose direct entries are inspected.

    Returns:
        A set of speaker ID strings; empty when no file name qualifies.
    """
    unique_speaker_ids = set()

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.wav'):
            continue

        fields = file_name.split('_')
        # No sixth field means no speaker ID to extract from this name.
        if len(fields) > 5:
            unique_speaker_ids.add(fields[5])

    return unique_speaker_ids

def write_unique_speaker_ids_to_csv(unique_speaker_ids, output_csv_file):
    """Write each speaker ID as its own single-column row of a CSV file.

    Args:
        unique_speaker_ids: Iterable of speaker IDs; rows are written in
            iteration order.
        output_csv_file: Path of the CSV file to create or overwrite.
    """
    with open(output_csv_file, 'w', newline='') as out_handle:
        csv.writer(out_handle).writerows([entry] for entry in unique_speaker_ids)

# Parent folder containing multiple subfolders with .wav files
parent_folder = '/raid/scratch/Vaibhav/Dataset/Audio_language_specific_part2'

# Output CSV file to store combined unique speaker IDs
combined_csv_file = '/raid/scratch/Vaibhav/Dataset/combined_unique_speaker_ids.csv'

# Union of the speaker IDs found in every second-level subfolder
all_unique_speaker_ids = set()

# Walk two levels: parent_folder/<folder>/<subfolder>/*.wav
for folder in os.listdir(parent_folder):
    folder_path = os.path.join(parent_folder, folder)
    # A stray file at this level cannot be listed; skip it instead of crashing.
    if not os.path.isdir(folder_path):
        continue
    print(folder)

    for subfolder in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder)
        # Same guard one level down, before the subfolder is listed.
        if not os.path.isdir(subfolder_path):
            continue
        print(subfolder)

        # Get unique speaker IDs for the current subfolder
        unique_speaker_ids = get_unique_speaker_ids(subfolder_path)

        # Merge them into the overall set
        all_unique_speaker_ids.update(unique_speaker_ids)

# Write all unique speaker IDs to the combined CSV file
write_unique_speaker_ids_to_csv(all_unique_speaker_ids, combined_csv_file)


# count of languages each speaker speaks

In [None]:
import os
import csv
from tqdm import tqdm

def get_speaker_id_from_filename(filename):
    """Return the speaker ID embedded in a .wav file name, or None.

    The speaker ID is the sixth underscore-separated field (index 5) of the
    name. None is returned when the name does not end in ``.wav`` or when it
    has fewer than six fields, so callers can compare the result against a
    speaker ID without guarding against IndexError.

    Args:
        filename: File name (not a full path) to inspect.

    Returns:
        The speaker ID string, or None when it cannot be extracted.
    """
    if not filename.endswith('.wav'):
        return None

    fields = filename.split('_')
    return fields[5] if len(fields) > 5 else None

def count_folders_with_speaker_id(parent_folder, speaker_id):
    """Find the second-level subfolders that hold a file of one speaker.

    The tree is walked as ``parent_folder/<folder>/<subfolder>/<file>``.
    A subfolder is recorded once, as soon as one of its files yields
    ``speaker_id`` via ``get_speaker_id_from_filename``. Entries that are
    not directories are skipped at both levels.

    Args:
        parent_folder: Root directory of the two-level folder tree.
        speaker_id: Speaker ID to look for.

    Returns:
        A ``(count, folders)`` tuple: the number of matching subfolders and
        their names (names only, not paths) in discovery order.
    """
    count = 0
    folders = []

    for folder in os.listdir(parent_folder):
        folder_path = os.path.join(parent_folder, folder)
        # Guard the outer level the same way the inner level is guarded;
        # listing a plain file would raise NotADirectoryError.
        if not os.path.isdir(folder_path):
            continue

        for subfolder in os.listdir(folder_path):
            subfolder_path = os.path.join(folder_path, subfolder)
            if not os.path.isdir(subfolder_path):
                continue

            for file_name in os.listdir(subfolder_path):
                if get_speaker_id_from_filename(file_name) == speaker_id:
                    folders.append(subfolder)
                    count += 1
                    print(folders, count)
                    break  # One match is enough for this subfolder

    return count, folders

# Parent folder containing multiple subfolders with .wav files
parent_folder = '/raid/scratch/Vaibhav/Dataset/Audio_language_specific_part2'

# CSV file containing all unique speaker IDs
unique_speaker_ids_csv = '/raid/scratch/Vaibhav/Dataset/combined_unique_speaker_ids.csv'

# Output CSV file to store speaker IDs, their counts, and corresponding folder names
output_csv_file = '/raid/scratch/Vaibhav/Dataset/speaker_id_counts_with_lanuages.csv'

# Read the speaker IDs (first column); blank lines produce empty rows and
# are skipped so row[0] cannot raise IndexError.
all_unique_speaker_ids = []
with open(unique_speaker_ids_csv, 'r', newline='') as csvfile:
    csv_reader = csv.reader(csvfile)
    for row in csv_reader:
        if row:
            all_unique_speaker_ids.append(row[0])

# Rows of [speaker ID, number of folders, comma-separated folder names]
speaker_id_counts_with_folders = []

# Create a tqdm progress bar for all unique speaker IDs
pbar = tqdm(total=len(all_unique_speaker_ids), desc="Processing Speaker IDs", unit="ID")

count_speaker = 0

# Count the number of folders containing each speaker ID and get the folder names
for speaker_id in all_unique_speaker_ids:
    count_speaker += 1

    count, folders = count_folders_with_speaker_id(parent_folder, speaker_id)
    speaker_id_counts_with_folders.append([speaker_id, count, ', '.join(folders)])

    # Update before the cap check so the bar reflects every processed ID.
    pbar.update(1)

    # NOTE(review): only the first 10 speaker IDs are processed; this looks
    # like a debugging limit - confirm before relying on the output file.
    if count_speaker >= 10:
        break

pbar.close()

# Write speaker ID counts, number of folders, and folder names to the output CSV file
with open(output_csv_file, 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Speaker ID', 'Number of Folders', 'Folders'])
    csv_writer.writerows(speaker_id_counts_with_folders)

print("Speaker ID counts, number of folders, and corresponding folder names have been written to:", output_csv_file)


# figure out in how many cases the same speaker's files get detected as different languages, and to what extent


In [None]:
import csv
import os
from tqdm import tqdm 

# Input folder containing one subfolder per group of CSV files
input_folder = "/data/Vaani/CSVs_with_link/"

# Output CSV file
output_csv_file = "speakerid_true_pred_different1.csv"

# Per speaker ID: vendors seen, per-language row counts (asserted and
# predicted), and the district/state taken from the first row seen.
speaker_data = {}

for folder in tqdm(os.listdir(input_folder)):
    folder_path = os.path.join(input_folder, folder)
    # Skip stray files sitting next to the subfolders.
    if not os.path.isdir(folder_path):
        continue

    # Inner progress bar over the entries of this subfolder
    for filename in tqdm(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        input_csv_file = os.path.join(folder_path, filename)

        # newline='' is what the csv module expects for files it parses.
        with open(input_csv_file, mode='r', newline='') as file:
            reader = csv.DictReader(file)

            for row in reader:
                # State, district and speaker ID are fields 3, 4 and 5 of
                # the underscore-separated file name.
                name_fields = os.path.basename(row['File']).split('_')
                speaker_id = name_fields[5]
                district = name_fields[4]
                state = name_fields[3]

                # Create the record the first time a speaker is seen
                if speaker_id not in speaker_data:
                    speaker_data[speaker_id] = {
                        'Vendors': set(),
                        'Languages': {
                            'Asserted': {},
                            'Predicted': {}
                        },
                        'District': district,
                        'State': state
                    }

                entry = speaker_data[speaker_id]
                entry['Vendors'].add(row['Vendor'])

                # Count asserted languages
                asserted_language = row['Asserted_Language']
                asserted = entry['Languages']['Asserted']
                asserted[asserted_language] = asserted.get(asserted_language, 0) + 1

                # Count predicted languages
                predicted_language = row['Detected_Language']
                predicted = entry['Languages']['Predicted']
                predicted[predicted_language] = predicted.get(predicted_language, 0) + 1


# Write aggregated data to output CSV file
with open(output_csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['SpeakerID', 'Vendors', 'Asserted_Languages', 'Asserted_Languages_Count', 'Predicted_Languages', 'Predicted_Languages_Count', 'District', 'State'])
    for speaker_id, data in speaker_data.items():
        # Sorted so the column does not depend on set iteration order.
        vendors = ', '.join(sorted(data['Vendors']))
        asserted_languages = ', '.join(data['Languages']['Asserted'].keys())
        asserted_languages_count = ', '.join(str(count) for count in data['Languages']['Asserted'].values())
        predicted_languages = ', '.join(data['Languages']['Predicted'].keys())
        predicted_languages_count = ', '.join(str(count) for count in data['Languages']['Predicted'].values())
        writer.writerow([speaker_id, vendors, asserted_languages, asserted_languages_count, predicted_languages, predicted_languages_count, data['District'], data['State']])

print("Aggregated data has been written to", output_csv_file)


In [22]:
import csv

# Input CSV file
input_csv_file = "/data/Vaani/speakerid_true_pred_different3.csv"

# Speaker IDs grouped by how many predicted languages their row lists.
# Keys are created on demand, so any number of labels is accepted
# (a fixed 1..99 table would raise KeyError at 100 or more).
label_counts = {}

# Largest number of predicted languages seen for a single speaker
max_length = 0

# Read data from input CSV file
with open(input_csv_file, mode='r', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        predicted_languages = row['Predicted_Languages'].split(', ')
        num_predicted_languages = len(predicted_languages)

        # File the speaker under its number of predicted labels
        label_counts.setdefault(num_predicted_languages, []).append(row['SpeakerID'])

        # Update the maximum length
        if num_predicted_languages > max_length:
            max_length = num_predicted_languages

# Print the group sizes in ascending label count; the speaker IDs themselves
# are listed only for small groups (five or fewer).
for length, speaker_ids in sorted(label_counts.items()):
    if len(speaker_ids) <= 5:
        print(f"Number of occurrences with {length} predicted labels for a speaker {len(speaker_ids)}:{speaker_ids}")
    else:
        print(f"Number of occurrences with {length} predicted labels for a speaker {len(speaker_ids)}")


# Print the maximum length of predicted labels
print(f"Maximum no. of predicted labels: {max_length}")



Number of occurrences with 1 predicted labels for a speaker 5326
Number of occurrences with 2 predicted labels for a speaker 7845
Number of occurrences with 3 predicted labels for a speaker 7799
Number of occurrences with 4 predicted labels for a speaker 6918
Number of occurrences with 5 predicted labels for a speaker 5687
Number of occurrences with 6 predicted labels for a speaker 4606
Number of occurrences with 7 predicted labels for a speaker 3991
Number of occurrences with 8 predicted labels for a speaker 3328
Number of occurrences with 9 predicted labels for a speaker 2826
Number of occurrences with 10 predicted labels for a speaker 2282
Number of occurrences with 11 predicted labels for a speaker 1921
Number of occurrences with 12 predicted labels for a speaker 1750
Number of occurrences with 13 predicted labels for a speaker 1507
Number of occurrences with 14 predicted labels for a speaker 1302
Number of occurrences with 15 predicted labels for a speaker 1132
Number of occurrenc

# added the probability column

In [13]:
import csv
import os
from tqdm import tqdm

# Input folder containing CSV files
input_folder = "/data/Vaani/CSVs_with_link/"

# Output CSV file
output_csv_file = "speakerid_true_pred_different3.csv"

# Dictionary to store aggregated data for each speaker ID
speaker_data = {}


def process_csv_file(input_csv_file, store=None):
    """Fold the rows of one prediction CSV into the per-speaker aggregate.

    Each row must provide the columns ``File``, ``Vendor``,
    ``Asserted_Language``, ``Detected_Language`` and ``Probability``.
    State, district and speaker ID are fields 3, 4 and 5 of the
    underscore-separated base name of ``File``.

    For every predicted language the record keeps ``count``, a running
    ``probability_sum`` and ``probability`` - a one-element list holding the
    arithmetic mean of all probabilities seen so far, rounded to two
    decimals. The mean is derived from the running sum; averaging the
    previous mean with each new value would weight later rows more heavily.

    Args:
        input_csv_file (str): Path to the CSV file.
        store (dict | None): Mapping to update. When None, the module-level
            ``speaker_data`` dictionary is updated.
    """
    target = speaker_data if store is None else store

    with open(input_csv_file, mode='r', newline='') as file:
        reader = csv.DictReader(file)

        for row in reader:
            name_fields = os.path.basename(row['File']).split('_')
            speaker_id = name_fields[5]
            district = name_fields[4]
            state = name_fields[3]

            # Create the record the first time a speaker is seen
            if speaker_id not in target:
                target[speaker_id] = {
                    'Vendors': set(),
                    'Languages': {
                        'Asserted': {},
                        'Predicted': {}
                    },
                    'District': district,
                    'State': state
                }

            entry = target[speaker_id]
            entry['Vendors'].add(row['Vendor'])

            # Count asserted languages
            asserted_language = row['Asserted_Language']
            asserted = entry['Languages']['Asserted']
            asserted[asserted_language] = asserted.get(asserted_language, 0) + 1

            # Count predicted languages and accumulate their probabilities
            predicted_language = row['Detected_Language']
            probability = float(row['Probability'])

            predicted = entry['Languages']['Predicted']
            if predicted_language not in predicted:
                predicted[predicted_language] = {
                    'count': 0,
                    'probability': [],
                    'probability_sum': 0.0
                }

            info = predicted[predicted_language]
            info['count'] += 1
            info['probability_sum'] += probability
            # Kept as a one-element list because consumers join this list.
            info['probability'] = [round(info['probability_sum'] / info['count'], 2)]

# Outer progress bar over the subfolders, inner one over each subfolder's entries
for folder in tqdm(os.listdir(input_folder), desc="Processing"):
    folder_path = os.path.join(input_folder, folder)
    # Skip stray files sitting next to the subfolders.
    if not os.path.isdir(folder_path):
        continue

    for filename in tqdm(os.listdir(folder_path), desc="Processing CSVs"):
        if filename.endswith(".csv"):
            input_csv_file = os.path.join(folder_path, filename)
            process_csv_file(input_csv_file)


# Write aggregated data to output CSV file
with open(output_csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['SpeakerID', 'Vendors', 'Asserted_Language', 'Files_count',
                     'Predicted_Languages', 'Predicted_Language_Counts', 'Predicted_Language_Probabilities_mean',
                     'District', 'State'])
    for speaker_id, data in speaker_data.items():
        # Sorted so the column does not depend on set iteration order.
        vendors = ', '.join(sorted(data['Vendors']))
        asserted_languages = ', '.join(data['Languages']['Asserted'].keys())
        asserted_languages_count = ', '.join(str(count) for count in data['Languages']['Asserted'].values())

        # Three parallel lists: position i of each describes the same language.
        predicted_languages = []
        predicted_language_counts = []
        predicted_language_probabilities = []
        for language, info in data['Languages']['Predicted'].items():
            predicted_languages.append(language)
            predicted_language_counts.append(str(info['count']))
            predicted_language_probabilities.append(', '.join([str(prob) for prob in info['probability']]))

        predicted_languages = ', '.join(predicted_languages)
        predicted_language_counts = ', '.join(predicted_language_counts)
        predicted_language_probabilities = ', '.join(predicted_language_probabilities)

        writer.writerow([speaker_id, vendors, asserted_languages, asserted_languages_count,
                         predicted_languages, predicted_language_counts, predicted_language_probabilities,
                         data['District'], data['State']])


Processing CSVs: 100%|██████████| 23/23 [00:03<00:00,  7.54it/s]
Processing CSVs: 100%|██████████| 36/36 [01:19<00:00,  2.21s/it]
Processing: 100%|██████████| 2/2 [01:22<00:00, 41.29s/it]


# build the CSV file that summarises the information per asserted language

In [19]:
import csv
from collections import defaultdict
import os

# Function to extract state and district from filename
def extract_state_and_district(filename):
    """Return ``(state, district)`` parsed from an underscore-separated name.

    The state is field 3 and the district field 4 of ``filename`` split on
    ``_``. A name with fewer than five fields yields two empty strings.
    """
    pieces = filename.split("_")
    if len(pieces) < 5:
        return '', ''
    return pieces[3], pieces[4]

# Per asserted language: vendors seen, number of rows, per-detected-language
# tallies and probabilities, and the districts/states parsed from file names.
combined_data = defaultdict(lambda: {
    'vendors': set(),
    'file_count': 0,
    'predicted_language_count': defaultdict(int),
    'predicted_language_probabilities': defaultdict(list),
    'districts': set(),
    'states': set(),
})

# Directory containing CSV files
input_directory = "/data/Vaani/CSVs/CSVs_with_link/Not_supported_facebook"

# Directory name minus its last "_"-separated part; identical for every
# output row, so it is computed once here.
support_status = os.path.basename(input_directory).rsplit('_', 1)[0]

# Process each CSV file in the directory
for filename in os.listdir(input_directory):
    if filename.endswith(".csv"):
        # newline='' is what the csv module expects for files it parses.
        with open(os.path.join(input_directory, filename), "r", newline='') as input_file:
            csv_reader = csv.DictReader(input_file)

            for row in csv_reader:
                asserted_language = row['Asserted_Language']
                vendor = row['Vendor']
                detected_language = row['Detected_Language']
                probability = float(row['Probability'])
                file_url = row['File']

                # Extract filename from file URL
                file_name = file_url.split("/")[-1]

                state, district = extract_state_and_district(file_name)
                entry = combined_data[asserted_language]
                entry['vendors'].add(vendor)
                entry['file_count'] += 1
                entry['predicted_language_count'][detected_language] += 1
                entry['predicted_language_probabilities'][detected_language].append(probability)
                entry['districts'].add(district)
                entry['states'].add(state)

# Replace each list of probabilities with its mean. Only values of existing
# keys are reassigned, so iterating over items() stays valid.
for asserted_language, data in combined_data.items():
    for detected_language, probabilities in data['predicted_language_probabilities'].items():
        mean_probability = sum(probabilities) / len(probabilities)
        data['predicted_language_probabilities'][detected_language] = mean_probability

# Write the combined data to a new CSV file
output_file_path = "combined_output.csv"
with open(output_file_path, "w", newline='') as output_file:
    fieldnames = ['Asserted_Language', 'Support_Status', 'Vendor', 'File_Count', 'Predicted_Language_Count', 'Predicted_Language_Probabilities', 'District', 'State']
    csv_writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    csv_writer.writeheader()

    for asserted_language, data in combined_data.items():
        csv_writer.writerow({
            'Asserted_Language': asserted_language,
            'Support_Status': support_status,
            # Set-backed columns are sorted so reruns give identical files.
            'Vendor': ','.join(sorted(data['vendors'])),
            'File_Count': data['file_count'],
            'Predicted_Language_Count': ','.join([f"{lang}:{count}" for lang, count in data['predicted_language_count'].items()]),
            'Predicted_Language_Probabilities': ','.join([f"{lang}:{prob:.2f}" for lang, prob in data['predicted_language_probabilities'].items()]),
            'District': ','.join(sorted(data['districts'])),
            'State': ','.join(sorted(data['states']))
        })

print(f"Combined data written to {output_file_path}")


Combined data written to combined_output.csv


# confusion matrix from the above csv

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the per-asserted-language summary CSV
data = pd.read_csv("/data/Vaani/CSVs/Assert_lan_wise_info_supported.csv")

# language_counts[asserted_language][predicted_language] -> count
language_counts = {}

# Each 'Predicted_Language_Count' cell is parsed as "lang:count,lang:count,..."
for index, row in data.iterrows():
    asserted_language = row['Asserted_Language']
    predicted_languages = row['Predicted_Language_Count'].split(',')
    for pred_lang in predicted_languages:
        lang_count = pred_lang.split(':')
        predicted_language = lang_count[0]
        count = int(lang_count[1])
        if asserted_language not in language_counts:
            language_counts[asserted_language] = {}
        language_counts[asserted_language][predicted_language] = count

# A dict of dicts becomes a frame whose columns are the outer keys (asserted
# languages) and whose index is the inner keys (predicted languages), so no
# transpose is needed; pairs that never occurred are filled with 0.
matrix_df = pd.DataFrame(language_counts).fillna(0).astype(int)

# Save the matrix to a CSV file
matrix_df.to_csv("/data/Vaani/CSVs/Asserted_vs_Predicted_Language_Counts.csv")



## visualize the above 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the per-asserted-language summary
data = pd.read_csv("/data/Vaani/CSVs/Assert_lan_wise_info_supported.csv")

# Nested mapping: asserted language -> {predicted language -> count}
language_counts = {}

# Parse every "lang:count" item of the comma-separated count column
for _, record in data.iterrows():
    asserted = record['Asserted_Language']
    for item in record['Predicted_Language_Count'].split(','):
        pieces = item.split(':')
        predicted = pieces[0]
        tally = int(pieces[1])
        language_counts.setdefault(asserted, {})[predicted] = tally

# Columns are asserted languages, rows are predicted languages; gaps become 0
matrix_df = pd.DataFrame(language_counts).fillna(0).astype(int)

# Draw the matrix as an annotated heatmap
plt.figure(figsize=(25, 300))
sns.heatmap(matrix_df, cmap="YlGnBu", annot=True, fmt="d", linewidths=0.5)
plt.title("Asserted vs Predicted Language Counts")
plt.xlabel("Asserted Language")
plt.ylabel("Predicted Language")
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()


# normalize each column of the csv (every asserted-language column is divided by its own sum)

In [5]:
import pandas as pd

# Load the count matrix
df = pd.read_csv('/data/Vaani/CSVs/Asserted_vs_Predicted_Language_Counts_supported.csv')

# Every column after the first one (which holds the language names)
columns = df.columns[1:]

# Per-column totals used as divisors
col_sums = df[columns].sum()

# Build a copy in which each of those columns is divided by its own total
normalized_df = df.assign(**{col: df[col] / col_sums[col] for col in columns})

# Save the normalized matrix without the index column
normalized_df.to_csv('normalized_data.csv', index=False)


# most frequent mismatch pair  pie charts

In [None]:
import csv
import matplotlib.pyplot as plt

# Read the first data column: one fraction per row label (first cell)
data = {}
with open('/data/Vaani/CSVs/Asserted_vs_Predicted_Language_Counts_not_supported_normalized.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    header_languages = next(reader)[1:]

    for row in reader:
        # Blank lines come back as empty rows; skip them.
        if not row:
            continue
        language = row[0]
        percentage = float(row[1])
        data[language] = percentage

# Convert fractions to percentages and keep the slices at or above threshold
threshold = 2  # Threshold percentage for inclusion in the chart
percentages = {lang: count * 100 for lang, count in data.items() if (count * 100) >= threshold}

# Everything below the threshold is lumped into 'Other'. The small tolerance
# keeps floating-point residue from creating an empty 'Other' slice.
other_percentage = 100 - sum(percentages.values())
if other_percentage > 1e-9:
    percentages['Other'] = other_percentage

# Define a color palette
colors = plt.cm.tab20.colors  # Example color palette

# Pie chart with rotated labels
plt.figure(figsize=(15, 15))
plt.pie(percentages.values(), labels=percentages.keys(), autopct='%1.1f%%', colors=colors, textprops={'rotation': 40, 'ha': 'right'})

# The data file read above is the "not_supported" one, so the title says so.
plt.title(f'Percentage of Times {header_languages[0]} is Predicted as Different Languages (not supported)')
plt.show()


In [None]:
import csv
import matplotlib.pyplot as plt

# Read the normalized matrix once; the file does not change between charts.
with open('/data/Vaani/CSVs/Asserted_vs_Predicted_Language_Counts_supported_normalized.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    header_languages = next(reader)[1:]
    # Blank lines come back as empty rows; drop them.
    rows = [row for row in reader if row]

# One chart per data column; the column count comes from the header rather
# than a hard-coded 36, so the loop follows the file.
for i in range(1, len(header_languages) + 1):
    # Fraction per row label (first cell) for column i
    data = {row[0]: float(row[i]) for row in rows}

    # Convert fractions to percentages and keep the slices at or above threshold
    threshold = 1.5  # Threshold percentage for inclusion in the chart
    percentages = {lang: count * 100 for lang, count in data.items() if (count * 100) >= threshold}

    # Everything below the threshold is lumped into 'Other'
    other_percentage = 100 - sum(percentages.values())
    if other_percentage > 0:
        percentages['Other'] = other_percentage

    # Define a color palette
    colors = plt.cm.tab20.colors  # Example color palette

    # Pie chart with rotated labels
    plt.figure(figsize=(15, 15))
    plt.pie(percentages.values(), labels=percentages.keys(), autopct='%1.1f%%', colors=colors, textprops={'rotation': 40, 'ha': 'right'})

    plt.title(f'Percentage of Times {header_languages[i-1]} is Predicted as Different Languages (supported)')
    plt.show()

# retrieve the files from the csv whose predicted language is totally different from the asserted one, for example Tamil predicted as Hindi

In [34]:
import pandas as pd
import os

# Path of the prediction CSV to turn into a manual-review sheet
csv_dir = "/data/Vaani/CSVs/CSVs_with_link/Supported_facebook/SANTALI_predicted_labels.csv"

# Read the CSV file
df = pd.read_csv(csv_dir)

# Optional filters on Detected_Language; by default every row is kept
#bengali_df = df[(df['Detected_Language'] == 'Hindi')].copy()
#bengali_df = df[(df['Detected_Language'] == 'Bengali')].copy()
#bengali_df = df[(df['Detected_Language'] == 'Kannada') | (df['Detected_Language'] == 'Telugu')].copy()
# Work on a copy so the columns added below do not leak into df.
bengali_df = df.copy()

# The district is field 4 of the underscore-separated file name. Take the
# base name first so underscores in the directory part of 'File' cannot
# shift the field index.
bengali_df['District'] = bengali_df['File'].apply(lambda x: os.path.basename(x).split('_')[4])

# Drop the Probability and Detected_Language columns
bengali_df = bengali_df.drop(['Probability', 'Detected_Language'], axis=1)

# Add an empty column named "Claim_Correct/Incorrect"
bengali_df['Claim_Correct/Incorrect'] = ''

# Output name from the first "_" field of the input file name,
# e.g. "SANTALI_..." -> "Assert_Santali.csv"
filename = "Assert_" + os.path.basename(csv_dir).split('_')[0].capitalize() + ".csv"

output_dir = "/data/Vaani/CSVs/audio_files_can_be_wrong/supported/" + filename

# Write the resulting DataFrame to a new CSV file
bengali_df.to_csv(output_dir, index=False)
