# retreving the max group size


In [1]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('/data/Root_content/Vaani/audio_content_analysis/unique_file_names.csv')

# Group by 'File Size with Header (bytes)' and calculate the size of each group
group_sizes = df.groupby('File Size with Header (bytes)').size()

# Retrieve the maximum group size
max_group_size = group_sizes.max()

print(f"The maximum group size is: {max_group_size}")

The maximum group size is: 4204


In [3]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('/data/Root_content/Vaani/audio_content_analysis/unique_file_names.csv')

# Group by 'File Size with Header (bytes)' and calculate the size of each group
group_sizes = df.groupby('File Size with Header (bytes)').size()

# Count the occurrences of each group size
group_size_occurrences = group_sizes.value_counts()

# Convert to a DataFrame for easier viewing
group_size_occurrences_df = group_size_occurrences.reset_index(name='Occurrences')
group_size_occurrences_df.columns = ['Group Size', 'Occurrences']

# Sort the DataFrame by 'Group Size' in ascending order (use ascending=False for descending)
sorted_group_size_occurrences_df = group_size_occurrences_df.sort_values(by='Group Size', ascending=False)

# Save to a CSV file if needed
sorted_group_size_occurrences_df.to_csv('sorted_group_size_occurrences.csv', index=False)

print("Occurrences of each group size (sorted) saved to 'sorted_group_size_occurrences.csv'")
print(sorted_group_size_occurrences_df)


Occurrences of each group size (sorted) saved to 'sorted_group_size_occurrences.csv'
      Group Size  Occurrences
1226        4204            1
1232        3393            1
1163        3225            1
1467        3191            1
1494        3186            1
...          ...          ...
4              5         2772
3              4         5314
2              3        11052
1              2        23728
0              1        49149

[1519 rows x 2 columns]


# reterive all the files with group size 4204 and byte size 87314 

In [13]:
import pandas as pd
import os
import shutil

# Load the CSV file
df = pd.read_csv('/data/Root_content/Vaani/audio_content_analysis/audio_file_info.csv')

# Filter the DataFrame where "File Size with Header (bytes)" equals 87314
filtered_df = df[df['File Size with Header (bytes)'] == 87314]
columns_to_drop = ['Sample Rate', 'Channels', 'Bit Depth', 'Audio Data Size without Header (bytes)', 'Header Size (bytes)']
filtered_df = filtered_df.drop(columns=columns_to_drop)

filtered_df.to_csv("/data/Vaani/Dataset/Group_wise/group_size_87314/group_size_87314.csv", index=False)
# Specify the source base directory where the files are located
source_base_dir = '/data/Vaani/Dataset/Audios_all_district_vaani_3' 

# Specify the destination directory where the files will be copied
destination_dir = '/data/Vaani/Dataset/Group_wise/group_size_87314'

os.makedirs(destination_dir, exist_ok= True)

for filename in filtered_df['File Name']:
    folder_name = filename.split('_')[4]
    source_folder_path = os.path.join(source_base_dir, folder_name)

    source_file_path = os.path.join(source_folder_path, filename)

    destination_file_path = os.path.join(destination_dir, filename)

    if os.path.exists(source_file_path):
        shutil.copy(source_file_path, destination_file_path)
    # else :
    #     print("file_not_found", source_file_path)

print("Files have been copied successfully.")


Files have been copied successfully.


# make the csv file which contains all the file names


In [1]:
import os
import csv

# Specify the directory containing .wav files
directory = '/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/files'

# Create a list to hold the filenames
filenames = []

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".wav"):
        filenames.append(filename)

# Specify the CSV file to save the filenames
csv_filename = '/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/audio_filenames.csv'

# Write the filenames to the CSV file
with open(csv_filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Filename"])  # Writing the header
    for name in filenames:
        writer.writerow([name])

print(f"CSV file '{csv_filename}' has been created with {len(filenames)} filenames.")


CSV file '/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/audio_filenames.csv' has been created with 3700 filenames.


# retreving the embedding of the audio files using wav2vec

## using cpu

In [None]:
import pandas as pd
import os
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa

# Load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Folder containing audio files
folder_path = "/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/files"

# Load the CSV file
csv_file_path = "/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/audio_filenames.csv"
df = pd.read_csv(csv_file_path)

# Function to get transcription
def get_transcription(file_name):
    file_path = os.path.join(folder_path, file_name)
    if os.path.exists(file_path):
        audio, sr = librosa.load(file_path, sr=None)
        input_values = processor(audio, return_tensors="pt", padding="longest").input_values  # Batch size 1
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return transcription
    return None

# Add a new column for transcriptions
df['Transcription'] = df['Filename'].apply(get_transcription)

# Save the updated CSV file
df.to_csv("/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/audio_transcription.csv", index=False)


In [5]:
import pandas as pd
import os
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa

# Load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Folder containing audio files
file_path = "/data/Vaani/Dataset/Group_wise/group_size_87314/IISc_VaaniProject_M_UP_Etah_Nira60087_1313120000_APVCYR_74609_12416_15104.wav"

# Function to get transcription
def get_transcription(file_path):
    if os.path.exists(file_path):
        audio, sr = librosa.load(file_path, sr=None)
        input_values = processor(audio, return_tensors="pt", padding="longest").input_values  # Batch size 1
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return transcription
    return None

get_transcription(file_path)
# "ELEGAP THE FLEW BUT THE BARRAN"

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

'BLOACA PSYCHE CUDIN DEE'

## using gpu

In [None]:
import pandas as pd
import os
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import librosa

# Load model and processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Folder containing audio files
folder_path = "/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/files"

# Load the CSV file
csv_file_path = "/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/audio_filenames.csv"
df = pd.read_csv(csv_file_path)

# Function to get transcription
def get_transcription(file_name):
    file_path = os.path.join(folder_path, file_name)
    if os.path.exists(file_path):
        audio, sr = librosa.load(file_path, sr=None)
        input_values = processor(audio, return_tensors="pt", padding="longest").input_values  # Batch size 1
        input_values = input_values.to(device)
        logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        return transcription
    return None

# Add a new column for transcriptions
df['Transcription'] = df['Filename'].apply(get_transcription)

# Save the updated CSV file
df.to_csv("/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/audio_transcription.csv", index=False)


# FAISS

## L2 distance

In [62]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
import numpy as np

# Load the CSV file containing transcriptions
csv_file_path = "/data/Vaani/Dataset/Group_wise/group_size_87314/group_size_87314_transcription.csv"
df = pd.read_csv(csv_file_path)

# Initialize the Sentence Transformer model
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Generate embeddings for each transcription
transcriptions = df['Transcription'].tolist()
embedding_vectors = model.encode(transcriptions)

# Initialize FAISS index
dimension = embedding_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embedding_vectors)

# Create a document store and index-to-docstore ID mapping
docstore = InMemoryDocstore({str(idx): Document(page_content=transcription) for idx, transcription in enumerate(transcriptions)})
index_to_docstore_id = {i: str(i) for i in range(len(transcriptions))}

# Create FAISS vector store
vector_store = FAISS(
    embedding_function=lambda x: model.encode(x),
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

# Perform a similarity search with scores
query = "ITTON YO TO EXPS PA LONOCE"
results = vector_store.similarity_search_with_score(query, k=5)

# Output results with similarity scores
for res, score in results:
    print(f"* [SIM={score:.6f}] {res.page_content}")


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


* [SIM=0.000000] ITTON YO TO EXPS PA LONOCE
* [SIM=0.946985] LOP ACTEHE OR
* [SIM=1.028532] ALL THE EXAINBARD ADDE YE
* [SIM=1.030308] SO YO NO AT EGOUL LOOK AT FOR IKE GOIN TO PLAS AT A THE LOG
* [SIM=1.039297] E GO WEN NI TOE TO THE LAY PANGE PA


## cosine similarity

In [67]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
import numpy as np

# Load the CSV file containing transcriptions
csv_file_path = "/data/Vaani/Dataset/Group_wise/group_size_87314/group_size_87314_transcription.csv"
df = pd.read_csv(csv_file_path)

# Initialize the Sentence Transformer model
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Generate embeddings for each transcription
transcriptions = df['Transcription'].tolist()
embedding_vectors = model.encode(transcriptions)

# Normalize embeddings to use cosine similarity
embedding_vectors = embedding_vectors / np.linalg.norm(embedding_vectors, axis=1, keepdims=True)

# Initialize FAISS index for inner product (cosine similarity)
dimension = embedding_vectors.shape[1]
index = faiss.IndexFlatIP(dimension)  # Use IndexFlatIP for cosine similarity
index.add(embedding_vectors)

# Create a document store and index-to-docstore ID mapping
docstore = InMemoryDocstore({str(idx): Document(page_content=transcription) for idx, transcription in enumerate(transcriptions)})
index_to_docstore_id = {i: str(i) for i in range(len(transcriptions))}

# Create FAISS vector store
vector_store = FAISS(
    embedding_function=lambda x: model.encode(x),
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

# Perform a similarity search with scores
query = "ITTON YO TO EXPS PA LONOCE"
results = vector_store.similarity_search_with_score(query, k=5)

# Output results with similarity scores
for res, score in results:
    print(f"* [SIM={score:.3f}] {res.page_content}")


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


* [SIM=1.000] ITTON YO TO EXPS PA LONOCE
* [SIM=0.527] LOP ACTEHE OR
* [SIM=0.486] ALL THE EXAINBARD ADDE YE
* [SIM=0.485] SO YO NO AT EGOUL LOOK AT FOR IKE GOIN TO PLAS AT A THE LOG
* [SIM=0.480] E GO WEN NI TOE TO THE LAY PANGE PA


## applying FAISS on all the files in the group of size 87314

In [7]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.documents import Document
import numpy as np

# Load the CSV file containing transcriptions
csv_file_path = "/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/audio_transcription.csv"
df = pd.read_csv(csv_file_path)


# Initialize the Sentence Transformer model
model_name = "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

# Generate embeddings for each transcription
transcriptions = df['Transcription'].tolist()
filenames = df['Filename'].tolist()
embedding_vectors = model.encode(transcriptions)

# Normalize embeddings to use cosine similarity
embedding_vectors = embedding_vectors / np.linalg.norm(embedding_vectors, axis=1, keepdims=True)

# Initialize FAISS index for inner product (cosine similarity)
dimension = embedding_vectors.shape[1]
index = faiss.IndexFlatIP(dimension)  # Use IndexFlatIP for cosine similarity
index.add(embedding_vectors)

# Create a dictionary with filenames as keys and transcriptions as values
transcription_dict = {filename: transcription for filename, transcription in zip(filenames, transcriptions)}

# Create a reverse dictionary to map transcriptions back to filenames
reverse_transcription_dict = {transcription: filename for filename, transcription in transcription_dict.items()}

# Create a document store with filenames as keys
docstore = InMemoryDocstore({filename: Document(page_content=transcription) for filename, transcription in transcription_dict.items()})
index_to_docstore_id = {i: filename for i, filename in enumerate(filenames)}

# Create FAISS vector store
vector_store = FAISS(
    embedding_function=lambda x: model.encode(x),
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

# Prepare a list to store results
results_list = []

# Perform a similarity search for each transcription
for filename, query in transcription_dict.items():
    try:
        search_results = vector_store.similarity_search_with_score(query, k=2)  # k=2 to get top 2 similar transcriptions
        
        for res, score in search_results:
            # Extract the transcription from the result
            result_transcription = res.page_content
            
            # Use the reverse dictionary to get the filename for the result transcription
            result_filename = reverse_transcription_dict.get(result_transcription, None)
            
            if result_filename is None:
                print(f"Result transcription not found in reverse dictionary: {result_transcription}")
                continue
            
            if result_filename == filename:  # Skip the result of the query itself
                continue
            
            filename_link = "https://vaani.iisc.ac.in/Audios/" + filename.split('_')[4] + "/" + filename
            result_filename_link = 'https://vaani.iisc.ac.in/Audios/' + result_filename.split('_')[4] + '/' + result_filename
            
            result_row = {
                "Filename_1": filename,
                "Filename_2": result_filename,
                "Similarity_score": round(score,3)
            }
            results_list.append(result_row)
    except Exception as e:
        print(f"Skipping file {filename} due to error: {e}")

# Save the results to a CSV file
results_df = pd.DataFrame(results_list)
results_df.to_csv("/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/similarity_results.csv", index=False)


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


# remove duplicates from the csv file

In [9]:
import pandas as pd

# Load the CSV file
file_path = '/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/similarity_results.csv'
df = pd.read_csv(file_path)

# Create a set to track unique pairs
unique_pairs = set()

# Function to create an unordered pair as a tuple
def make_pair(row):
    return tuple(sorted([row['Filename_1'], row['Filename_2']]))

# Identify and remove duplicate pairs
filtered_rows = []
for _, row in df.iterrows():
    pair = make_pair(row)
    if pair not in unique_pairs:
        unique_pairs.add(pair)
        filtered_rows.append(row)

# Create a new DataFrame with the filtered rows
filtered_df = pd.DataFrame(filtered_rows)

# Save the cleaned data back to a new CSV file
output_path = '/data/Root_content/Vaani/audio_content_analysis/audio_matched_files/similarity_results_unique_pair.csv'
filtered_df.to_csv(output_path, index=False)

print(f"Cleaned CSV saved to {output_path}")


Cleaned CSV saved to /data/Root_content/Vaani/audio_content_analysis/audio_matched_files/similarity_results_unique_pair.csv


# finding the number of common files from refernce and 60 lakhs files

In [1]:
import pandas as pd

# Load both CSV files into dataframes
df1 = pd.read_csv('/data/Root_content/Vaani/audio_content_analysis/audio_analysis_all_group/audio_transcription_without_batch/audio_transcription_csv/audio_link_trancription_without_batching_all_removed_duplicates.csv')
df2 = pd.read_csv('/data/Root_content/Vaani/audio_content_analysis/audio_matched_files_tsv/audio_transcription.csv')


print(df1)

def clean_filename(filename):
    return '_'.join(filename.split('/')[-1].split('_')[3:])

# clean_filename("/data/Vaani/Dataset/Audios_all_district_vaani_1/Sukma/IISc_VaaniProject_S_Chhattisgarh_Sukma_114191_12208498_APCTFLM_226630_7402_11502.wav")

# Apply the cleaning function to the filenames in df2
df1['cleaned_filename'] = df1['File Name'].apply(clean_filename)

# Now merge based on the cleaned filenames from df2 and filenames from df1
common_files = pd.merge(df2[['Filename']], df1[['cleaned_filename']], left_on='Filename', right_on='cleaned_filename')

# Count how many files are common
common_count = common_files.shape[0]

print(f'Total number of common files: {common_count}')


                                                 File Name   Duration  \
0        https://vaani.iisc.ac.in/Audios/Sukma/IISc_Vaa...   4.100000   
1        https://vaani.iisc.ac.in/Audios/Sukma/IISc_Vaa...   5.560000   
2        https://vaani.iisc.ac.in/Audios/Sukma/IISc_Vaa...   9.192000   
3        https://vaani.iisc.ac.in/Audios/Sukma/IISc_Vaa...   8.775000   
4        https://vaani.iisc.ac.in/Audios/Sukma/IISc_Vaa...  12.440000   
...                                                    ...        ...   
5908590  https://vaani.iisc.ac.in/Audios/North24P/IISc_...   7.093312   
5908591  https://vaani.iisc.ac.in/Audios/North24P/IISc_...   5.418687   
5908592  https://vaani.iisc.ac.in/Audios/North24P/IISc_...   2.069313   
5908593  https://vaani.iisc.ac.in/Audios/North24P/IISc_...   3.914687   
5908594  https://vaani.iisc.ac.in/Audios/North24P/IISc_...   5.589313   

         Byte Size                                      Transcription  
0           132498  O E DEY WE TIO SO POR OMO A HAI