In [None]:
!pip install datasets -q


In [None]:

from sklearn.manifold import TSNE

import torch

from transformers import Wav2Vec2Model
import torchaudio


import librosa

from datasets import load_from_disk

import numpy as np
import pandas as pd
from scipy.stats import zscore
from librosa.sequence import dtw as lib_dtw
from datasets import load_dataset


In [None]:

from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login.
# NOTE(review): presumably required so load_dataset() below can reach the CAAP
# repository — confirm whether that dataset is gated/private.
notebook_login()


In [None]:
# Load the dataset identified by the repository id "juanfengyun/CAAP".
dataset = load_dataset("juanfengyun/CAAP")

In [None]:
# Display the loaded dataset object (its 'train' entry is selected next).
dataset

In [None]:
# Work with the 'train' entry only; `ds` is what every later cell indexes.
ds = dataset['train']
ds

In [None]:

# Local path of every recording: the fixed audio directory followed by the
# relative path stored under each row's 'filePath' -> 'path' entry.
wav_paths = [
    'CAAP_2023-04-27/wavs/audios/' + ds[row]['filePath']['path']
    for row in range(len(ds))
]


In [None]:
# Display the full list of audio file paths built above.
wav_paths

In [None]:
seed = 31415  # passed as random_state to t-SNE further down
time_frame = 5  # time_txt() returns a label only for every time_frame-th frame

expected_sr = 16000  # audio loaded with a different sampling rate is resampled to this

In [None]:
time_frame = 5


def time_txt(time, time_frame=5, seconds_per_frame=0.02):
    """Return a sparse time label for a frame index.

    Parameters
    ----------
    time : int
        Frame index within an utterance.
    time_frame : int, default 5
        A label is produced only when ``time`` is a multiple of this value.
    seconds_per_frame : float, default 0.02
        Factor that converts a frame index into the displayed value.
        NOTE(review): 0.02 is presumably the 20 ms frame stride of the
        wav2vec 2.0 features — confirm.

    Returns
    -------
    str
        ``time * seconds_per_frame`` rounded to two decimals and formatted as
        text, or an empty string for frames that are not labelled.
    """
    if time % time_frame != 0:
        return ""
    return f"{round(time * seconds_per_frame, 2)}"

def create_df(feats, speaker_len, names,textId):
    """Build a frame-level DataFrame from the stacked features of several utterances.

    Parameters
    ----------
    feats : 2-D array
        One row per frame; the frames of all utterances stacked in order.
    speaker_len : sequence of int
        Number of frames contributed by each utterance, in stacking order.
    names : sequence
        Speaker label of each utterance (indexed by utterance position).
    textId : sequence
        Text id of each utterance; every entry is converted with ``int()``.

    Returns
    -------
    tuple
        ``(data_subset, df_subset, cols)``: the feature values as an array, the
        DataFrame with the feature columns plus the bookkeeping columns
        ``idx``, ``speaker_id``, ``speaker_len``, ``com_sum``, ``speaker``,
        ``textId``, ``time`` and ``time_txt``, and the list of feature column
        names (``"val 0"``, ``"val 1"``, ...).

    Raises
    ------
    ValueError
        If the utterance lengths do not add up to the number of feature rows.
    """
    lengths = np.asarray(speaker_len, dtype=np.int64)
    # Validate first: a mismatch would otherwise surface later as NaN speaker
    # ids or an unrelated lookup error.
    if int(lengths.sum()) != feats.shape[0]:
        raise ValueError(
            f"speaker_len sums to {int(lengths.sum())} frames but feats has {feats.shape[0]} rows"
        )

    cols = [f"val {i}" for i in range(feats.shape[1])]
    df = pd.DataFrame(feats, columns=cols)
    df['idx'] = df.index

    # Utterance index of every frame, and the global row at which each utterance starts.
    utterance_ids = np.repeat(np.arange(len(lengths)), lengths)
    start_offsets = np.cumsum(lengths) - lengths

    df['speaker_id'] = utterance_ids
    df['speaker_len'] = lengths[utterance_ids]
    df['com_sum'] = start_offsets[utterance_ids]
    df['speaker'] = df['speaker_id'].apply(lambda i: names[i])
    df['textId'] = df['speaker_id'].apply(lambda i: int(textId[i]))
    # Frame position relative to the start of its own utterance.
    df['time'] = df['idx'] - df['com_sum']
    df['time_txt'] = df['time'].apply(lambda t: time_txt(t, time_frame))
    assert len(df.loc[df['speaker'] == -1]) == 0
    return df[cols].values, df, cols

In [None]:
# Load every recording as a (1, num_samples) tensor at expected_sr.
wavs = []
for wav_path in wav_paths:
    print(wav_path)
    wav, sr = torchaudio.load(wav_path)
    if wav.dim() == 2 and wav.shape[0] > 1:
        # Downmix multi-channel audio to mono; otherwise the model would treat
        # the channels as a batch and the per-file feature stacking would fail.
        wav = wav.mean(dim=0, keepdim=True)
    if sr != expected_sr:
        print(f"Sampling rate of {wav_path} is not {expected_sr} -> Resampling the file")
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=expected_sr)
        wav = resampler(wav)
    # The leading channel dimension is kept on purpose: it is the batch
    # dimension of the model input (the original `wav.squeeze()` discarded its
    # result and therefore had no effect).
    wavs.append(wav)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device_name = device.type
print(f'Running on {device_name}')


# Index of the hidden-state layer whose activations are used as features.
layer = 22

# Speaker label and text id are parsed from each file name:
#   speaker = first two characters of the part after the first '-'
#   text id = last two characters before the file extension
filenames = ds['Filename']
names = [fname.split('-')[1][:2] for fname in filenames]
textId = [fname.rsplit('.', 1)[0][-2:] for fname in filenames]

print("Speaker names: ", names)

# Extract the hidden states of `layer` for every recording and stack them into
# one frame-level feature matrix.
dfs = []
print("Layer: ", layer)

if not wavs:
    raise ValueError("No waveforms were loaded; nothing to extract features from.")

# Run the model on the selected device, in inference mode.
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large").to(device)
model.eval()

per_wav_features = []
with torch.no_grad():  # no autograd graph is needed for feature extraction
    for wav in wavs:
        outputs = model(wav.to(device), return_dict=True, output_hidden_states=True)
        # squeeze(0) removes only the batch dimension, so a length-1 time axis survives.
        wav_features = outputs.hidden_states[layer].squeeze(0).cpu().numpy()
        per_wav_features.append(wav_features)

# One concatenation at the end instead of re-copying the matrix on every iteration.
speaker_len = [utterance.shape[0] for utterance in per_wav_features]
features = np.concatenate(per_wav_features, axis=0)
num_features_per_frame = features.shape[1]
data_subset, df_subset, hubert_feature_columns = create_df(features, speaker_len, names,textId)
dfs.append(df_subset)

In [None]:
# Display the list of per-layer DataFrames built above.
dfs

In [None]:
# Load "ref_files_and_layers.csv" and display it.
# NOTE(review): Refs_Files_df is not referenced again in the visible cells.
Refs_Files_df = pd.read_csv("ref_files_and_layers.csv")
Refs_Files_df

In [None]:

# Column names under which fill_tsne() stores the t-SNE coordinates.
tsne_1 = 'tsne-3d-one'
tsne_2 = 'tsne-3d-two'
tsne_3 = 'tsne-3d-thr'

# Labelling stride consumed through time_txt(); same value as the earlier assignments.
time_frame=5
def mut_normalize_sequences(sq1, sq2, normalize: bool):
    """Z-score two feature sequences jointly, dimension by dimension.

    Both sequences are stacked along the frame axis, every feature dimension
    is standardised (population standard deviation) over the stacked frames,
    and the result is split back into the two sequences.

    Parameters
    ----------
    sq1, sq2 : array-like of shape (n_frames, n_dims)
        The two sequences; they must have the same number of dimensions.
    normalize : bool
        When false, ``sq1`` and ``sq2`` are returned untouched.

    Returns
    -------
    tuple
        The two (possibly normalised) sequences. Normalised output is always
        floating point, and a dimension that is constant over both sequences
        is mapped to 0 rather than NaN.
    """
    if not normalize:
        return sq1, sq2

    sq1 = np.asarray(sq1)
    sq2 = np.asarray(sq2)
    len_sq1 = sq1.shape[0]

    # Work in floating point: writing z-scores into an integer array would
    # silently truncate them.
    float_dtype = np.result_type(sq1.dtype, sq2.dtype, np.float32)
    stacked = np.concatenate((sq1, sq2), axis=0).astype(float_dtype, copy=False)

    center = stacked.mean(axis=0)
    spread = stacked.std(axis=0)

    # A dimension with (numerically) zero variance carries no information;
    # dividing by its spread would yield NaN/inf and poison the DTW cost.
    degenerate = spread <= np.finfo(float_dtype).eps * np.maximum(np.abs(center), 1.0)
    normalized = (stacked - center) / np.where(degenerate, 1.0, spread)
    normalized[:, degenerate] = 0.0

    return normalized[:len_sq1, :], normalized[len_sq1:, :]


def time_txt(time, time_frame=5, seconds_per_frame=0.02):
    """Return a sparse time label for a frame index.

    This cell re-defines ``time_txt``; the definition is kept so the cell can
    run on its own.

    Parameters
    ----------
    time : int
        Frame index within an utterance.
    time_frame : int, default 5
        A label is produced only when ``time`` is a multiple of this value.
    seconds_per_frame : float, default 0.02
        Factor that converts a frame index into the displayed value.
        NOTE(review): 0.02 is presumably the 20 ms frame stride of the
        wav2vec 2.0 features — confirm.

    Returns
    -------
    str
        ``time * seconds_per_frame`` rounded to two decimals and formatted as
        text, or an empty string for frames that are not labelled.
    """
    if time % time_frame != 0:
        return ""
    return f"{round(time * seconds_per_frame, 2)}"


def librosa_dtw(sq1, sq2):
    """Return the total DTW alignment cost between two sequences.

    Both inputs are transposed before being handed to librosa's ``dtw``; the
    value returned is the bottom-right entry of the first element of its
    result.
    """
    dtw_result = lib_dtw(sq1.T, sq2.T)
    accumulated_cost = dtw_result[0]
    return accumulated_cost[-1, -1]


def calc_distance(df_subset, speaker1, speaker2, cols):
    """Length-normalised DTW distance between the frames of two speakers.

    The rows of ``df_subset`` whose 'speaker' value equals ``speaker1`` /
    ``speaker2`` are selected, restricted to ``cols``, jointly z-scored and
    aligned with DTW. The alignment cost is divided by the combined number of
    frames of both speakers.

    Returns -1 when either speaker has no rows.
    NOTE(review): callers that average the results will include this -1
    sentinel in the mean — confirm that this is intended.
    """
    def _frames_of(speaker):
        return df_subset[df_subset['speaker'] == speaker][cols].to_numpy()

    seq_a = _frames_of(speaker1)
    seq_b = _frames_of(speaker2)

    # Shapes are printed to make empty selections easy to spot.
    print(f"Features for speaker {speaker1}: {seq_a.shape}")
    print(f"Features for speaker {speaker2}: {seq_b.shape}")

    if 0 in (seq_a.size, seq_b.size):
        print(f"Skipping distance calculation: Empty features for speaker {speaker1} or {speaker2}")
        return -1

    seq_a, seq_b = mut_normalize_sequences(seq_a, seq_b, True)
    total_frames = len(seq_a) + len(seq_b)
    return librosa_dtw(seq_a, seq_b) / total_frames



def tsne(data_subset, init='pca', early_exaggeration=12.0, lr='auto', n_comp=3, perplexity=40, iters=1000,
         random_state=None):
    """Project ``data_subset`` with t-SNE and return the embedding.

    Parameters
    ----------
    data_subset : array-like of shape (n_samples, n_features)
        Data to embed.
    init, early_exaggeration, perplexity, random_state
        Forwarded unchanged to ``sklearn.manifold.TSNE``.
    lr
        Forwarded as ``learning_rate``.
    n_comp : int
        Forwarded as ``n_components``.
    iters : int
        Number of optimisation iterations.

    Returns
    -------
    ndarray of shape (n_samples, n_comp)
    """
    import inspect  # local import: only needed for the version check below

    # Newer scikit-learn releases call the iteration count `max_iter` and have
    # deprecated/removed the older `n_iter`; use whichever this install accepts.
    accepted = inspect.signature(TSNE).parameters
    iter_kwarg = 'max_iter' if 'max_iter' in accepted else 'n_iter'

    reducer = TSNE(n_components=n_comp, verbose=1, perplexity=perplexity, init=init,
                   early_exaggeration=early_exaggeration,
                   learning_rate=lr, random_state=random_state,
                   **{iter_kwarg: iters})
    return reducer.fit_transform(data_subset)


def fill_tsne(df_subset, tsne_results):
    """Write the t-SNE coordinates into ``df_subset`` (in place) and return it.

    The first two embedding columns are always stored; the third one is
    stored only when the embedding has exactly three columns.
    """
    print(tsne_results[:, 0].shape)
    target_columns = [tsne_1, tsne_2]
    if tsne_results.shape[1] == 3:
        target_columns.append(tsne_3)
    for axis, column in enumerate(target_columns):
        df_subset[column] = tsne_results[:, axis]
    return df_subset


In [None]:
# Inspect the first DataFrame in dfs.
dfs[0]

In [None]:
# Reference speaker IDs (two-character strings) that every speaker is compared against.
Refs = ['01','06','10','13','18','26','27','31','36','40']

In [None]:
import numpy as np

# Take the 'Speaker' column of the dataset; `speakers` is iterated by the
# distance loop further down, `speakers_array` is only printed here.
speakers = ds['Speaker']
speakers_array = np.array(speakers)

# Print the NumPy array
print(speakers_array)

In [None]:
# Integer versions of the speaker IDs and of the reference speaker IDs.
speakers_int = list(map(int, speakers))
Refs_int = list(map(int, Refs))

# Show both converted lists.
print("Speakers as integers:", speakers_int)
print("Refs as integers:", Refs_int)

In [None]:
# Print every (speaker, reference) combination of the two integer ID lists.
for speaker in speakers_int:
    for ref in Refs_int:
        S1 = speaker
        S2 = ref
        print( f"Speaker 1: {S1}, Speaker 2: {S2}")

In [None]:
# Sanity check: the feature-extraction cell must have produced at least one DataFrame.
print(f"Number of DataFrames in dfs: {len(dfs)}")
assert len(dfs) > 0, "The dfs list is empty. Ensure the processing loop is working correctly."

In [None]:
# Distance between one example pair of speakers, in the full feature space and
# (when available) in t-SNE space.
S1 = '01'
S2 = '06'
print( f"Speaker 1: {S1}, Speaker 2: {S2}")

Fulldistance = calc_distance(df_subset, S1, S2, hubert_feature_columns)
print(f"Full Dim. Distance: {Fulldistance}")

cols = [tsne_1, tsne_2, tsne_3]
# The t-SNE columns are only added by fill_tsne() in a later cell; on a fresh
# top-to-bottom run they do not exist yet and selecting them would raise KeyError.
if set(cols).issubset(df_subset.columns):
    TSNEdistance = calc_distance(df_subset, S1, S2, cols)
    print(f"TSNE Dim. Distance: {TSNEdistance}")
else:
    TSNEdistance = None
    print("TSNE columns not present in df_subset yet; run the t-SNE cell first.")

In [None]:
# Compute a 3-D t-SNE embedding for every layer DataFrame and store the
# coordinates next to the original columns.
df_subsets = []
for i, layer_df in enumerate(dfs):
    print(f"Layer DataFrame {i}:")
    df_subset_orig = layer_df.copy()

    # Embed the model features only. Selecting every numeric column would also
    # feed bookkeeping columns (idx, speaker_id, speaker_len, com_sum, textId,
    # time) into t-SNE and distort the embedding.
    feature_frame = df_subset_orig[hubert_feature_columns]

    # Dropping rows here would misalign the embedding with the DataFrame it is
    # written back into, so missing values are reported instead.
    if feature_frame.isna().any().any():
        raise ValueError(f"Layer DataFrame {i} contains missing feature values.")

    data_subset = feature_frame.to_numpy()

    tsne_results = tsne(data_subset, init='pca', early_exaggeration=2.0, lr=100.0, n_comp=3, perplexity=40, iters=1000, random_state=seed)

    # Fill the TSNE results back into the copied DataFrame
    df_subset = fill_tsne(df_subset_orig, tsne_results)
    df_subsets.append(df_subset)

In [None]:
# Show the DataFrames with t-SNE columns and how many of them there are.
df_subsets,len(df_subsets)

In [None]:
import os

# to_csv() does not create missing directories, so make sure the target exists.
os.makedirs('csv', exist_ok=True)

cols = [tsne_1, tsne_2, tsne_3]
for i in range(len(df_subsets)):
    print(f"Layer DataFrame {i}:")
    # calc_distance() only reads from the DataFrame, so one reference per layer
    # is enough (no per-pair copy, no per-pair dump of the whole frame).
    df_subset_orig = df_subsets[i]
    assert len(df_subset_orig) > 0, f"DataFrame {i} is empty. Ensure the processing loop is working correctly."

    distances = []
    for speaker in speakers:
        for ref in Refs:
            # Skip comparing a reference speaker with itself; both sides are
            # cast to strings for the comparison.
            if str(speaker) == str(ref):
                continue

            S1 = speaker
            S2 = ref
            print( f"Speaker 1: {S1}, Speaker 2: {S2}")

            Fulldistance = calc_distance(df_subset_orig, S1, S2, hubert_feature_columns)
            print(f"Full Dim. Distance: {Fulldistance}")

            TSNEdistance = calc_distance(df_subset_orig, S1, S2, cols)
            print(f"TSNE Dim. Distance: {TSNEdistance}")

            distances.append({
                'speaker1': speaker,
                'speaker2': ref,
                'TextId1': "01",
                'TextId2': "01",
                'Full_distance': Fulldistance,
                "TSNE_distance": TSNEdistance
            })

    # One CSV per layer DataFrame.
    distances_df = pd.DataFrame(distances)
    distances_df.to_csv(f'csv/W2V2_Distances_01_{i}.csv', index=False)
    print(f"Distances for layer {i} saved to W2V2_Distances_01_{i}.csv.")

In [None]:
# Display the distance table of the last layer processed by the loop above.
distances_df

In [None]:
def load_and_clean_csv(file_path):
    """Read a CSV file and drop every column whose name starts with 'Unnamed'.

    Such columns typically appear when a DataFrame index was written to the
    file without a header.
    """
    raw = pd.read_csv(file_path)
    is_unnamed = raw.columns.str.contains('^Unnamed')
    return raw.loc[:, ~is_unnamed]

# Load the APTCT distances; the correlation cell further down reads its 'Distance' column.
file_path = "APTCT_distance_01.csv"
APTCT_Distance_df = load_and_clean_csv(file_path)
APTCT_Distance_df

In [None]:
def calculate_average_distance_by_speaker(df, speaker_column):
    """Average every numeric column of ``df`` per value of ``speaker_column``.

    Non-numeric columns are left out of the result. Without
    ``numeric_only=True`` recent pandas versions raise a ``TypeError`` as soon
    as the frame contains a text column.

    Returns a DataFrame indexed by the distinct values of ``speaker_column``.
    """
    return df.groupby(speaker_column).mean(numeric_only=True)


In [None]:
from scipy.stats import pearsonr  # Pearson correlation coefficient and p-value


def calculate_pearson_correlation(df1,df2, col1, col2):
    """Pearson correlation between ``df1[col1]`` and ``df2[col2]``.

    The two columns may come from different DataFrames. Only the correlation
    coefficient is returned; the p-value is discarded.
    """
    result = pearsonr(df1[col1], df2[col2])
    return result[0]


In [None]:
from scipy.stats import pearsonr

# Number of per-layer distance files expected under csv/.
n_layers = 25

correlations = []
pvalues = []
for i in range(n_layers):
    W2V2_Distance_df = load_and_clean_csv(f'csv/W2V2_Distances_01_{i}.csv')
    print(f"Loaded W2V2_Distances_01_{i}.csv")
    W2V2_avg_distance_by_speaker = calculate_average_distance_by_speaker(W2V2_Distance_df, 'speaker1')
    print("Average distance by speaker1 for W2V2_Distance_df:")
    print(W2V2_avg_distance_by_speaker)

    # NOTE(review): the two series are paired by position, not by speaker id —
    # confirm that APTCT_Distance_df has one row per speaker in the same order.
    if 'TSNE_distance' in W2V2_avg_distance_by_speaker.columns and 'Distance' in APTCT_Distance_df.columns:
        correlation, pvalue = pearsonr(W2V2_avg_distance_by_speaker['TSNE_distance'], APTCT_Distance_df['Distance'])
        print(f"Pearson correlation between TSNE_distance and distance: {correlation}")
    else:
        # Record an explicit "no value" instead of re-using the previous
        # layer's result (or failing with NameError on the first layer).
        correlation, pvalue = np.nan, np.nan
        print("Column 'TSNE_distance' is missing from the averaged W2V2 distances "
              "and/or 'Distance' is missing from APTCT_Distance_df.")

    correlations.append(correlation)
    pvalues.append(pvalue)

# Create a DataFrame to store correlations and p-values
correlation_data = pd.DataFrame({
    'Layer': list(range(n_layers)),
    'Correlation': correlations,
    'P-value': pvalues
})

# Save the DataFrame to a CSV file for further analysis
correlation_data.to_csv('csv/Correlation_Pvalues.csv', index=False)
print("Correlation and p-values saved to csv/Correlation_Pvalues.csv")

# Display the DataFrame
correlation_data

In [None]:
# Show the raw per-layer correlation and p-value lists.
correlations, pvalues

In [None]:
import pandas as pd

# Reload correlation results from disk, replacing the correlation_data computed above.
# NOTE(review): this reads "Correlation_Pvalues_2.csv", whereas the cell above writes
# "Correlation_Pvalues.csv" — confirm which file the plots below are meant to use.
correlation_data = pd.read_csv("csv/Correlation_Pvalues_2.csv")
correlation_data

In [None]:
# Plain Python lists of the two result columns, used by the plotting cells below.
correlations = correlation_data['Correlation'].tolist()
pvalues = correlation_data['P-value'].tolist()

In [None]:
import matplotlib.pyplot as plt

# Plot the correlation and the p-value of every layer.
# The x axis follows the data actually loaded instead of assuming 25 layers.
layers = list(range(len(correlations)))
plt.figure(figsize=(10, 6))

# Plot correlations
plt.plot(layers, correlations, label='Correlations', marker='o')

# Plot p-values
plt.plot(layers, pvalues, label='P-values', marker='x')

# Add labels, title, and legend
plt.xlabel('Layer')
plt.ylabel('Value')
plt.title('Correlations and P-values Across Layers')
plt.axhline(y=0.05, color='r', linestyle='--', label='Significance Threshold (p=0.05)')
plt.legend()

# Show the plot
plt.grid(True)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

def moving_average(data, window_size):
    """Unweighted moving average of ``data`` over ``window_size`` samples.

    Only positions where the window fits entirely inside ``data`` are
    returned, so the result has ``len(data) - window_size + 1`` entries.
    """
    kernel = np.full(window_size, 1.0 / window_size)
    return np.convolve(data, kernel, mode='valid')

# Smooth the correlations and p-values using a moving average
window_size = 3  # Adjust the window size for more or less smoothing
smoothed_correlations = moving_average(correlations, window_size)
smoothed_pvalues = moving_average(pvalues, window_size)

# Each smoothed value averages `window_size` consecutive layers, so it is
# plotted at the centre of its window rather than at the window's first layer
# (which would shift the whole curve towards lower layers).
center_offset = (window_size - 1) / 2
smoothed_layers = [k + center_offset for k in range(len(smoothed_correlations))]

# Generate a chart for smoothed correlations and p-values
plt.figure(figsize=(10, 6))

# Plot smoothed correlations
plt.plot(smoothed_layers, smoothed_correlations, label='Smoothed Correlations', marker='o')

# Plot smoothed p-values
plt.plot(smoothed_layers, smoothed_pvalues, label='Smoothed P-values', marker='x')

# Add labels, title, and legend
plt.xlabel('Layer')
plt.ylabel('Value')
plt.title('Smoothed Correlations and P-values Across Layers')
plt.axhline(y=0.05, color='r', linestyle='--', label='Significance Threshold (p=0.05)')
plt.legend()

# Show the plot
plt.grid(True)
plt.show()


In [None]:
# Find the best correlation and its corresponding layer.
# nanargmax ignores layers for which no correlation could be computed
# (plain max() gives order-dependent results when NaN is present).
best_layer = int(np.nanargmax(correlations))
max_correlation = correlations[best_layer]

print(f"Best correlation: {max_correlation} at layer {best_layer}")

# Highlight the best correlation on the chart
plt.figure(figsize=(10, 6))
plt.plot(layers, correlations, label='Correlations', marker='o')
plt.plot(layers, pvalues, label='P-values', marker='x')
# The line is drawn at the value its label states (it was previously at 0.1).
plt.axhline(y=0.05, color='r', linestyle='--', label='Significance Threshold (p=0.05)')
plt.scatter(best_layer, max_correlation, color='green', label=f'Best Correlation ({max_correlation:.2f})', zorder=5)

# Add labels, title, and legend
plt.xlabel('Layer')
plt.ylabel('Value')
plt.title('Correlations and P-values Across Layers')
plt.legend()
plt.grid(True)
plt.show()
