In [2]:
import pandas as pd
import numpy as np
import warnings 
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings("ignore")

In [3]:
import pandas as pd

audio_df = pd.read_csv("wav2vec2_embeddings.csv")
clinical_df = pd.read_csv("CODA_TB_clinical_Meta_Info.csv")

audio_df['base_id'] = audio_df['filename'].str.extract(r"(.*?)(?:_modclean)")[0]

print(audio_df[['filename', 'base_id']].head())

#Proceed with mapping base_id to participant ===
unique_audio_ids = audio_df['base_id'].unique()
unique_participants = clinical_df['participant'].values

# Safety check
print(f"Audio IDs: {len(unique_audio_ids)}, Clinical Participants: {len(unique_participants)}")

# If counts match, continue
if len(unique_audio_ids) == len(unique_participants):
    id_map = dict(zip(unique_audio_ids, unique_participants))
    audio_df['participant'] = audio_df['base_id'].map(id_map)

    # Merge
    merged_df = pd.merge(audio_df, clinical_df, on='participant', how='inner')
    merged_df.drop(['filename', 'base_id'], axis=1, inplace=True)
    merged_df.to_csv("merged_wav2vec2_clinical.csv", index=False)
    print("Merged dataset saved to 'merged_wav2vec2_clinical.csv'")
else:
    print(" Still mismatch. Check if ordering of audio files and participant rows are aligned.")


                                        filename                    base_id
0  1637846163058-recording-1_modclean_A_seg0.wav  1637846163058-recording-1
1  1637846163058-recording-1_modclean_B_seg0.wav  1637846163058-recording-1
2  1637846854303-recording-1_modclean_B_seg0.wav  1637846854303-recording-1
3  1637846855223-recording-1_modclean_A_seg0.wav  1637846855223-recording-1
4  1637846855223-recording-1_modclean_B_seg0.wav  1637846855223-recording-1
Audio IDs: 3367, Clinical Participants: 1105
 Still mismatch. Check if ordering of audio files and participant rows are aligned.


In [4]:
# Load all datasets
audio_df = pd.read_csv("wav2vec2_embeddings.csv")
clinical_df = pd.read_csv("CODA_TB_clinical_Meta_Info.csv")
map_df = pd.read_csv("cnn_ready_metadata.csv")  # This file must contain base_filename and participant

# Extract base_id from filename
audio_df['base_id'] = audio_df['filename'].str.extract(r"(.*?)(?:_modclean)")[0]

# Merge audio_df with map_df to get participant IDs
audio_with_participant = pd.merge(audio_df, map_df[['base_filename', 'participant']], 
                                  left_on='base_id', right_on='base_filename', how='left')

# Drop unnecessary columns
audio_with_participant.drop(['filename', 'base_id', 'base_filename'], axis=1, inplace=True)

merged_df = pd.merge(audio_with_participant, clinical_df, on='participant', how='inner')


merged_df.to_csv("merged_wav2vec2_clinical.csv", index=False)
print(" Final merged dataset saved: merged_wav2vec2_clinical.csv")


 Final merged dataset saved: merged_wav2vec2_clinical.csv


In [5]:
df3 = pd.read_csv("merged_wav2vec2_clinical.csv")

In [6]:
df3.head()

Unnamed: 0,feat_0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,...,tb_prior_Extrapul,tb_prior_Unknown,hemoptysis,heart_rate,temperature,weight_loss,smoke_lweek,fever,night_sweats,tb_status
0,-0.026476,0.066998,-0.02633,-0.012994,-0.14013,-0.121104,0.082295,-0.000749,-0.034801,-0.41785,...,No,No,No,68,37.5,Yes,No,Yes,Yes,1
1,-0.026476,0.066998,-0.02633,-0.012994,-0.14013,-0.121104,0.082295,-0.000749,-0.034801,-0.41785,...,No,No,No,68,37.5,Yes,No,Yes,Yes,1
2,-0.115683,0.074489,0.014545,-0.00577,-0.184433,-0.109778,0.070073,-0.004235,-0.051047,-0.439018,...,No,No,No,68,37.5,Yes,No,Yes,Yes,1
3,-0.115683,0.074489,0.014545,-0.00577,-0.184433,-0.109778,0.070073,-0.004235,-0.051047,-0.439018,...,No,No,No,68,37.5,Yes,No,Yes,Yes,1
4,-0.036963,0.027261,-0.097449,-0.025534,-0.055059,-0.135565,0.002773,-0.035429,-0.03201,-0.351857,...,No,No,No,76,37.7,Yes,Yes,Yes,Yes,1


In [7]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [8]:
df3.shape

(8833, 786)

In [9]:
df3.to_csv('merged_wav2vec2_clinical.csv', index = False)

In [None]:
df3.isnull().sum()