In [1]:
import os
import pandas as pd
import glob

### Bonafide 메타데이터 합치기

In [None]:
# Load the per-corpus bonafide metadata tables (VCTK, then LibriSpeech).
# NOTE(review): the merge cell that follows also concatenates `commonvoice_df`,
# which is not loaded here — confirm where that frame is supposed to come from.
vctk_df, librispeech_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/woongjae/noise-tracing/new_dataset/meta_new_VCTK.csv',
        '/home/woongjae/noise-tracing/new_dataset/meta_new_LibriSpeech.csv',
    )
)

In [10]:
# Stack the per-corpus bonafide tables into one frame with a fresh 0..n-1 index.
# NOTE(review): this needs vctk_df, librispeech_df AND commonvoice_df to already
# exist in the kernel; if commonvoice_df has not been loaded, this line raises
# NameError — verify it is created before this cell runs.
bonafide_frames = [vctk_df, librispeech_df, commonvoice_df]
merged_df = pd.concat(bonafide_frames, ignore_index=True)

# Persist the combined table without the index column. The path is relative,
# so the file lands in the kernel's current working directory.
merged_df.to_csv(path_or_buf='meta_bonafide.csv', index=False)

### spoof + bonafide

In [None]:
# Read the spoof (DSDCorpus) table and the merged bonafide table, in that order.
spoof_df, bonafide_df = map(
    pd.read_csv,
    (
        '/home/woongjae/noise-tracing/new_dataset/meta_new_DSDCorpus.csv',
        '/home/woongjae/noise-tracing/new_dataset/meta_bonafide.csv',
    ),
)

# Spoof rows first, bonafide rows second, re-indexed from 0; written (without
# the index column) to the current working directory.
merged_df = pd.concat(objs=(spoof_df, bonafide_df), ignore_index=True)
merged_df.to_csv(path_or_buf='meta_bonafide_spoof.csv', index=False)

### 전체 메타 파일 정보

In [3]:
# Load the combined bonafide + spoof metadata table.
meta_df = pd.read_csv('/home/woongjae/noise-tracing/new_dataset/meta_bonafide_spoof.csv')

# 1. Total number of rows (one row per sample).
total_samples = meta_df.shape[0]
print(f'전체 샘플 수: {total_samples}')

# 2. Row count per value of the 'Label' column (bonafide vs spoof).
label_counts = meta_df.loc[:, 'Label'].value_counts()
print('Label 분포:', label_counts, sep='\n')

전체 샘플 수: 52700
Label 분포:
bonafide    37500
spoof       15200
Name: Label, dtype: int64


### 노이즈 레이블 열 추가하기

In [4]:
# Build the "clean" metadata table in one chain:
#   * 'Label' (bonafide/spoof) is renamed to 'Label2';
#   * a new 'Label1' column is added and set to 'clean' for every row.
meta_df = (
    pd.read_csv('/home/woongjae/noise-tracing/new_dataset/meta_bonafide_spoof.csv')
    .rename(columns={'Label': 'Label2'})
    .assign(Label1='clean')
)

# Write the result (no index column) to the current working directory.
meta_df.to_csv(path_or_buf='meta_clean.csv', index=False)

### 전체 노이즈 메타 파일 합치기

In [2]:
import pandas as pd

# Metadata of the unprocessed ("clean") samples.
clean_path = "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_clean.csv"
df_clean = pd.read_csv(clean_path)

# One metadata CSV per processing condition, listed explicitly so the merge
# order is fixed.
noise_paths = [
    "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_background_noise.csv",
    "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_background_music.csv",
    "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_overlapping_speech.csv",
    "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_white_noise.csv",
    "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_pink_noise.csv",
    "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_pitch_shift.csv",
    "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_time_stretch.csv",
    "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_auto_tune.csv",
    "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_reverberation.csv",
]

# Clean table first, then each condition table in list order; the result is
# re-indexed from 0.
dfs = [df_clean, *map(pd.read_csv, noise_paths)]
df_total = pd.concat(dfs, ignore_index=True)

# Save the merged table without the index column and report where it went.
output_path = "/home/woongjae/noise-tracing/new_dataset/meta_all.csv"
df_total.to_csv(path_or_buf=output_path, index=False)

print(f"✅ 최종 통합 메타데이터 저장 완료: {output_path}")


✅ 최종 통합 메타데이터 저장 완료: /home/woongjae/noise-tracing/new_dataset/meta_all.csv


### Train/Dev/Eval 셋 나누기

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the metadata table that is going to be split.
# NOTE(review): this cell reads a 'Subset' column (used as the source-corpus
# name) and a lower-case 'group' column, whereas the frame displayed elsewhere
# in this notebook has a 'Group' column — confirm meta_total.csv really uses
# this schema before running.
meta_path = "/home/woongjae/noise-tracing/new_dataset/meta_total.csv"
df = pd.read_csv(meta_path)

# New column holding the split name; None means "not assigned by any rule".
df['Split'] = None

# --- 1. Bonafide rows ---
is_bonafide = df['Label2'] == 'bonafide'

# 1-1. Every CommonVoice bonafide row goes to eval.
df.loc[is_bonafide & (df['Subset'] == 'CommonVoice'), 'Split'] = 'eval'

# 1-2. VCTK and LibriSpeech bonafide rows: seeded, shuffled 80/20 train/dev split.
real_train_dev = df[is_bonafide & df['Subset'].isin(['VCTK', 'LibriSpeech'])]
train_idx, dev_idx = train_test_split(
    real_train_dev.index,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
for split_name, split_idx in (('train', train_idx), ('dev', dev_idx)):
    df.loc[split_idx, 'Split'] = split_name

# --- 2. Spoof rows, handled one 'group' value at a time ---
spoof_df = df[df['Label2'] == 'spoof']
for group_name in spoof_df['group'].unique():
    group_data = spoof_df[spoof_df['group'] == group_name]

    # A seeded random half of the group goes to eval.
    group_eval_idx = group_data.sample(frac=0.5, random_state=42).index
    df.loc[group_eval_idx, 'Split'] = 'eval'

    # The other half is split 80/20 into train/dev with the same seed.
    remaining = group_data.drop(index=group_eval_idx)
    group_train_idx, group_dev_idx = train_test_split(
        remaining.index,
        test_size=0.2,
        random_state=42,
    )
    for split_name, split_idx in (('train', group_train_idx), ('dev', group_dev_idx)):
        df.loc[split_idx, 'Split'] = split_name

# Sanity check: number of rows per split (rows still None are not counted).
print(df['Split'].value_counts())

# Save the table with the new 'Split' column, without the index column.
output_path = "/home/woongjae/noise-tracing/new_dataset/meta_total_with_split.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"\n✅ 데이터셋 분할 완료: {output_path}")


### 안 만들어진 파일 찾기

In [1]:
import pandas as pd

# Load the clean metadata and the reverberation metadata.
clean_meta = pd.read_csv("/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_clean.csv")
reverb_meta = pd.read_csv("/home/woongjae/noise-tracing/new_dataset/meta_reverberation.csv")

# Bring both utterance-id sets to the same form: every literal occurrence of
# "_reverberation" is removed from the reverberation ids (plain substring
# replacement, not a regex).
clean_utts = set(clean_meta['utt'])
reverb_base_ids = reverb_meta['utt'].str.replace("_reverberation", "", regex=False)
reverb_utts = set(reverb_base_ids)

# Clean utterances that have no counterpart in the reverberation metadata.
missing_utts = clean_utts.difference(reverb_utts)

# Show how many clean rows are affected and preview the first few of them.
missing_df = clean_meta.loc[clean_meta['utt'].isin(missing_utts)]
print(f"❗ 누락된 샘플 수: {len(missing_df)}개")
missing_df.head()


❗ 누락된 샘플 수: 2개


Unnamed: 0,File_path,utt,speaker ID,gender,Group,Label2,Label1
13587,/home/woongjae/noise-tracing/new_dataset/Datas...,TTS_VCTK_22_VITS_38,VCTK_22,Female,VITS-TTS,spoof,clean
47306,/home/woongjae/noise-tracing/new_dataset/Datas...,common_voice_en_20182997,ccf426b54d5a1d7c96254d4bbeb6c0da8d2923224687fe...,Male,CommonVoice,bonafide,clean


##### 전체 메타 파일 합치기

In [4]:
# Merge every per-condition metadata CSV found in meta_dir into meta_all.csv.
meta_dir = "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile"
output_path = os.path.join(meta_dir, "meta_all.csv")

# Files that this notebook itself writes into meta_dir. They all end in ".csv"
# and contain "meta", so without this exclusion a re-run would merge the
# previous outputs back into the result and duplicate rows.
derived_files = {
    "meta_all.csv",
    "meta_subset.csv",
    "meta_timit_all.csv",
    "Sub_meta_merged.csv",
}

# Select the source metadata files. os.listdir returns names in arbitrary
# order, so sort them: the row order of meta_all.csv (and therefore the seeded
# shuffles applied to it later) becomes reproducible across runs and machines.
meta_files = sorted(
    f for f in os.listdir(meta_dir)
    if f.endswith(".csv") and "meta" in f and f not in derived_files
)

# Concatenate in file-name order and re-index from 0.
merged_df = pd.concat(
    [pd.read_csv(os.path.join(meta_dir, f)) for f in meta_files],
    ignore_index=True,
)

# Save without the index column.
merged_df.to_csv(output_path, index=False)
print(f"✅ 메타파일 병합 완료: {output_path}")


✅ 메타파일 병합 완료: /home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_all.csv


### 데이터셋 나누기

In [5]:
# Load the merged metadata that will be divided into train/dev/eval.
meta_path = "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_all.csv"
df = pd.read_csv(meta_path)

# Every row starts as "undefined"; a row keeps that value unless one of the
# rules below assigns it to a subset.
df["Subset"] = "undefined"

# NOTE(review): rows are assigned independently of each other. If meta_all.csv
# holds several processed variants of the same utterance (it is built by
# concatenating one CSV per condition), those variants can end up in different
# subsets — confirm whether that is intended.

# --- Bonafide rows ---
bonafide_df = df[df["Label2"] == "bonafide"]

# LibriSpeech + VCTK: seeded shuffle, first 75% -> train, remaining 25% -> dev.
bv_df = bonafide_df[bonafide_df["Group"].isin(["LibriSpeech", "VCTK"])]
bv_shuffled = bv_df.sample(frac=1, random_state=42)
n_train = int(len(bv_shuffled) * 0.75)
train_idx = bv_shuffled.iloc[:n_train].index
dev_idx = bv_shuffled.iloc[n_train:].index
df.loc[train_idx, "Subset"] = "train"
df.loc[dev_idx, "Subset"] = "dev"

# CommonVoice bonafide rows are used for evaluation only.
df.loc[(df["Label2"] == "bonafide") & (df["Group"] == "CommonVoice"), "Subset"] = "eval"

# --- Spoof rows: each Group is shuffled (seeded) and cut 50% / 25% / rest ---
spoof_df = df[df["Label2"] == "spoof"]
for group in spoof_df["Group"].unique():
    gdf = spoof_df[spoof_df["Group"] == group].sample(frac=1, random_state=42)
    n = len(gdf)
    n_train = int(n * 0.5)
    n_dev = int(n * 0.25)
    train_idx = gdf.iloc[:n_train].index
    dev_idx = gdf.iloc[n_train:n_train + n_dev].index
    # Everything after train and dev goes to eval, so rounding leftovers land there.
    eval_idx = gdf.iloc[n_train + n_dev:].index
    df.loc[train_idx, "Subset"] = "train"
    df.loc[dev_idx, "Subset"] = "dev"
    df.loc[eval_idx, "Subset"] = "eval"

# Rows that matched none of the rules above (e.g. a bonafide Group other than
# LibriSpeech/VCTK/CommonVoice, or an unexpected Label2 value) are still
# "undefined". Report them instead of letting them slip silently into the
# saved file.
unassigned = df[df["Subset"] == "undefined"]
if not unassigned.empty:
    print(f"⚠️ Subset이 지정되지 않은 샘플 수: {len(unassigned)}개 (Group별 분포)")
    print(unassigned["Group"].value_counts(dropna=False))

# Save the table with the new 'Subset' column, without the index column.
out_path = "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_subset.csv"
df.to_csv(out_path, index=False)
print(f"✅ 저장 완료: {out_path}")


  exec(code_obj, self.user_global_ns, self.user_ns)


✅ 저장 완료: /home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_subset.csv


In [6]:
# Reload the table that carries the train/dev/eval assignment.
df = pd.read_csv("/home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_subset.csv")

# Share of each subset: fraction rounded to 4 decimals, then scaled to percent.
subset_share = df["Subset"].value_counts(normalize=True).round(4) * 100

# One row per subset: percentage column first, absolute row count second
# (the counts are aligned to the percentage frame by subset name).
subset_counts = subset_share.to_frame(name="Percentage (%)")
subset_counts["Count"] = df["Subset"].value_counts()

print(subset_counts)

       Percentage (%)   Count
train           53.56  282250
eval            26.19  138001
dev             20.26  106749


### TIMIT 저장하기

In [2]:
import os
import pandas as pd

# Collect the TIMIT rows of every source metadata file into meta_timit_all.csv.
meta_dir = "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile"
output_path = os.path.join(meta_dir, "meta_timit_all.csv")

# meta_*.csv files that this notebook itself writes into meta_dir. They are
# derived from the source files, so scanning them as well would collect the
# same TIMIT rows a second time.
derived_files = {"meta_timit_all.csv", "meta_all.csv", "meta_subset.csv"}

# Source metadata files, sorted because os.listdir order is arbitrary and the
# row order of the output should be reproducible.
meta_files = sorted(
    f for f in os.listdir(meta_dir)
    if f.startswith("meta_") and f.endswith(".csv") and f not in derived_files
)

df_all_timit = []

for fname in meta_files:
    fpath = os.path.join(meta_dir, fname)
    df = pd.read_csv(fpath)

    # Files without a 'Group' column cannot be filtered by corpus; skip them.
    if "Group" not in df.columns:
        print(f"[스킵] {fname}: Group 컬럼 없음")
        continue

    # Keep only the TIMIT rows and mark all of them as evaluation data
    # (this creates the 'Subset' column, or overwrites it if already present).
    df_timit = df[df["Group"] == "TIMIT"].copy()
    df_timit["Subset"] = "eval"
    df_all_timit.append(df_timit)

    print(f"{fname}: {len(df_timit)}개 TIMIT 샘플 수집됨")

# pd.concat rejects an empty list with a generic message; fail with a message
# that names the actual problem instead.
if not df_all_timit:
    raise ValueError(f"No metadata file with a 'Group' column was found in {meta_dir}")

# Merge all collected frames (all columns kept) and save without the index column.
df_concat = pd.concat(df_all_timit, ignore_index=True)
df_concat.to_csv(output_path, index=False)
print(f"\n✅ 모든 TIMIT 샘플 전체 컬럼 포함 병합 완료 → {output_path} (총 {len(df_concat)}개)")


meta_clean.csv: 6300개 TIMIT 샘플 수집됨
meta_background_noise.csv: 6300개 TIMIT 샘플 수집됨
meta_overlapping_speech.csv: 6300개 TIMIT 샘플 수집됨
meta_white_noise.csv: 6300개 TIMIT 샘플 수집됨
meta_pitch_shift.csv: 6300개 TIMIT 샘플 수집됨
meta_background_music.csv: 6300개 TIMIT 샘플 수집됨
meta_auto_tune.csv: 6300개 TIMIT 샘플 수집됨
meta_time_stretch.csv: 6300개 TIMIT 샘플 수집됨
meta_reverberation.csv: 6300개 TIMIT 샘플 수집됨
meta_pink_noise.csv: 6300개 TIMIT 샘플 수집됨

✅ 모든 TIMIT 샘플 전체 컬럼 포함 병합 완료 → /home/woongjae/noise-tracing/new_dataset/Dataset/metafile/meta_timit_all.csv (총 63000개)


In [3]:
import pandas as pd
import os

# Input and output locations, all inside the metafile directory.
meta_dir = "/home/woongjae/noise-tracing/new_dataset/Dataset/metafile"
sub_meta_path = os.path.join(meta_dir, "meta_subset.csv")
timit_path = os.path.join(meta_dir, "meta_timit_all.csv")
merged_path = os.path.join(meta_dir, "Sub_meta_merged.csv")

# Load the split metadata and the collected TIMIT metadata.
df_orig = pd.read_csv(sub_meta_path)
df_timit = pd.read_csv(timit_path)

# Give the TIMIT frame exactly the columns of the split metadata, in the same
# order: columns it lacks are created and filled with "", columns that only
# exist on the TIMIT side are dropped.
# NOTE(review): if meta_subset.csv already contains TIMIT rows, appending them
# again here would duplicate them — verify against the input files.
df_timit = df_timit.reindex(columns=df_orig.columns, fill_value="")

# Append the TIMIT rows below the original rows, re-index from 0 and save
# without the index column.
df_combined = pd.concat(objs=[df_orig, df_timit], ignore_index=True)
df_combined.to_csv(path_or_buf=merged_path, index=False)

print(f"✅ Sub_meta + TIMIT 병합 완료 → {merged_path} (총 {len(df_combined)}개 샘플)")


  exec(code_obj, self.user_global_ns, self.user_ns)


✅ Sub_meta + TIMIT 병합 완료 → /home/woongjae/noise-tracing/new_dataset/Dataset/metafile/Sub_meta_merged.csv (총 490000개 샘플)
