In [2]:
import pandas as pd
from tqdm import tqdm
import shutil
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [2]:
train_file = "../train.csv"

whole_df = pd.read_csv(train_file)
whole_df.head(3)

Unnamed: 0,id,path,label
0,RUNQPNJF,./train/RUNQPNJF.ogg,real
1,JFAWUOGJ,./train/JFAWUOGJ.ogg,fake
2,RDKEKEVX,./train/RDKEKEVX.ogg,real


# 음성 길이 구하기

In [4]:
# for index, row in tqdm(df.iterrows()):
#     path = row['path']
        
#     audio = AudioSegment.from_file(path)
#     # 길이 계산 (밀리초 단위)
#     length_in_milliseconds = len(audio)

#     whole_df.at[index, 'len'] = length_in_milliseconds

In [5]:
# 저장
# whole_df.to_csv("train_with_length.csv", index=False)

# 길이가 1000ms 미만인 파일들은 too_short으로 이동

In [65]:
whole_df = pd.read_csv("train_ver1.csv")
count_len_5000_or_less = whole_df[whole_df['len'] < 1000]
count_len_5000_or_less.shape

(4162, 4)

In [66]:
# 대상 디렉토리 경로
destination_dir = './too_short'

for index, row in tqdm(whole_df.iterrows()):
    if row['len'] >= 1000: continue
    
    path = row['path']
    new_path = path.replace("train", "too_short")
    whole_df.at[index, 'path'] = new_path
    shutil.move(path, new_path)

55438it [00:03, 18111.77it/s]


In [67]:
# 저장
# whole_df.to_csv("train_ver2.csv", index=False)

# train data, validation data 분할

In [68]:
file_path = 'train_ver2.csv'
whole_df = pd.read_csv(file_path)
print(f"before: {whole_df.shape}")
filtered_df = whole_df[~whole_df['path'].str.contains('too_short')]
print(f"after: {filtered_df.shape}")

before: (55438, 4)
after: (51276, 4)


In [70]:
train_df, test_df = train_test_split(filtered_df, test_size=0.2, random_state=42, stratify=filtered_df['label'])

# 결과 출력
print("학습용 데이터 개수:", len(train_df))
print("테스트용 데이터 개수:", len(test_df))

# 레이블 비율 확인
print("\n학습용 데이터의 레이블 비율:")
print(train_df['label'].value_counts(normalize=True))

print("\n테스트용 데이터의 레이블 비율:")
print(test_df['label'].value_counts(normalize=True))

학습용 데이터 개수: 41020
테스트용 데이터 개수: 10256

학습용 데이터의 레이블 비율:
real    0.535446
fake    0.464554
Name: label, dtype: float64

테스트용 데이터의 레이블 비율:
real    0.535394
fake    0.464606
Name: label, dtype: float64


# train 폴더에서 validate 폴더로 이동

In [71]:
# 대상 디렉토리 경로
destination_dir = './validate'

for index, row in tqdm(test_df.iterrows()):
    path = row['path']
    new_path = path.replace("train", "validate")
    whole_df.loc[whole_df['id'] == row['id'], 'path'] = new_path
    shutil.move(path, new_path)

0it [00:00, ?it/s]

10256it [00:59, 171.30it/s]


In [72]:
# 저장
# whole_df.to_csv("train_ver3.csv", index=False)

# label 2개로 분리

In [4]:
file_path = 'train_ver3.csv'
whole_df = pd.read_csv(file_path)
whole_df.head(3)

Unnamed: 0,id,path,label,len
0,RUNQPNJF,./train/RUNQPNJF.ogg,real,2393.0
1,JFAWUOGJ,./train/JFAWUOGJ.ogg,fake,1264.0
2,RDKEKEVX,./train/RDKEKEVX.ogg,real,6144.0


In [6]:
# 'label' 칼럼의 이름을 'label1'로 변경
whole_df.rename(columns={'label': 'label1'}, inplace=True)
whole_df['label2'] = whole_df['label1']

# 새로운 칼럼 순서 지정
new_column_order = ['id', 'path', 'label1', 'label2', 'len']

# 칼럼 순서를 변경한 DataFrame 생성
whole_df = whole_df[new_column_order]

whole_df.head(3)

Unnamed: 0,id,path,label1,label2,len
0,RUNQPNJF,./train/RUNQPNJF.ogg,real,real,2393.0
1,JFAWUOGJ,./train/JFAWUOGJ.ogg,fake,fake,1264.0
2,RDKEKEVX,./train/RDKEKEVX.ogg,real,real,6144.0


In [8]:
# 저장
# whole_df.to_csv("train_ver4.csv", index=False)

# two voice랑 합치기

In [6]:
file_path1 = 'train_ver4.csv'
whole_df1 = pd.read_csv(file_path1)
print(whole_df1.columns, f"whole_df: {whole_df1.shape}")

file_path2 = 'train_two_voice.csv'
whole_df2 = pd.read_csv(file_path2)
print(whole_df2.columns, f"whole_df: {whole_df2.shape}")

file_path3 = 'validate_two_voice.csv'
whole_df3 = pd.read_csv(file_path3)
print(whole_df3.columns, f"whole_df: {whole_df3.shape}")

Index(['id', 'path', 'label1', 'label2', 'len'], dtype='object') whole_df: (55438, 5)
Index(['id', 'path', 'label1', 'label2', 'len'], dtype='object') whole_df: (10000, 5)
Index(['id', 'path', 'label1', 'label2', 'len'], dtype='object') whole_df: (10000, 5)


In [7]:
whole_df = pd.concat([whole_df1, whole_df2, whole_df3], ignore_index=True)
print(f"whole_df: {whole_df.shape}")

whole_df: (75438, 5)


In [8]:
# 저장
#whole_df.to_csv("train_ver5.csv", index=False)

# Noise에 label 추가

In [14]:
file_path = 'noiseMix.csv'
whole_df = pd.read_csv(file_path)
whole_df['label1'] = 'fake'
whole_df['label2'] = 'fake'

# 새로운 칼럼 순서 지정
new_column_order = ['id', 'path', 'label1', 'label2', 'len']

# 칼럼 순서를 변경한 DataFrame 생성
whole_df = whole_df[new_column_order]
whole_df.head(3)

Unnamed: 0,id,path,label1,label2,len
0,ABJGMLHQ_MVFYKRHJ.ogg,./noiseMix/ABJGMLHQ_MVFYKRHJ.ogg,fake,fake,5000
1,ABKEEJML_HAJICFQI.ogg,./noiseMix/ABKEEJML_HAJICFQI.ogg,fake,fake,5000
2,ABKEEJML_HFRWWACQ.ogg,./noiseMix/ABKEEJML_HFRWWACQ.ogg,fake,fake,5000


In [19]:
# 저장
# whole_df.to_csv("noiseMix_labeled.csv", index=False)

# Noise validation train 나누기

In [34]:
file_path = 'noiseMix_labeled.csv'
whole_df = pd.read_csv(file_path)

# 데이터프레임 섞기 (랜덤하게 섞지 않으면 인덱스 기준으로 나뉘기 때문에 데이터가 치우칠 수 있음)
df_shuffled = whole_df.sample(frac=1, random_state=42).reset_index(drop=True)

# 4:1 비율로 나누기 (80% 훈련, 20% 테스트)
train_size = int(len(whole_df) * 0.8)

train_df = df_shuffled.iloc[:train_size]
test_df = df_shuffled.iloc[train_size:]

In [35]:
destination_dir = './noiseMix_train'

for index, row in tqdm(train_df.iterrows()):
    path = row['path']
    new_path = path.replace("noiseMix", "noiseMix_train")
    whole_df.loc[whole_df['id'] == row['id'], 'path'] = new_path
    shutil.move(path, new_path)

1186it [00:03, 370.75it/s]


In [36]:
destination_dir = './noiseMix_validate'

for index, row in tqdm(test_df.iterrows()):
    path = row['path']
    new_path = path.replace("noiseMix", "noiseMix_validate")
    whole_df.loc[whole_df['id'] == row['id'], 'path'] = new_path
    shutil.move(path, new_path)

297it [00:00, 450.68it/s]


In [37]:
whole_df.head(3)

Unnamed: 0,id,path,label1,label2,len
0,ABJGMLHQ_MVFYKRHJ.ogg,./noiseMix_train/ABJGMLHQ_MVFYKRHJ.ogg,fake,fake,5000
1,ABKEEJML_HAJICFQI.ogg,./noiseMix_validate/ABKEEJML_HAJICFQI.ogg,fake,fake,5000
2,ABKEEJML_HFRWWACQ.ogg,./noiseMix_train/ABKEEJML_HFRWWACQ.ogg,fake,fake,5000


In [38]:
# 저장
# whole_df.to_csv("noiseMix_ver1.csv", index=False)

# Noise랑 합치기

In [41]:
file_path1 = 'train_ver5.csv'
whole_df1 = pd.read_csv(file_path1)
print(whole_df1.columns, f"whole_df: {whole_df1.shape}")

file_path2 = 'noiseMix_ver1.csv'
whole_df2 = pd.read_csv(file_path2)
print(whole_df2.columns, f"whole_df: {whole_df2.shape}")
whole_df2.head(3)

Index(['id', 'path', 'label1', 'label2', 'len'], dtype='object') whole_df: (75438, 5)
Index(['id', 'path', 'label1', 'label2', 'len'], dtype='object') whole_df: (1483, 5)


Unnamed: 0,id,path,label1,label2,len
0,ABJGMLHQ_MVFYKRHJ.ogg,./noiseMix_train/ABJGMLHQ_MVFYKRHJ.ogg,fake,fake,5000
1,ABKEEJML_HAJICFQI.ogg,./noiseMix_validate/ABKEEJML_HAJICFQI.ogg,fake,fake,5000
2,ABKEEJML_HFRWWACQ.ogg,./noiseMix_train/ABKEEJML_HFRWWACQ.ogg,fake,fake,5000


In [42]:
whole_df = pd.concat([whole_df1, whole_df2], ignore_index=True)
print(f"whole_df: {whole_df.shape}")

whole_df: (76921, 5)


In [43]:
# 저장
# whole_df.to_csv("train_ver6.csv", index=False)

# train data 나누기

In [86]:
file_path = 'train_ver6.csv'
whole_df = pd.read_csv(file_path)

# 'path'에 'too_short'가 포함된 행 제외
whole_df = whole_df[~whole_df['path'].str.contains('too_short')].reset_index(drop=True)
print(f"number: {whole_df.shape[0]}")

train_df = whole_df[whole_df['path'].str.contains('train')].reset_index(drop=True)
validate_df = whole_df[whole_df['path'].str.contains('validate')].reset_index(drop=True)

# StratifiedKFold 객체 생성
num_splits = 7
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

# 'label1'과 'label2'의 조합을 기준으로 새로운 컬럼 생성
train_df['combined_label'] = train_df['label1'].astype(str) + '_' + train_df['label2'].astype(str)
validate_df['combined_label'] = validate_df['label1'].astype(str) + '_' + validate_df['label2'].astype(str)

# train_df를 StratifiedKFold로 나누기
train_splits = []
for _, test_index in skf.split(train_df, train_df['combined_label']):
    fold_df = train_df.iloc[test_index].reset_index(drop=True)
    train_splits.append(fold_df)

# validate_df를 StratifiedKFold로 나누기
validate_splits = []
for _, test_index in skf.split(validate_df, validate_df['combined_label']):
    fold_df = validate_df.iloc[test_index].reset_index(drop=True)
    validate_splits.append(fold_df)

# 각 fold를 합치기
for idx, (train_fold, validate_fold) in enumerate(zip(train_splits, validate_splits), start=1):
    combined_df = pd.concat([train_fold, validate_fold], ignore_index=True)
    combined_df.to_csv(f"train_final{idx}.csv", index=False)

number: 72759


# toy sample 추출

In [22]:
whole_df = pd.read_csv("train_final.csv")

sample_train_df = whole_df[whole_df['path'].str.contains('train')].sample(1000, random_state=42)
sample_validate_df = whole_df[whole_df['path'].str.contains('validate')].sample(n=200, random_state=42)
print(sample_train_df.head(3))
print(sample_validate_df.head(3))
sample_whole_df = pd.concat([sample_train_df, sample_validate_df], ignore_index=True)

                      id                                     path label1  \
62360  XRWNTJVY_QTVBBRRG  ./train_two_voice/XRWNTJVY_QTVBBRRG.ogg   real   
48923           SFXBXAWC                     ./train/SFXBXAWC.ogg   fake   
15388           EZKOLEHD                     ./train/EZKOLEHD.ogg   real   

      label2     len  
62360   real  6712.0  
48923   fake  1080.0  
15388   real  5270.0  
                      id                                        path label1  \
68948  ROKAMAMN_LVRQEVVN  ./validate_two_voice/ROKAMAMN_LVRQEVVN.ogg   fake   
70061  SZESAGEQ_VSGLVWFP  ./validate_two_voice/SZESAGEQ_VSGLVWFP.ogg   real   
70427  UVLUNEQH_FSZDFZCL  ./validate_two_voice/UVLUNEQH_FSZDFZCL.ogg   real   

      label2     len  
68948   fake  4850.0  
70061   real  9323.0  
70427   fake  2949.0  


In [23]:
for index, row in tqdm(sample_train_df.iterrows()):
    path = row['path']
    new_path = path.replace("train", "toy_sample_train")
    sample_whole_df.loc[sample_whole_df['id'] == row['id'], 'path'] = new_path
    # shutil.copy(path, new_path)

for index, row in tqdm(sample_validate_df.iterrows()):
    path = row['path']
    new_path = path.replace("validate", "toy_sample_validate")
    sample_whole_df.loc[sample_whole_df['id'] == row['id'], 'path'] = new_path
    # shutil.copy(path, new_path)

1000it [00:00, 2316.55it/s]
200it [00:00, 2712.89it/s]


In [24]:
# 저장
# sample_whole_df.to_csv("toy_sample.csv", index=False)

# toy sample의 label을 label1, label2로 분리

In [65]:
sample_df = pd.read_csv("train_sample.csv")
sample_df.head(3)

FileNotFoundError: [Errno 2] No such file or directory: 'train_sample.csv'

In [None]:
# 'label' 칼럼의 이름을 'label1'로 변경
sample_df.rename(columns={'label': 'label1'}, inplace=True)
sample_df['label2'] = sample_df['label1']

# 새로운 칼럼 순서 지정
new_column_order = ['id', 'path', 'label1', 'label2', 'len']

# 칼럼 순서를 변경한 DataFrame 생성
sample_df = sample_df[new_column_order]

sample_df.head(3)

Unnamed: 0,id,path,label1,label2,len
0,PTAMUHTJ,./train/PTAMUHTJ.ogg,real,real,6481.0
1,JBCXCNQF,./train/JBCXCNQF.ogg,fake,fake,1406.0
2,DYTNYEAK,./train/DYTNYEAK.ogg,real,real,8414.0


In [None]:
# 저장
# sample_df.to_csv("train_sample.csv", index=False)