<a href="https://colab.research.google.com/github/zhrldnpftl/DKU-Capstone-DeepPhish/blob/main/Voice-context-Detection/1_voice_file_wav_conversion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 보이스 피싱 탐지 모델 - 1

[데이터셋]
- 정상 데이터 : 복지 분야 콜센터 상담데이터 (mp3 형태)
-      > 경로  : "/content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset"
- 피싱 데이터 : 금융감독원 데이터 (mp4, mp3 형태)
-      > 경로  : "/content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data"
- 피싱 데이터 : KorCCVi 데이터 (csv 형태)
-      > 경로  : "/content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/KorCCVi_dataset"

## 데이터 전처리 목표 단계
0. 프로젝트 경로 설정 및 라이브러리 설치
1. 데이터 디렉토리 로드
2. 음성 데이터 wav 변환
3. 데이터 전처리 (16kHz 변환, mono 변환, max_length 계산)
4. 라벨링

### 0. 프로젝트 경로 설정 및 라이브러리 설치

In [None]:
# Project root path.
# Every dataset directory used below is built relative to this folder,
# which lives under the Colab Google Drive mount point (/content/drive).
from pathlib import Path

project_path = Path("/content/drive/MyDrive/2025_VoicePhshing_Detection_Model")

In [None]:
# Create the "dataset" folder inside the project root
import os

# Location of the dataset folder (os.path.join returns a plain string)
dataset_path = os.path.join(project_path, "dataset")

# Build the folder, including missing parents; an existing folder is not an error
Path(dataset_path).mkdir(parents=True, exist_ok=True)

# Echo the resolved path so the notebook shows where data will be stored
print(f"📁 데이터셋 경로: {dataset_path}")

📁 데이터셋 경로: /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset


In [None]:
# Hugging Face Transformers와 Datasets 설치 (처음 1회만 설치 필요)
# !pip install transformers datasets librosa jiwer

In [None]:
# fsspec 버전 문제로 추가 설치 (처음 1회만 필요)
# !pip install fsspec==2023.6.0

In [None]:
# 기본 라이브러리 임포트
import os
import pandas as pd
import numpy as np
import librosa                  # 오디오 전처리용
from datasets import load_dataset, Dataset, Audio
# Transformers 관련
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor

## 1. 데이터 디렉토리 로드

In [None]:
# Dataset directory layout

# Shared roots, so each dataset location is spelled out only once
_fss_dir = project_path / "dataset" / "phishing_dataset" / "FSS_voicephishing_data"
_normal_dir = project_path / "dataset" / "normal_dataset"

# Phishing mp3 folders, one per fraud category
phishing_mp3_1_dir = _fss_dir / "mp3" / "대출사기형"
phishing_mp3_2_dir = _fss_dir / "mp3" / "수사기관_사칭형"
phishing_mp3_3_dir = _fss_dir / "mp3" / "기타유형"
# Phishing mp4 folder (source videos)
phishing_mp4_dir = _fss_dir / "mp4"
# Phishing wav folder (conversion output)
phishing_wav_dir = _fss_dir / "wav"

# Normal-call mp3 folder (source audio)
normal_mp3_dir = _normal_dir / "mp3"
# Normal-call wav folder (conversion output)
normal_wav_dir = _normal_dir / "wav"

In [None]:
# Create every dataset directory up front

# All directories the later conversion cells read from or write to
all_dirs = [
    phishing_mp3_1_dir, phishing_mp3_2_dir, phishing_mp3_3_dir,
    phishing_mp4_dir,
    phishing_wav_dir,
    normal_mp3_dir,
    normal_wav_dir,
]

# parents=True builds missing ancestors; exist_ok=True keeps re-runs harmless
for directory in all_dirs:
    directory.mkdir(parents=True, exist_ok=True)

## 2. 음성 데이터 wav 변환

In [None]:
# moviepy 설치
#!pip uninstall -y moviepy
#!pip install moviepy

In [None]:
# 라이브러리 임포트
from moviepy.video.io.VideoFileClip import VideoFileClip
import os
import sys
import moviepy

[1] normal_data

In [None]:
# pydub 라이브러리 설치
#!pip install pydub

In [None]:
# Sanity check: confirm that the normal-dataset mp3 (input) and wav (output)
# folders are reachable before running the conversion cell.
from pathlib import Path

# Source (mp3) and destination (wav) folders of the normal dataset
normal_mp3_dir = Path("/content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/mp3")
normal_wav_dir = Path("/content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/wav")

# Debug output: mp3 directory path and whether it exists
print(f"MP3 디렉토리 경로: {normal_mp3_dir}")
print(f"MP3 디렉토리 존재 여부: {normal_mp3_dir.exists()}")

# Debug output: wav directory path and whether it exists
print(f"WAV 디렉토리 경로: {normal_wav_dir}")
print(f"WAV 디렉토리 존재 여부: {normal_wav_dir.exists()}")

MP3 디렉토리 경로: /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/mp3
MP3 디렉토리 존재 여부: True
WAV 디렉토리 경로: /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/wav
WAV 디렉토리 존재 여부: True


In [None]:
# Convert every normal-dataset mp3 file to wav and record one metadata row
# (filename, label, source_path) per successfully converted file.
import csv
import os
import traceback
from pathlib import Path

from pydub import AudioSegment

# Source (mp3) and destination (wav) folders of the normal dataset
normal_mp3_dir = Path("/content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/mp3")
normal_wav_dir = Path("/content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/wav")

# Debug output: directory paths and whether they exist
print(f"MP3 디렉토리 경로: {normal_mp3_dir}")
print(f"MP3 디렉토리 존재 여부: {normal_mp3_dir.exists()}")

print(f"WAV 디렉토리 경로: {normal_wav_dir}")
print(f"WAV 디렉토리 존재 여부: {normal_wav_dir.exists()}")

# Make sure the output folder exists before the metadata CSV is opened;
# otherwise open(..., "w") raises FileNotFoundError when this cell is run
# on its own. Done after the debug prints so they still report the state
# found on disk.
normal_wav_dir.mkdir(parents=True, exist_ok=True)

# Collect the mp3 files to convert (non-recursive)
mp3_files = list(normal_mp3_dir.glob("*.mp3"))
print(f"발견된 MP3 파일 수: {len(mp3_files)}")
if mp3_files:
    print(f"첫 번째 MP3 파일: {mp3_files[0]}")
    print(f"첫 번째 MP3 파일 존재 여부: {mp3_files[0].exists()}")

# Metadata CSV lives next to the generated wav files
metadata_csv_path = normal_wav_dir / "metadata.csv"

# Counters for the final summary
successful_conversions = 0
failed_conversions = 0

# newline='' is what the csv module expects for files it writes to
with open(metadata_csv_path, mode="w", newline='', encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["filename", "label", "source_path"])  # header row

    for mp3_file in mp3_files:
        try:
            # Output name is "normal_<original stem>.wav"
            clean_filename = mp3_file.stem
            wav_filename = f"normal_{clean_filename}.wav"
            wav_path = normal_wav_dir / wav_filename

            print(f"변환 시도 중: {mp3_file} -> {wav_path}")

            # The file may have disappeared since the glob (e.g. Drive sync)
            if not mp3_file.exists():
                print(f"⚠️ 파일이 존재하지 않음: {mp3_file}")
                failed_conversions += 1
                continue

            # Skip zero-byte files instead of handing them to the decoder
            if mp3_file.stat().st_size == 0:
                print(f"⚠️ 빈 파일 (0바이트): {mp3_file}")
                failed_conversions += 1
                continue

            # Decode the mp3 and write it back out as wav
            audio = AudioSegment.from_mp3(mp3_file)
            audio.export(wav_path, format="wav")

            # Only successfully converted files get a metadata row
            writer.writerow([wav_filename, "normal", str(mp3_file)])
            successful_conversions += 1
            print(f"✅ 변환 완료: {wav_filename}")

        except Exception as e:
            # Best effort: report the failure with a full traceback and
            # continue with the next file
            print(f"⚠️ 변환 실패 ({mp3_file.name}): {e}")
            traceback.print_exc()
            failed_conversions += 1

# Summary of the whole run
print(f"\n===== 변환 결과 =====")
print(f"성공: {successful_conversions} 파일")
print(f"실패: {failed_conversions} 파일")

MP3 디렉토리 경로: /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/mp3
MP3 디렉토리 존재 여부: True
WAV 디렉토리 경로: /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/wav
WAV 디렉토리 존재 여부: True
발견된 MP3 파일 수: 619
첫 번째 MP3 파일: /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/mp3/1.mp3
첫 번째 MP3 파일 존재 여부: True
변환 시도 중: /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/mp3/1.mp3 -> /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/wav/normal_1.wav
✅ 변환 완료: normal_1.wav
변환 시도 중: /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/mp3/2.mp3 -> /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/wav/normal_2.wav
✅ 변환 완료: normal_2.wav
변환 시도 중: /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dataset/mp3/3.mp3 -> /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/normal_dat

[2] phishing_data

(1) mp4 -> mp3 변환

In [None]:
# Extract the audio track of every phishing mp4 video and save it as mp3.
#
# NOTE: VideoFileClip is imported from its fully qualified module, matching
# the earlier import cell; the `moviepy.editor` shortcut was removed in
# MoviePy 2.x, so this form works on both 1.x and 2.x.
import os
from pathlib import Path

from moviepy.video.io.VideoFileClip import VideoFileClip

project_path = Path("/content/drive/MyDrive/2025_VoicePhshing_Detection_Model")

# Phishing mp3 folders, one per fraud category
phishing_mp3_1_dir = project_path / "dataset" / "phishing_dataset" / "FSS_voicephishing_data" / "mp3" / "대출사기형"
phishing_mp3_2_dir = project_path / "dataset" / "phishing_dataset" / "FSS_voicephishing_data" / "mp3" / "수사기관_사칭형"
phishing_mp3_3_dir = project_path / "dataset" / "phishing_dataset" / "FSS_voicephishing_data" / "mp3" / "기타유형"
# Phishing mp4 folder (source videos)
phishing_mp4_dir = project_path / "dataset" / "phishing_dataset" / "FSS_voicephishing_data" / "mp4"
# Phishing wav folder
phishing_wav_dir = project_path / "dataset" / "phishing_dataset" / "FSS_voicephishing_data" / "wav"

# Audio extracted from mp4 files is stored in the "기타유형" mp3 folder
mp3_output_dir = project_path / "dataset" / "phishing_dataset" / "FSS_voicephishing_data" / "mp3" / "기타유형"
os.makedirs(mp3_output_dir, exist_ok=True)

# Convert each mp4 file; a failure on one file does not stop the others
for filename in os.listdir(phishing_mp4_dir):
    if not filename.endswith(".mp4"):
        continue

    mp4_path = os.path.join(phishing_mp4_dir, filename)
    mp3_filename = os.path.splitext(filename)[0] + ".mp3"
    mp3_path = os.path.join(mp3_output_dir, mp3_filename)

    try:
        # The context manager closes the clip (and its ffmpeg reader) even
        # when writing fails, so readers do not pile up over many files.
        with VideoFileClip(mp4_path) as video:
            # A video without an audio track has audio == None; report it
            # clearly instead of failing with an AttributeError.
            if video.audio is None:
                print(f"⚠️ 변환 실패 ({filename}): 오디오 트랙 없음")
                continue
            video.audio.write_audiofile(mp3_path)
        print(f"🎵 변환 완료: {mp3_filename}")
    except Exception as e:
        # Best effort: log the failure and move on to the next file
        print(f"⚠️ 변환 실패 ({filename}): {e}")

MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_12_수정07(1).mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_12_수정07(1).mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_12_수정07.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_12_수정07.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_11_수정07.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_11_수정07.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_10_수정07.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_10_수정07.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_9_수정06.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_9_수정06.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_8_수정06.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_8_수정06.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_5_수정07.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_5_수정07.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_4_수정07.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_4_수정07.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_3_수정07.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_3_수정07.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_2_수정07.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_2_수정07.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/금감원_보이스피싱_1_수정08.mp3




MoviePy - Done.
🎵 변환 완료: 금감원_보이스피싱_1_수정08.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/15master.mp3




MoviePy - Done.
🎵 변환 완료: 15master.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/7master.mp3




MoviePy - Done.
🎵 변환 완료: 7master.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/14번수정.mp3




MoviePy - Done.
🎵 변환 완료: 14번수정.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/17번수정.mp3




MoviePy - Done.
🎵 변환 완료: 17번수정.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/16번수정.mp3




MoviePy - Done.
🎵 변환 완료: 16번수정.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/15번_3차례 신고된 남성 전화금융사기범 (음성_15).mp3




MoviePy - Done.
🎵 변환 완료: 15번_3차례 신고된 남성 전화금융사기범 (음성_15).mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/13번수정.mp3




MoviePy - Done.
🎵 변환 완료: 13번수정.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/10번_3차례 신고된 남성 전화금융사기범 (음성_10).mp3




MoviePy - Done.
🎵 변환 완료: 10번_3차례 신고된 남성 전화금융사기범 (음성_10).mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/9번_3차례 신고된 남성 전화금융사기범 (음성_9).mp3




MoviePy - Done.
🎵 변환 완료: 9번_3차례 신고된 남성 전화금융사기범 (음성_9).mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/8번_3차례 신고된 남성 전화금융사기범 (음성_8).mp3




MoviePy - Done.
🎵 변환 완료: 8번_3차례 신고된 남성 전화금융사기범 (음성_8).mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/5번수정.mp3




MoviePy - Done.
🎵 변환 완료: 5번수정.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/4차례 신고된 여성 전화금융사기범.mp3




MoviePy - Done.
🎵 변환 완료: 4차례 신고된 여성 전화금융사기범.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/2번_3차례 신고된 여성 전화금융사기범 (음성_2).mp3




MoviePy - Done.
🎵 변환 완료: 2번_3차례 신고된 여성 전화금융사기범 (음성_2).mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/6차례 신고된 남성 전화금융사기범 음성 3.mp3




MoviePy - Done.
🎵 변환 완료: 6차례 신고된 남성 전화금융사기범 음성 3.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/6차례 신고된 남성 전화금융사기범 음성 2.mp3




MoviePy - Done.
🎵 변환 완료: 6차례 신고된 남성 전화금융사기범 음성 2.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/6차례 신고된 남성 전화금융사기범 음성 1.mp3




MoviePy - Done.
🎵 변환 완료: 6차례 신고된 남성 전화금융사기범 음성 1.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/5차례 신고된 남성 전화금융사기범 음성 4.mp3




MoviePy - Done.
🎵 변환 완료: 5차례 신고된 남성 전화금융사기범 음성 4.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/5차례 신고된 남성 전화금융사기범 음성 3.mp3




MoviePy - Done.
🎵 변환 완료: 5차례 신고된 남성 전화금융사기범 음성 3.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/5차례 신고된 남성 전화금융사기범 음성 2.mp3




MoviePy - Done.
🎵 변환 완료: 5차례 신고된 남성 전화금융사기범 음성 2.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/5차례 신고된 남성 전화금융사기범 음성 1.mp3




MoviePy - Done.
🎵 변환 완료: 5차례 신고된 남성 전화금융사기범 음성 1.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/4차례 신고된 여성 전화금융사기범 음성 2.mp3




MoviePy - Done.
🎵 변환 완료: 4차례 신고된 여성 전화금융사기범 음성 2.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/4차례 신고된 남성 전화금융사기범 음성 7.mp3




MoviePy - Done.
🎵 변환 완료: 4차례 신고된 남성 전화금융사기범 음성 7.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/4차례 신고된 남성 전화금융사기범 음성 6.mp3




MoviePy - Done.
🎵 변환 완료: 4차례 신고된 남성 전화금융사기범 음성 6.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/4차례 신고된 남성 전화금융사기범 음성 5.mp3




MoviePy - Done.
🎵 변환 완료: 4차례 신고된 남성 전화금융사기범 음성 5.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/4차례 신고된 남성 전화금융사기범 음성 4.mp3




MoviePy - Done.
🎵 변환 완료: 4차례 신고된 남성 전화금융사기범 음성 4.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/4차례 신고된 남성 전화금융사기범 음성 3.mp3




MoviePy - Done.
🎵 변환 완료: 4차례 신고된 남성 전화금융사기범 음성 3.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/4차례 신고된 남성 전화금융사기범 음성 2.mp3




MoviePy - Done.
🎵 변환 완료: 4차례 신고된 남성 전화금융사기범 음성 2.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/4차례 신고된 남성 전화금융사기범 음성 1.mp3




MoviePy - Done.
🎵 변환 완료: 4차례 신고된 남성 전화금융사기범 음성 1.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(1).16차례 신고된 남성 전화금융사기범 음성.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(1).16차례 신고된 남성 전화금융사기범 음성.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(2).6차례 신고된 남성 전화금융사기범 음성.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(2).6차례 신고된 남성 전화금융사기범 음성.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(3).5차례 신고된 남성 전화금융사기범 음성 1.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(3).5차례 신고된 남성 전화금융사기범 음성 1.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(4).5차례 신고된 남성 전화금융사기범 음성 2.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(4).5차례 신고된 남성 전화금융사기범 음성 2.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(5).5차례 신고된 남성 전화금융사기범 음성 3.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(5).5차례 신고된 남성 전화금융사기범 음성 3.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(6).5차례 신고된 남성 전화금융사기범 음성 4.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(6).5차례 신고된 남성 전화금융사기범 음성 4.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(9).4차례 신고된 남성 전화금융사기범 음성 1.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(9).4차례 신고된 남성 전화금융사기범 음성 1.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(10).4차례 신고된 남성 전화금융사기범 음성 2.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(10).4차례 신고된 남성 전화금융사기범 음성 2.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(11).4차례 신고된 남성 전화금융사기범 음성 3.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(11).4차례 신고된 남성 전화금융사기범 음성 3.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(12).4차례 신고된 남성 전화금융사기범 음성 4.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(12).4차례 신고된 남성 전화금융사기범 음성 4.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(13).4차례 신고된 남성 전화금융사기범 음성 5.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(13).4차례 신고된 남성 전화금융사기범 음성 5.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(14).4차례 신고된 남성 전화금융사기범 음성 6.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(14).4차례 신고된 남성 전화금융사기범 음성 6.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/3차-(15).4차례 신고된 남성 전화금융사기범 음성 7.mp3




MoviePy - Done.
🎵 변환 완료: 3차-(15).4차례 신고된 남성 전화금융사기범 음성 7.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리_6차례 신고된 남성 전화금융사기범 음성.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리_6차례 신고된 남성 전화금융사기범 음성.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리_5차례 신고된 남성 전화금융사기범 음성_2.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리_5차례 신고된 남성 전화금융사기범 음성_2.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리_4차례 신고된 남성 전화금융사기범 음성_1.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리_4차례 신고된 남성 전화금융사기범 음성_1.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/대출빙자형 사례 7_편법을 통해 피해자 신용평점을 상향시키기 위한 대출상환 요구.mp3




MoviePy - Done.
🎵 변환 완료: 대출빙자형 사례 7_편법을 통해 피해자 신용평점을 상향시키기 위한 대출상환 요구.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/대출빙자형 사례 6_편법을 통해 피해자 신용평점을 상향시키기 위한 대출상환 요구.mp3




MoviePy - Done.
🎵 변환 완료: 대출빙자형 사례 6_편법을 통해 피해자 신용평점을 상향시키기 위한 대출상환 요구.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/대출빙자형 사례 5_사금융 대환대출 명목으로 피해자의 무통장입금 유도.mp3




MoviePy - Done.
🎵 변환 완료: 대출빙자형 사례 5_사금융 대환대출 명목으로 피해자의 무통장입금 유도.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/대출빙자형 사례 4_햇살론을 받기 위하여 피해자의 기존 대출을 일부 상환해야 한다고 유도.mp3




MoviePy - Done.
🎵 변환 완료: 대출빙자형 사례 4_햇살론을 받기 위하여 피해자의 기존 대출을 일부 상환해야 한다고 유도.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/대출빙자형 사례 3_은행을 사칭하면서 서민금융나들목 상품 명목의 대출을 안내.mp3




MoviePy - Done.
🎵 변환 완료: 대출빙자형 사례 3_은행을 사칭하면서 서민금융나들목 상품 명목의 대출을 안내.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/출입국 관리 사무국 직원 사칭(여름 휴가철 보이스피싱_1).mp3




MoviePy - Done.
🎵 변환 완료: 출입국 관리 사무국 직원 사칭(여름 휴가철 보이스피싱_1).mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리 사례2_F.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리 사례2_F.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리 사례3_F.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리 사례3_F.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리 사례4_F.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리 사례4_F.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리 사례5_F.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리 사례5_F.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리 사례6_F.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리 사례6_F.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리_사례7_F.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리_사례7_F.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리_사례8_F.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리_사례8_F.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/바로 이 목소리 사례9_F.mp3




MoviePy - Done.
🎵 변환 완료: 바로 이 목소리 사례9_F.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/대출사기 사례2_대출사기 전산삭제과정 금감원 모니터링 걸림.mp3




MoviePy - Done.
🎵 변환 완료: 대출사기 사례2_대출사기 전산삭제과정 금감원 모니터링 걸림.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/대출사기 사례1_대출사기 전산오류로 선입금 360만원 요구.mp3




MoviePy - Done.
🎵 변환 완료: 대출사기 사례1_대출사기 전산오류로 선입금 360만원 요구.mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/납치빙자 F.mp3




MoviePy - Done.
🎵 변환 완료: 납치빙자 F.mp3
⚠️ 변환 실패 (NR0079_검찰사칭 [착한 피해자에게 걸려온 검찰 사칭 보이스피싱].mp4): MoviePy error: failed to read the duration of file /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp4/NR0079_검찰사칭 [착한 피해자에게 걸려온 검찰 사칭 보이스피싱].mp4.
Here are the file infos returned by ffmpeg:

ffmpeg version 7.0.2-static https://johnvansickle.com/ffmpeg/  Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 8 (Debian 8.3.0-6)
  configuration: --enable-gpl --enable-version3 --enable-static --disable-debug --disable-ffplay --disable-indev=sndio --disable-outdev=sndio --cc=gcc --enable-fontconfig --enable-frei0r --enable-gnutls --enable-gmp --enable-libgme --enable-gray --enable-libaom --enable-libfribidi --enable-libass --enable-libvmaf --enable-libfreetype --enable-libmp3lame --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-l



MoviePy - Done.
🎵 변환 완료: UCC_[이벤트행사_통장한개당_300만원_제안].mp3
MoviePy - Writing audio in /content/drive/MyDrive/2025_VoicePhshing_Detection_Model/dataset/phishing_dataset/FSS_voicephishing_data/mp3/기타유형/UCC_[사이버경찰청_수사관_사칭_감기오셨어요_부산사나이의_화끈한_대처법].mp3




MoviePy - Done.
🎵 변환 완료: UCC_[사이버경찰청_수사관_사칭_감기오셨어요_부산사나이의_화끈한_대처법].mp3


(2) mp3 -> wav 변환

In [None]:
# pydub 라이브러리 설치
#!pip install pydub
#!apt-get install ffmpeg -y

In [1]:
# Convert the phishing mp3 files of every category to wav and write one
# metadata row (filename, label, source_path) per converted file.
from pydub import AudioSegment
import os
import csv
from pathlib import Path

project_path = Path("/content/drive/MyDrive/2025_VoicePhshing_Detection_Model")

# Output folder for the converted wav files
phishing_wav_dir = project_path / "dataset" / "phishing_dataset" / "FSS_voicephishing_data" / "wav"

# Label -> mp3 source folder (the "기타" label reads from the "기타유형" folder)
mp3_dirs = {
    label: project_path / "dataset" / "phishing_dataset" / "FSS_voicephishing_data" / "mp3" / folder
    for label, folder in (
        ("대출사기형", "대출사기형"),
        ("수사기관_사칭형", "수사기관_사칭형"),
        ("기타", "기타유형"),
    )
}

# The metadata CSV sits next to the wav files (kept as a plain string path)
metadata_csv_path = str(phishing_wav_dir / "metadata.csv")

# Create the output folder if it does not exist yet
os.makedirs(phishing_wav_dir, exist_ok=True)

# Convert and record metadata
with open(metadata_csv_path, mode="w", newline='', encoding="utf-8") as meta_file:
    writer = csv.writer(meta_file)
    writer.writerow(["filename", "label", "source_path"])  # header row

    for label, mp3_dir in mp3_dirs.items():
        for filename in os.listdir(mp3_dir):
            # Only mp3 entries are converted
            if not filename.endswith(".mp3"):
                continue

            mp3_path = os.path.join(mp3_dir, filename)
            # Saved as "<label>_<original name>.wav"
            clean_filename = os.path.splitext(filename)[0]
            wav_filename = f"{label}_{clean_filename}.wav"
            wav_path = os.path.join(phishing_wav_dir, wav_filename)

            try:
                AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
                writer.writerow([wav_filename, label, mp3_path])
                print(f"🔊 변환 및 저장 완료: {wav_filename}")
            except Exception as e:
                # Report the failure and continue with the next file
                print(f"⚠️ 변환 실패 ({filename}): {e}")

🔊 변환 및 저장 완료: 대출사기형_185.wav
🔊 변환 및 저장 완료: 대출사기형_184.wav
🔊 변환 및 저장 완료: 대출사기형_183.wav
🔊 변환 및 저장 완료: 대출사기형_182.wav
🔊 변환 및 저장 완료: 대출사기형_181.wav
🔊 변환 및 저장 완료: 대출사기형_180.wav
🔊 변환 및 저장 완료: 대출사기형_179.wav
🔊 변환 및 저장 완료: 대출사기형_178.wav
🔊 변환 및 저장 완료: 대출사기형_177.wav
🔊 변환 및 저장 완료: 대출사기형_176.wav
🔊 변환 및 저장 완료: 대출사기형_175.wav
🔊 변환 및 저장 완료: 대출사기형_174.wav
🔊 변환 및 저장 완료: 대출사기형_173.wav
🔊 변환 및 저장 완료: 대출사기형_172.wav
🔊 변환 및 저장 완료: 대출사기형_171.wav
🔊 변환 및 저장 완료: 대출사기형_170.wav
🔊 변환 및 저장 완료: 대출사기형_169.wav
🔊 변환 및 저장 완료: 대출사기형_168.wav
🔊 변환 및 저장 완료: 대출사기형_167.wav
🔊 변환 및 저장 완료: 대출사기형_166.wav
🔊 변환 및 저장 완료: 대출사기형_165.wav
🔊 변환 및 저장 완료: 대출사기형_164.wav
🔊 변환 및 저장 완료: 대출사기형_163.wav
🔊 변환 및 저장 완료: 대출사기형_162.wav
🔊 변환 및 저장 완료: 대출사기형_161.wav
🔊 변환 및 저장 완료: 대출사기형_160.wav
🔊 변환 및 저장 완료: 대출사기형_159.wav
🔊 변환 및 저장 완료: 대출사기형_158.wav
🔊 변환 및 저장 완료: 대출사기형_157.wav
🔊 변환 및 저장 완료: 대출사기형_156.wav
🔊 변환 및 저장 완료: 대출사기형_155.wav
🔊 변환 및 저장 완료: 대출사기형_154.wav
🔊 변환 및 저장 완료: 대출사기형_153.wav
🔊 변환 및 저장 완료: 대출사기형_152.wav
🔊 변환 및 저장 완료: 대출사기형_151.wav
🔊 변환 및 저장 완료: 대출사기형_

KeyboardInterrupt: 

- 위 KeyboardInterrupt는 셀 실행을 수동으로 중단해서 발생한 것이며, 코드 오류가 아님
- 중단 시점까지의 파일 변환은 정상적으로 진행됨