In [1]:
import torch
print(torch.version.cuda)  # Print the CUDA version (None means a CPU-only build)
print(torch.cuda.is_available())  # False means CUDA cannot be used

12.1
True


In [None]:
import json

def modify_paths(json_data, json_filename, path_prefix="MARS/All_Datas/J91"):
    """Point every dialog's audio/text path at ``path_prefix/json_filename``.

    Only the last '/'-separated component (the file name) of each existing
    path is kept; everything before it is replaced.

    Args:
        json_data: Parsed session JSON containing ``['dataSet']['dialogs']``,
            a list of dicts with ``'audioPath'`` and ``'textPath'`` strings.
        json_filename: Session identifier used as the directory name in the
            rewritten paths (e.g. ``"S00007727"``).
        path_prefix: Leading part of the rewritten paths. Defaults to the
            value that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        The same ``json_data`` object, modified in place.

    Raises:
        KeyError: If the expected keys are missing from ``json_data``.
    """
    for dialog in json_data['dataSet']['dialogs']:
        # Read both names before writing so a malformed entry is left untouched.
        audio_name = dialog['audioPath'].split('/')[-1]
        text_name = dialog['textPath'].split('/')[-1]

        dialog['audioPath'] = f"{path_prefix}/{json_filename}/{audio_name}"
        dialog['textPath'] = f"{path_prefix}/{json_filename}/{text_name}"

    return json_data

# Rewrite the audio/text paths of every session JSON, one file at a time.
for i in range(7727, 7868):
    json_filename = f"S{str(i).zfill(8)}"  # builds names of the form S00007727

    # Build the path once so the read and the write always target the same file.
    json_path = f'/content/drive/Othercomputers/내 노트북/MARS/All_Datas/J91/{json_filename}/{json_filename}.json'

    # 'utf-8-sig' accepts files with or without a BOM, matching how the later
    # cell reads these files; with plain 'utf-8' json.load rejects a leading BOM.
    with open(json_path, 'r', encoding='utf-8-sig') as f:
        data = json.load(f)

    # Replace the directory part of each dialog path.
    modified_data = modify_paths(data, json_filename)

    # Save the modified JSON back over the original file.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(modified_data, f, ensure_ascii=False, indent='\t')

In [2]:
import json
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import librosa
import numpy as np
from tqdm import tqdm
import pandas as pd

def calculate_cer(reference, hypothesis):
    """Return the character error rate (CER) of ``hypothesis`` against ``reference``.

    Spaces are removed from both strings, then the Levenshtein distance
    between the remaining characters is divided by the reference length.
    If either input is empty, or the reference contains only spaces,
    the maximum error rate 1.0 is returned.
    """
    if not (reference and hypothesis):
        return 1.0  # an empty input is scored as the maximum error rate

    # Compare character by character, ignoring spaces.
    ref_seq = reference.replace(' ', '')
    hyp_seq = hypothesis.replace(' ', '')

    if len(ref_seq) == 0:  # reference consisted of spaces only
        return 1.0

    # Levenshtein distance, keeping only the previous and current DP rows.
    previous_row = list(range(len(hyp_seq) + 1))
    for row_index, ref_char in enumerate(ref_seq, start=1):
        current_row = [row_index]
        for col_index, hyp_char in enumerate(hyp_seq, start=1):
            if ref_char == hyp_char:
                cost = previous_row[col_index - 1]
            else:
                cost = 1 + min(
                    previous_row[col_index],       # deletion
                    current_row[col_index - 1],    # insertion
                    previous_row[col_index - 1],   # substitution
                )
            current_row.append(cost)
        previous_row = current_row

    # Normalise the edit distance by the reference length.
    return float(previous_row[-1]) / len(ref_seq)

class AudioTextDataset(Dataset):
    """Audio/transcript pairs listed in per-session JSON files.

    Each session ``name`` in ``file_list`` is expected at
    ``base_path/name/name.json`` and must contain ``['dataSet']['dialogs']``,
    a list of dicts with ``'audioPath'`` and ``'textPath'``. Only the file
    name of each path is used; the files are looked up next to the JSON.
    Dialogs whose audio or text file does not exist are skipped.
    """
    def __init__(self, base_path, file_list):
        """Index every dialog whose audio and text files exist on disk.

        Args:
            base_path: Directory containing one sub-directory per session.
            file_list: Iterable of session names (sub-directory names).

        A session whose JSON cannot be read or parsed is reported with a
        printed message and skipped; construction itself does not raise.
        """
        self.base_path = base_path
        self.samples = []

        for json_filename in file_list:
            json_path = os.path.join(base_path, json_filename, f"{json_filename}.json")
            try:
                # 'utf-8-sig' tolerates an optional BOM at the start of the file.
                with open(json_path, 'r', encoding='utf-8-sig') as f:
                    data = json.load(f)

                for dialog in data['dataSet']['dialogs']:
                    audio_name = dialog['audioPath'].split('/')[-1]
                    text_name = dialog['textPath'].split('/')[-1]

                    audio_path = os.path.join(base_path, json_filename, audio_name)
                    text_path = os.path.join(base_path, json_filename, text_name)

                    if os.path.exists(audio_path) and os.path.exists(text_path):
                        self.samples.append({
                            'json_filename': json_filename,
                            'audio_path': audio_path,
                            'text_path': text_path,
                            'new_audio_path': f"MARS/All_Datas/J91/{json_filename}/{audio_name}",
                            'new_text_path': f"MARS/All_Datas/J91/{json_filename}/{text_name}"
                        })
            except Exception as e:
                print(f"Error loading {json_filename}: {str(e)}")

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            dict with keys ``'json_filename'``, ``'audio'`` (waveform loaded
            at 16 kHz), ``'text'`` (stripped transcript), ``'new_audio_path'``
            and ``'new_text_path'``. If loading fails, the error is printed
            and a placeholder with one second of float32 silence and an empty
            transcript is returned instead of raising.
        """
        sample = self.samples[idx]

        try:
            # Load the audio resampled to 16 kHz; the returned rate is not needed.
            audio, _ = librosa.load(sample['audio_path'], sr=16000)

            # 'utf-8-sig' drops an optional BOM so it cannot leak into the
            # reference text and be counted as a character error.
            with open(sample['text_path'], 'r', encoding='utf-8-sig') as f:
                text = f.read().strip()
        except Exception as e:
            print(f"Error loading sample {idx}: {str(e)}")
            # Placeholder: 1 second of silence at 16 kHz, in float32 so it
            # has the same dtype as audio returned by librosa.load.
            audio = np.zeros(16000, dtype=np.float32)
            text = ''

        return {
            'json_filename': sample['json_filename'],
            'audio': audio,
            'text': text,
            'new_audio_path': sample['new_audio_path'],
            'new_text_path': sample['new_text_path']
        }
            
def collate_fn(batch):
    """Turn a list of sample dicts into one dict of per-field lists.

    Values are gathered as plain Python lists (no stacking or padding),
    in the same order as the samples in ``batch``.
    """
    field_names = (
        'json_filename',
        'audio',
        'text',
        'new_audio_path',
        'new_text_path',
    )
    collated = {}
    for field in field_names:
        collated[field] = [sample[field] for sample in batch]
    return collated

def update_json_files(results, base_path, path_prefix="MARS/All_Datas/J91"):
    """Rewrite the audio/text paths inside each listed session JSON file.

    For every session name in ``results`` the file
    ``base_path/name/name.json`` is read, each dialog's ``audioPath`` and
    ``textPath`` are replaced by ``path_prefix/name/<original file name>``,
    and the file is written back in place (UTF-8, tab-indented).

    Args:
        results: Iterable of session names; when a dict is passed only its
            keys are used.
        base_path: Directory containing one sub-directory per session.
        path_prefix: Leading part of the rewritten paths. Defaults to the
            value that used to be hard-coded, so existing callers are
            unaffected.

    Any error for a session is printed and that session is skipped; the
    function itself does not raise.
    """
    for filename in results:
        json_path = os.path.join(base_path, filename, f"{filename}.json")
        try:
            # 'utf-8-sig' tolerates an optional BOM at the start of the file.
            with open(json_path, 'r', encoding='utf-8-sig') as f:
                data = json.load(f)

            # Keep only the file name of each path and re-root it.
            for dialog in data['dataSet']['dialogs']:
                audio_name = dialog['audioPath'].split('/')[-1]
                text_name = dialog['textPath'].split('/')[-1]
                dialog['audioPath'] = f"{path_prefix}/{filename}/{audio_name}"
                dialog['textPath'] = f"{path_prefix}/{filename}/{text_name}"

            # Serialize first: opening with 'w' truncates the file, so a
            # serialization failure must happen before that point.
            serialized = json.dumps(data, ensure_ascii=False, indent='\t')
            with open(json_path, 'w', encoding='utf-8') as f:
                f.write(serialized)

        except Exception as e:
            print(f"Error updating {filename}: {str(e)}")

def save_to_csv(results_data, output_path='transcription_results.csv'):
    """Write per-utterance transcription results to a CSV file.

    Args:
        results_data: Iterable of dicts with the keys ``'filename'``,
            ``'reference'``, ``'transcription'`` and ``'cer'``.
        output_path: Destination CSV path.

    The file is written as UTF-8 with a BOM (``utf-8-sig``) and has the
    columns ``Filename, Reference, Transcription, CER``. The header row is
    written even when ``results_data`` is empty.

    Raises:
        KeyError: If an entry lacks one of the expected keys.
    """
    columns = ['Filename', 'Reference', 'Transcription', 'CER']

    rows = [
        {
            'Filename': entry['filename'],
            'Reference': entry['reference'],
            'Transcription': entry['transcription'],
            'CER': entry['cer'],
        }
        for entry in results_data
    ]

    # Passing the columns explicitly keeps the header present for an empty
    # result list, where the columns cannot be inferred from the rows.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"\n전사 결과가 {output_path}에 저장되었습니다.")

def main():
    """Transcribe the J91 sessions with Whisper and report character error rates.

    Steps, in order:
      1. Load ``openai/whisper-large-v3`` and wrap it in an ASR pipeline
         configured for Korean transcription.
      2. Build the dataset and dataloader for sessions S00007727..S00007867.
      3. Transcribe batch by batch and score each utterance with
         ``calculate_cer``; a batch that raises is printed and skipped.
      4. Rewrite the paths in the JSON of every session that produced at
         least one result.
      5. Print summary statistics, then write ``processing_results.json``
         (per-session averages) and ``transcription_results.csv``
         (per-utterance details) to the current working directory.
    """
    # Half precision only when CUDA is available; otherwise float32.
    # No explicit device string is needed: placement is delegated to
    # device_map="auto" when the model is loaded below.
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    batch_size = 16  # shared by the ASR pipeline and the DataLoader

    print("모델 로딩 중...")
    # Load the Whisper model.
    model_id = "openai/whisper-large-v3"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        device_map="auto"
    )

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        batch_size=batch_size,
        return_timestamps=True,
        generate_kwargs={
            "task": "transcribe",
            "language": "ko"
        }
    )
    print("모델 로딩 완료!")

    # Sessions to process: S00007727 through S00007867 inclusive.
    base_path = '/content/drive/Othercomputers/내 노트북/MARS/All_Datas/J91'
    file_list = [f"S{str(num).zfill(8)}" for num in range(7727, 7867 + 1)]

    # Dataset and loader; collate_fn keeps the samples as plain lists.
    dataset = AudioTextDataset(base_path, file_list)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=4,
        collate_fn=collate_fn,
        pin_memory=True
    )

    results = {}           # per-session CER scores and counts
    detailed_results = []  # one record per transcribed utterance

    print(f"총 샘플 수: {len(dataset)}")
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="음성 처리 중"):
            try:
                # Speech recognition for the whole batch.
                transcriptions = pipe(batch['audio'])

                # Score each utterance and accumulate per session.
                for filename, ref_text, trans in zip(batch['json_filename'], batch['text'], transcriptions):
                    cer = calculate_cer(ref_text, trans['text'])

                    session = results.setdefault(filename, {'cer_scores': [], 'num_processed': 0})
                    session['cer_scores'].append(cer)
                    session['num_processed'] += 1

                    detailed_results.append({
                        'filename': filename,
                        'reference': ref_text,
                        'transcription': trans['text'],
                        'cer': cer
                    })
            except Exception as e:
                # Best effort: report the failing batch and move on to the next one.
                print(f"Error processing batch: {str(e)}")
                continue

    # Update the JSON files of the sessions that produced results.
    print("JSON 파일 업데이트 중...")
    update_json_files(results, base_path)

    # Collapse the per-utterance scores into one record per session.
    final_results = []
    for filename, data in results.items():
        avg_cer = np.mean(data['cer_scores']) if data['cer_scores'] else None
        final_results.append({
            'filename': filename,
            'status': 'Success',
            'avg_cer': float(avg_cer) if avg_cer is not None else None,
            'num_processed': data['num_processed']
        })

    # Summary statistics.
    successful_files = sum(1 for r in final_results if r['avg_cer'] is not None)
    print(f"\n처리 완료된 파일: {successful_files}/{len(file_list)}")

    # Note: this is the mean of the per-session averages, not of all utterances.
    valid_results = [r['avg_cer'] for r in final_results if r['avg_cer'] is not None]
    if valid_results:
        total_avg_cer = np.mean(valid_results)
        print(f"전체 평균 CER: {total_avg_cer:.4f}")

    # Save the per-session results as JSON.
    result_path = 'processing_results.json'
    with open(result_path, 'w', encoding='utf-8') as f:
        json.dump(final_results, f, ensure_ascii=False, indent=2)
    print(f"\n상세 결과가 {result_path}에 저장되었습니다.")

    # Save the per-utterance transcriptions as CSV.
    save_to_csv(detailed_results, 'transcription_results.csv')

# Run the full pipeline only when this code is executed as the main module.
if __name__ == '__main__':
    main()


  from .autonotebook import tqdm as notebook_tqdm
You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of Up Guards and Adam paintings, and Mason's exquisite idylls are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap on the back before he says, like a shampooer in a Turkish bath, Next man!
[0.0s -> 5.3s]  Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.
[6.4s -> 10.1s]  Nor is Mr. Quilter's manner less interesting than his ma