## OCR 수행


In [None]:
# 1. 필요한 패키지 설치
!pip install easyocr
!pip install torch torchvision
!pip install tqdm pandas


# 2. Check GPU availability
import torch
print("GPU available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU model:", torch.cuda.get_device_name(0))
    !nvidia-smi  # GPU 상태 확인

import torch
import easyocr
import re
import json
import os
import pandas as pd
from datetime import datetime
import unicodedata
from tqdm import tqdm
from PIL import Image
from google.colab import drive


# 3. Google Drive 마운트
drive.mount('/content/drive')

class KoreanOCR:
    """Wrapper around an EasyOCR reader configured for Korean ('ko')."""

    def __init__(self):
        """Pick CUDA when torch reports it is available, then load the reader."""
        use_gpu = torch.cuda.is_available()
        self.device = 'cuda' if use_gpu else 'cpu'
        print(f"Using device: {self.device}")

        print("Initializing EasyOCR (this might take a while on first run)...")
        self.reader = easyocr.Reader(['ko'], gpu=use_gpu)
        print("OCR model loaded successfully!")

    def recognize_text(self, image_path):
        """Run OCR on one image and return the recognized text.

        The second element of every entry returned by ``reader.readtext`` is
        taken as the text fragment; fragments are joined with single spaces.
        Any exception is reported on stdout and an empty string is returned.
        """
        try:
            fragments = []
            for detection in self.reader.readtext(image_path):
                fragments.append(detection[1])
            return ' '.join(fragments)
        except Exception as e:
            print(f"Error recognizing text: {str(e)}")
            return ""

def create_core_financial_patterns():
    """Return one alternation regex covering the core credit-rating sections.

    Each section pattern starts at one of its keywords and lazily consumes
    text up to a lookahead stop word (or ``$``). Every section is wrapped in
    its own capture group and the groups are joined with ``|``.
    """
    section_patterns = (
        # balance sheet
        r'(?:재무상태표|재무상태|자산총계|부채총계|자본총계)[\s\S]*?(?=영업이익|다음|$)',
        # income statement
        r'(?:손익계산서|포괄손익계산서|매출액|영업이익|당기순이익)[\s\S]*?(?=자산|다음|$)',
        # credit information
        r'(?:신용등급|신용평가|등급전망|신용위험|채무불이행|리스크)[\s\S]*?(?=재무|다음|$)',
        # board and management commentary
        r'(?:이사회 의견|이사회 결정|경영진 논의|경영진 평가|경영진 의견)[\s\S]*?(?=재무|다음|$)',
        # footnotes
        r'(?:주석|비고|추가정보|기타사항|부기사항|특기사항)[\s\S]*?(?=재무|다음|$)',
    )
    return '|'.join('(' + section + ')' for section in section_patterns)

def extract_financial_data(text, pattern=None):
    """Extract and clean the finance-related passages found in ``text``.

    Args:
        text: Text to scan. ``None``/NaN is treated as "no text".
        pattern: Regex to search with. When omitted, the combined pattern
            from ``create_core_financial_patterns()`` is used.

    Returns:
        The cleaned matches, de-duplicated in first-seen order and joined
        with single spaces; ``""`` when there is nothing to extract.
    """
    # A missing value cannot be scanned by the regex engine; return early.
    if pd.isna(text):
        return ""

    if pattern is None:
        # The pattern factory in this file is create_core_financial_patterns;
        # the previous call targeted a name that is not defined here.
        pattern = create_core_financial_patterns()

    extracted_texts = []
    for match in re.finditer(pattern, text, re.MULTILINE | re.DOTALL):
        # Collapse whitespace runs, then keep only word characters,
        # whitespace and the symbols . ( ) % + -
        extracted_text = re.sub(r'\s+', ' ', match.group()).strip()
        extracted_text = re.sub(r'[^\w\s.()%+-]', '', extracted_text)
        extracted_texts.append(extracted_text)

    # dict.fromkeys removes duplicates while preserving order.
    return ' '.join(dict.fromkeys(extracted_texts))

def process_folder(folder_path, save_path, label, mode='TRAIN', ocr=None):
    """OCR every image in one company folder and save the result as JSON.

    The company name is the part of the folder's base name before the first
    ``]``, with ``[`` and spaces removed, NFC-normalized.

    Args:
        folder_path: Folder containing the page images.
        save_path: Output root; the JSON goes to ``<save_path>/JSON/<mode>/``.
        label: Credit rating stored with the extracted text.
        mode: Sub-directory name under ``JSON`` (default ``'TRAIN'``).
        ocr: Optional object exposing ``recognize_text(image_path)``. When
            omitted a new ``KoreanOCR`` is created, which reloads the model;
            pass a shared instance to avoid that across folders.

    Returns:
        ``(combined_text, result_df)``: all recognized text joined with
        spaces, and a one-row DataFrame with the columns ``company``,
        ``credit_rating``, ``financial_text`` and ``timestamp``. On any
        error, or when the folder has no images, ``("", empty DataFrame)``.
    """
    try:
        company_name = os.path.basename(folder_path).split(']')[0].replace('[', '').replace(" ", "").strip()
        company_name = unicodedata.normalize('NFC', company_name)

        image_files = sorted(
            f for f in os.listdir(folder_path)
            if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff'))
        )

        if not image_files:
            print("No image files found in the specified folder!")
            return "", pd.DataFrame()

        print(f"Found {len(image_files)} image files")

        # Build the OCR model only once we know there is work to do.
        if ocr is None:
            ocr = KoreanOCR()

        all_texts = []
        for image_file in tqdm(image_files, desc="Processing images"):
            try:
                image_path = os.path.join(folder_path, image_file)
                all_texts.append(ocr.recognize_text(image_path))
            except Exception as e:
                # Best effort: one unreadable page must not abort the folder.
                print(f"Error processing {image_file}: {str(e)}")

        combined_text = ' '.join(all_texts)
        financial_text = extract_financial_data(combined_text)

        # One record feeds both outputs, so the DataFrame row and the JSON
        # file always carry the same timestamp.
        result = {
            'company': company_name,
            'credit_rating': label,
            'financial_text': financial_text,
            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        result_df = pd.DataFrame([result])

        json_save_path = os.path.join(save_path, 'JSON', mode)
        os.makedirs(json_save_path, exist_ok=True)

        json_filename = os.path.join(json_save_path, f'{company_name}.json')
        with open(json_filename, 'w', encoding='utf-8') as json_file:
            json.dump(result, json_file, ensure_ascii=False, indent=4)

        return combined_text, result_df

    except Exception as e:
        print(f"Error in process_folder: {str(e)}")
        return "", pd.DataFrame()

GPU available: True
GPU model: Tesla T4
Sun Oct 27 17:51:06 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0              26W /  70W |   4187MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                            

In [None]:
def process_all_folders(base_folder, save_folder, credit_rating_df):
    """Process every company folder under ``<base_folder>/TRAIN`` and ``/TEST``.

    For each sub-folder the company name is taken from the folder name, its
    rating is looked up in ``credit_rating_df``, and ``process_folder`` is
    run. The combined results are written to a timestamped CSV and a JSON
    summary in ``save_folder``; folders that could not be processed are
    listed in ``failed_cases.txt``.

    Args:
        base_folder: Directory containing the ``TRAIN`` and ``TEST`` folders.
        save_folder: Output directory (created if missing).
        credit_rating_df: DataFrame with the columns ``기업명`` and
            ``신용등급``. It is not modified.

    Returns:
        The concatenated per-company results, or an empty DataFrame when
        nothing was processed.

    Raises:
        OSError: if the ``TRAIN`` or ``TEST`` directory cannot be scanned.
    """
    os.makedirs(save_folder, exist_ok=True)

    train_folder = os.path.join(base_folder, "TRAIN")
    test_folder = os.path.join(base_folder, "TEST")

    train_subfolders = sorted(f.path for f in os.scandir(train_folder) if f.is_dir())
    test_subfolders = sorted(f.path for f in os.scandir(test_folder) if f.is_dir())

    total_folders = len(train_subfolders) + len(test_subfolders)
    print(f"총 처리할 폴더 수: {total_folders}")
    print(f"신용등급 데이터의 기업 수: {len(credit_rating_df)}")

    # Work on a private copy so the caller's DataFrame does not silently gain
    # the helper column used for matching.
    ratings = credit_rating_df.copy()
    ratings['기업명_normalized'] = ratings['기업명'].apply(
        lambda x: unicodedata.normalize('NFC', str(x).strip())
    )

    def get_company_info(folder_path):
        """Return ``(company_name, credit_rating)`` or ``(None, None)``."""
        try:
            # NOTE(review): process_folder also strips inner spaces from the
            # name; this lookup does not — confirm which one is intended.
            company_name = os.path.basename(folder_path).split(']')[0].replace('[', '').strip()
            company_name = unicodedata.normalize('NFC', company_name)

            matching_companies = ratings[ratings['기업명_normalized'] == company_name]

            if matching_companies.empty:
                print(f"경고: {company_name}에 대한 신용등급 정보를 찾을 수 없습니다.")
                return None, None

            if len(matching_companies) > 1:
                print(f"경고: {company_name}에 대해 중복된 신용등급 정보가 있습니다.")

            # With duplicates, the first matching row wins.
            credit_rating = matching_companies['신용등급'].iloc[0]
            return company_name, credit_rating

        except Exception as e:
            print(f"회사 정보 추출 중 오류 발생: {str(e)}")
            return None, None

    def process_subset(subfolders, mode, pbar_desc):
        """Process one split; return ``(result frames, failed folder names)``."""
        results = []
        failed_companies = []

        for folder in tqdm(subfolders, desc=pbar_desc):
            company_name, credit_rating = get_company_info(folder)

            if company_name is None or credit_rating is None:
                failed_companies.append(os.path.basename(folder))
                continue

            try:
                _, result_df = process_folder(
                    folder_path=folder,
                    save_path=save_folder,
                    label=credit_rating,
                    mode=mode
                )
            except Exception as e:
                failed_companies.append(os.path.basename(folder))
                print(f"\n{company_name} 처리 중 오류 발생: {str(e)}")
            else:
                if result_df.empty:
                    # process_folder reports its own errors by returning an
                    # empty frame, so count that as a failure too.
                    failed_companies.append(os.path.basename(folder))
                else:
                    results.append(result_df)

        return results, failed_companies

    print("\nTRAIN 폴더 처리 중...")
    train_results, train_failed = process_subset(
        train_subfolders,
        'TRAIN',
        'Processing TRAIN folders'
    )

    print("\nTEST 폴더 처리 중...")
    test_results, test_failed = process_subset(
        test_subfolders,
        'TEST',
        'Processing TEST folders'
    )

    failed_cases = train_failed + test_failed
    if failed_cases:
        failed_log = os.path.join(save_folder, 'failed_cases.txt')
        with open(failed_log, 'w', encoding='utf-8') as f:
            f.write('\n'.join(failed_cases))
        print(f"\n처리 실패한 케이스들이 {failed_log}에 저장되었습니다.")

    all_results = train_results + test_results
    if not all_results:
        print("처리된 결과가 없습니다!")
        return pd.DataFrame()

    final_df = pd.concat(all_results, ignore_index=True)

    # One timestamp is shared by the CSV name, the summary name and the
    # summary content.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    csv_filename = os.path.join(save_folder, f'combined_results_{timestamp}.csv')
    final_df.to_csv(csv_filename, index=False, encoding='utf-8-sig')

    summary = {
        'total_companies': len(final_df),
        'unique_companies': len(final_df['company'].unique()),
        'credit_rating_distribution': final_df['credit_rating'].value_counts().to_dict(),
        'processing_timestamp': timestamp,
        'failed_cases_count': len(failed_cases)
    }

    summary_filename = os.path.join(save_folder, f'processing_summary_{timestamp}.json')
    with open(summary_filename, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=4)

    print(f"\n처리 완료!")
    print(f"- 총 처리된 기업 수: {len(final_df)}")
    print(f"- 실패한 케이스 수: {len(failed_cases)}")
    print(f"- 결과 저장 위치: {csv_filename}")
    print(f"- 요약 정보 저장 위치: {summary_filename}")

    return final_df

In [None]:
if __name__ == "__main__":
    # Report whether CUDA is usable
    print("CUDA 사용 가능 여부:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("사용 중인 GPU:", torch.cuda.get_device_name(0))

    # Mount Google Drive (when running on Colab)
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        print("Google Drive가 마운트되었습니다.")
    except ImportError:
        # Only a missing google.colab module means "not on Colab". A real
        # mount failure must not be reported as a local run.
        print("로컬 환경에서 실행 중입니다.")

    # Input and output locations
    base_folder = "/content/drive/MyDrive/Colab Notebooks/딥러닝_프로젝트/data"
    save_folder = "/content/drive/MyDrive/Colab Notebooks/딥러닝_프로젝트/OCR_results"

    # Company names and credit ratings as parallel lists: the i-th rating
    # belongs to the i-th company, so both lists must stay the same length.
    data = {
        "기업명": [
            "경남은행", "광동제약", "교보증권", "기업은행", "깨끗한나라", "대상", "동국산업", "동부건설",
            "매일홀딩스", "부산은행", "비씨카드", "빙그레", "삼성증권", "셀트리온헬스케어", "송원산업",
            "신세계푸드", "신한은행", "아이비케이투자증권", "아이엠뱅크", "우리은행", "우리카드",
            "이수화학", "이지홀딩스", "제주은행", "케이비국민카드", "키움증권", "한국스탠다드차타드은행",
            "한국자산신탁", "한국증권금융", "한화엔진", "한화투자증권", "HDC", "KB금융", "SFA반도체"
        ],
        "신용등급": [
            "AA+", "A", "A+", "AAA", "BBB", "A+", "BBB+", "BBB", "A+", "AAA",
            "AA+", "AA-", "AA+", "A+", "A-", "A+", "AAA", "A", "AAA", "AAA",
            "AA", "BBB-", "BBB+", "AA+", "AA+", "AA-", "AAA", "A-", "AAA",
            "BB+", "A+", "A+", "AAA", "BBB"
        ]
    }

    # Build the lookup table
    credit_rating_df = pd.DataFrame(data)

    try:
        # Record the start time
        start_time = datetime.now()
        print(f"처리 시작 시간: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

        # Run the whole pipeline
        final_results = process_all_folders(base_folder, save_folder, credit_rating_df)

        # Record the end time and compute the elapsed time
        end_time = datetime.now()
        processing_time = end_time - start_time

        print(f"\n처리 완료 시간: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"총 소요 시간: {processing_time}")

        # Result summary
        if not final_results.empty:
            print("\n처리 결과 요약:")
            print(f"- 전체 처리된 문서 수: {len(final_results)}")
            print(f"- 고유 기업 수: {len(final_results['company'].unique())}")
            print("\n신용등급 분포:")
            print(final_results['credit_rating'].value_counts())

    except Exception as e:
        # Report, then re-raise so the failure is not hidden.
        print(f"\n처리 중 오류 발생: {str(e)}")
        raise

CUDA 사용 가능 여부: True
사용 중인 GPU: Tesla T4
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive가 마운트되었습니다.
처리 시작 시간: 2024-10-27 17:51:16
총 처리할 폴더 수: 12
신용등급 데이터의 기업 수: 34

TRAIN 폴더 처리 중...


Processing TRAIN folders:   0%|          | 0/4 [00:00<?, ?it/s]

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 343 image files



Processing images:   0%|          | 0/343 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/343 [00:03<19:43,  3.46s/it][A
Processing images:   1%|          | 2/343 [00:07<20:43,  3.65s/it][A
Processing images:   1%|          | 3/343 [00:14<29:32,  5.21s/it][A
Processing images:   1%|          | 4/343 [00:23<37:20,  6.61s/it][A
Processing images:   1%|▏         | 5/343 [00:28<35:01,  6.22s/it][A
Processing images:   2%|▏         | 6/343 [00:29<24:47,  4.41s/it][A
Processing images:   2%|▏         | 7/343 [00:31<20:29,  3.66s/it][A
Processing images:   2%|▏         | 8/343 [00:33<16:55,  3.03s/it][A
Processing images:   3%|▎         | 9/343 [00:37<19:12,  3.45s/it][A
Processing images:   3%|▎         | 10/343 [00:38<14:54,  2.69s/it][A
Processing images:   3%|▎         | 11/343 [00:40<14:13,  2.57s/it][A
Processing images:   3%|▎         | 12/343 [00:43<13:26,  2.44s/it][A
Processing images:   4%|▍         | 13/343 [00:45<14:09,  2.57s/it][A
Processing images:   4%

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 360 image files



Processing images:   0%|          | 0/360 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/360 [00:02<15:39,  2.62s/it][A
Processing images:   1%|          | 2/360 [00:06<18:21,  3.08s/it][A
Processing images:   1%|          | 3/360 [00:09<19:19,  3.25s/it][A
Processing images:   1%|          | 4/360 [00:18<32:14,  5.43s/it][A
Processing images:   1%|▏         | 5/360 [00:20<25:22,  4.29s/it][A
Processing images:   2%|▏         | 6/360 [00:22<20:57,  3.55s/it][A
Processing images:   2%|▏         | 7/360 [00:25<20:31,  3.49s/it][A
Processing images:   2%|▏         | 8/360 [00:28<18:41,  3.19s/it][A
Processing images:   2%|▎         | 9/360 [00:30<16:27,  2.81s/it][A
Processing images:   3%|▎         | 10/360 [00:33<15:49,  2.71s/it][A
Processing images:   3%|▎         | 11/360 [00:34<14:23,  2.47s/it][A
Processing images:   3%|▎         | 12/360 [00:36<13:03,  2.25s/it][A
Processing images:   4%|▎         | 13/360 [00:39<14:07,  2.44s/it][A
Processing images:   4%

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 341 image files



Processing images:   0%|          | 0/341 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/341 [00:02<14:04,  2.48s/it][A
Processing images:   1%|          | 2/341 [00:05<15:28,  2.74s/it][A
Processing images:   1%|          | 3/341 [00:09<18:27,  3.28s/it][A
Processing images:   1%|          | 4/341 [00:19<33:37,  5.99s/it][A
Processing images:   1%|▏         | 5/341 [00:20<23:17,  4.16s/it][A
Processing images:   2%|▏         | 6/341 [00:22<18:58,  3.40s/it][A
Processing images:   2%|▏         | 7/341 [00:23<15:22,  2.76s/it][A
Processing images:   2%|▏         | 8/341 [00:27<16:23,  2.95s/it][A
Processing images:   3%|▎         | 9/341 [00:30<16:28,  2.98s/it][A
Processing images:   3%|▎         | 10/341 [00:33<16:26,  2.98s/it][A
Processing images:   3%|▎         | 11/341 [00:35<15:29,  2.82s/it][A
Processing images:   4%|▎         | 12/341 [00:38<15:16,  2.79s/it][A
Processing images:   4%|▍         | 13/341 [00:41<15:35,  2.85s/it][A
Processing images:   4%

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 504 image files



Processing images:   0%|          | 0/504 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/504 [00:03<27:04,  3.23s/it][A
Processing images:   0%|          | 2/504 [00:06<28:20,  3.39s/it][A
Processing images:   1%|          | 3/504 [00:09<26:52,  3.22s/it][A
Processing images:   1%|          | 4/504 [00:12<25:23,  3.05s/it][A
Processing images:   1%|          | 5/504 [00:17<32:11,  3.87s/it][A
Processing images:   1%|          | 6/504 [00:33<1:04:15,  7.74s/it][A
Processing images:   1%|▏         | 7/504 [00:36<51:55,  6.27s/it]  [A
Processing images:   2%|▏         | 8/504 [00:38<41:45,  5.05s/it][A
Processing images:   2%|▏         | 9/504 [00:41<35:22,  4.29s/it][A
Processing images:   2%|▏         | 10/504 [00:45<34:35,  4.20s/it][A
Processing images:   2%|▏         | 11/504 [00:49<32:59,  4.02s/it][A
Processing images:   2%|▏         | 12/504 [00:51<28:56,  3.53s/it][A
Processing images:   3%|▎         | 13/504 [00:53<26:04,  3.19s/it][A
Processing images: 


TEST 폴더 처리 중...


Processing TEST folders:   0%|          | 0/8 [00:00<?, ?it/s]

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 408 image files



Processing images:   0%|          | 0/408 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/408 [00:02<18:37,  2.75s/it][A
Processing images:   0%|          | 2/408 [00:05<20:04,  2.97s/it][A
Processing images:   1%|          | 3/408 [00:08<18:22,  2.72s/it][A
Processing images:   1%|          | 4/408 [00:10<17:22,  2.58s/it][A
Processing images:   1%|          | 5/408 [00:19<32:48,  4.88s/it][A
Processing images:   1%|▏         | 6/408 [00:21<25:27,  3.80s/it][A
Processing images:   2%|▏         | 7/408 [00:23<21:54,  3.28s/it][A
Processing images:   2%|▏         | 8/408 [00:25<19:11,  2.88s/it][A
Processing images:   2%|▏         | 9/408 [00:27<16:57,  2.55s/it][A
Processing images:   2%|▏         | 10/408 [00:30<18:41,  2.82s/it][A
Processing images:   3%|▎         | 11/408 [00:32<15:46,  2.38s/it][A
Processing images:   3%|▎         | 12/408 [00:34<15:19,  2.32s/it][A
Processing images:   3%|▎         | 13/408 [00:37<17:04,  2.59s/it][A
Processing images:   3%

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 874 image files



Processing images:   0%|          | 0/874 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/874 [00:02<36:45,  2.53s/it][A
Processing images:   0%|          | 2/874 [00:09<1:16:30,  5.26s/it][A
Processing images:   0%|          | 3/874 [00:16<1:24:59,  5.86s/it][A
Processing images:   0%|          | 4/874 [00:21<1:23:44,  5.77s/it][A
Processing images:   1%|          | 5/874 [00:24<1:08:41,  4.74s/it][A
Processing images:   1%|          | 6/874 [00:28<1:02:01,  4.29s/it][A
Processing images:   1%|          | 7/874 [00:32<59:50,  4.14s/it]  [A
Processing images:   1%|          | 8/874 [00:35<56:36,  3.92s/it][A
Processing images:   1%|          | 9/874 [01:25<4:25:22, 18.41s/it][A
Processing images:   1%|          | 10/874 [01:28<3:16:41, 13.66s/it][A
Processing images:   1%|▏         | 11/874 [01:33<2:37:21, 10.94s/it][A
Processing images:   1%|▏         | 12/874 [01:36<2:03:38,  8.61s/it][A
Processing images:   1%|▏         | 13/874 [01:42<1:52:20,  7.83s/it][A
P

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 183 image files



Processing images:   0%|          | 0/183 [00:00<?, ?it/s][A
Processing images:   1%|          | 1/183 [00:02<07:38,  2.52s/it][A
Processing images:   1%|          | 2/183 [00:08<14:12,  4.71s/it][A
Processing images:   2%|▏         | 3/183 [00:11<11:13,  3.74s/it][A
Processing images:   2%|▏         | 4/183 [00:13<09:50,  3.30s/it][A
Processing images:   3%|▎         | 5/183 [00:16<08:40,  2.92s/it][A
Processing images:   3%|▎         | 6/183 [00:18<08:10,  2.77s/it][A
Processing images:   4%|▍         | 7/183 [00:22<08:42,  2.97s/it][A
Processing images:   4%|▍         | 8/183 [00:24<08:30,  2.92s/it][A
Processing images:   5%|▍         | 9/183 [00:27<08:08,  2.81s/it][A
Processing images:   5%|▌         | 10/183 [00:30<07:55,  2.75s/it][A
Processing images:   6%|▌         | 11/183 [00:32<07:28,  2.61s/it][A
Processing images:   7%|▋         | 12/183 [00:36<08:23,  2.95s/it][A
Processing images:   7%|▋         | 13/183 [00:40<09:38,  3.40s/it][A
Processing images:   8%

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 512 image files



Processing images:   0%|          | 0/512 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/512 [00:03<29:35,  3.48s/it][A
Processing images:   0%|          | 2/512 [00:06<28:54,  3.40s/it][A
Processing images:   1%|          | 3/512 [00:09<24:24,  2.88s/it][A
Processing images:   1%|          | 4/512 [00:10<20:54,  2.47s/it][A
Processing images:   1%|          | 5/512 [00:13<21:32,  2.55s/it][A
Processing images:   1%|          | 6/512 [00:25<48:50,  5.79s/it][A
Processing images:   1%|▏         | 7/512 [00:28<39:34,  4.70s/it][A
Processing images:   2%|▏         | 8/512 [00:30<34:02,  4.05s/it][A
Processing images:   2%|▏         | 9/512 [00:34<33:05,  3.95s/it][A
Processing images:   2%|▏         | 10/512 [00:36<26:39,  3.19s/it][A
Processing images:   2%|▏         | 11/512 [00:37<22:58,  2.75s/it][A
Processing images:   2%|▏         | 12/512 [00:39<19:40,  2.36s/it][A
Processing images:   3%|▎         | 13/512 [00:42<20:52,  2.51s/it][A
Processing images:   3%

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 301 image files



Processing images:   0%|          | 0/301 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/301 [00:02<13:59,  2.80s/it][A
Processing images:   1%|          | 2/301 [00:06<16:21,  3.28s/it][A
Processing images:   1%|          | 3/301 [00:08<14:05,  2.84s/it][A
Processing images:   1%|▏         | 4/301 [00:14<19:29,  3.94s/it][A
Processing images:   2%|▏         | 5/301 [00:17<17:28,  3.54s/it][A
Processing images:   2%|▏         | 6/301 [00:20<16:36,  3.38s/it][A
Processing images:   2%|▏         | 7/301 [00:22<13:56,  2.85s/it][A
Processing images:   3%|▎         | 8/301 [00:23<12:00,  2.46s/it][A
Processing images:   3%|▎         | 9/301 [00:25<11:23,  2.34s/it][A
Processing images:   3%|▎         | 10/301 [00:27<10:47,  2.22s/it][A
Processing images:   4%|▎         | 11/301 [00:30<11:03,  2.29s/it][A
Processing images:   4%|▍         | 12/301 [00:31<10:17,  2.14s/it][A
Processing images:   4%|▍         | 13/301 [00:35<11:44,  2.45s/it][A
Processing images:   5%

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 312 image files



Processing images:   0%|          | 0/312 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/312 [00:02<13:30,  2.61s/it][A
Processing images:   1%|          | 2/312 [00:04<10:26,  2.02s/it][A
Processing images:   1%|          | 3/312 [00:06<11:58,  2.33s/it][A
Processing images:   1%|▏         | 4/312 [00:12<17:52,  3.48s/it][A
Processing images:   2%|▏         | 5/312 [00:14<14:55,  2.92s/it][A
Processing images:   2%|▏         | 6/312 [00:15<12:52,  2.52s/it][A
Processing images:   2%|▏         | 7/312 [00:18<12:27,  2.45s/it][A
Processing images:   3%|▎         | 8/312 [00:21<14:03,  2.77s/it][A
Processing images:   3%|▎         | 9/312 [00:23<12:26,  2.46s/it][A
Processing images:   3%|▎         | 10/312 [00:24<11:01,  2.19s/it][A
Processing images:   4%|▎         | 11/312 [00:27<10:50,  2.16s/it][A
Processing images:   4%|▍         | 12/312 [00:29<10:40,  2.13s/it][A
Processing images:   4%|▍         | 13/312 [00:30<09:26,  1.90s/it][A
Processing images:   4%

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 332 image files



Processing images:   0%|          | 0/332 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/332 [00:02<15:53,  2.88s/it][A
Processing images:   1%|          | 2/332 [00:06<18:35,  3.38s/it][A
Processing images:   1%|          | 3/332 [00:09<16:55,  3.09s/it][A
Processing images:   1%|          | 4/332 [00:15<23:23,  4.28s/it][A
Processing images:   2%|▏         | 5/332 [00:18<21:51,  4.01s/it][A
Processing images:   2%|▏         | 6/332 [00:20<16:15,  2.99s/it][A
Processing images:   2%|▏         | 7/332 [00:22<15:29,  2.86s/it][A
Processing images:   2%|▏         | 8/332 [00:24<14:35,  2.70s/it][A
Processing images:   3%|▎         | 9/332 [00:27<14:50,  2.76s/it][A
Processing images:   3%|▎         | 10/332 [00:29<12:46,  2.38s/it][A
Processing images:   3%|▎         | 11/332 [00:32<13:53,  2.60s/it][A
Processing images:   4%|▎         | 12/332 [00:35<14:43,  2.76s/it][A
Processing images:   4%|▍         | 13/332 [00:38<14:54,  2.80s/it][A
Processing images:   4%

Using device: cuda
Initializing EasyOCR (this might take a while on first run)...
OCR model loaded successfully!
Found 465 image files



Processing images:   0%|          | 0/465 [00:00<?, ?it/s][A
Processing images:   0%|          | 1/465 [00:03<25:45,  3.33s/it][A
Processing images:   0%|          | 2/465 [00:07<28:20,  3.67s/it][A
Processing images:   1%|          | 3/465 [00:11<31:11,  4.05s/it][A
Processing images:   1%|          | 4/465 [00:16<33:39,  4.38s/it][A
Processing images:   1%|          | 5/465 [00:25<45:24,  5.92s/it][A
Processing images:   1%|▏         | 6/465 [00:29<41:10,  5.38s/it][A
Processing images:   2%|▏         | 7/465 [00:34<40:29,  5.31s/it][A
Processing images:   2%|▏         | 8/465 [00:38<36:27,  4.79s/it][A
Processing images:   2%|▏         | 9/465 [00:42<34:01,  4.48s/it][A
Processing images:   2%|▏         | 10/465 [00:47<36:33,  4.82s/it][A
Processing images:   2%|▏         | 11/465 [00:52<35:23,  4.68s/it][A
Processing images:   3%|▎         | 12/465 [00:54<30:41,  4.07s/it][A
Processing images:   3%|▎         | 13/465 [00:58<29:02,  3.85s/it][A
Processing images:   3%


처리 완료!
- 총 처리된 기업 수: 12
- 실패한 케이스 수: 0
- 결과 저장 위치: /content/drive/MyDrive/Colab Notebooks/딥러닝_프로젝트/OCR_results/combined_results_20241027_214526.csv
- 요약 정보 저장 위치: /content/drive/MyDrive/Colab Notebooks/딥러닝_프로젝트/OCR_results/processing_summary_20241027_214526.json

처리 완료 시간: 2024-10-27 21:45:26
총 소요 시간: 3:54:10.539152

처리 결과 요약:
- 전체 처리된 문서 수: 12
- 고유 기업 수: 12

신용등급 분포:
credit_rating
AAA     3
AA+     2
A+      2
BBB+    1
AA-     1
BBB     1
A-      1
BB+     1
Name: count, dtype: int64


In [None]:
import pandas as pd
import json
import os
import re
from datetime import datetime
from tqdm import tqdm

def create_core_financial_patterns():
    """Assemble the combined section regex used for credit-rating text.

    Five section regexes (keyword start, lazy body, lookahead stop) are each
    wrapped in a capture group and joined with ``|`` into one pattern string.
    """
    sections = [
        ('balance_sheet', r'(?:재무상태표|재무상태|자산총계|부채총계|자본총계)[\s\S]*?(?=영업이익|다음|$)'),
        ('income_statement', r'(?:손익계산서|포괄손익계산서|매출액|영업이익|당기순이익)[\s\S]*?(?=자산|다음|$)'),
        ('credit_info', r'(?:신용등급|신용평가|등급전망|신용위험|채무불이행|리스크)[\s\S]*?(?=재무|다음|$)'),
        ('board_and_management', r'(?:이사회 의견|이사회 결정|경영진 논의|경영진 평가|경영진 의견)[\s\S]*?(?=재무|다음|$)'),
        ('footnotes', r'(?:주석|비고|추가정보|기타사항|부기사항|특기사항)[\s\S]*?(?=재무|다음|$)'),
    ]

    # The section names only label the entries; the regexes alone are used.
    grouped = []
    for _section_name, section_regex in sections:
        grouped.append(f'({section_regex})')
    return '|'.join(grouped)

def extract_financial_data(text, pattern=None):
    """Pull out the finance-related passages of ``text`` and tidy them up.

    Uses ``pattern`` when given, otherwise the combined pattern from
    ``create_core_financial_patterns()``. A ``None``/NaN ``text`` gives ``""``.
    Each match has its whitespace collapsed and every character other than
    word characters, whitespace and ``. ( ) % + -`` removed. Duplicates are
    dropped in first-seen order and the rest is joined with single spaces.
    """
    pattern = create_core_financial_patterns() if pattern is None else pattern

    # Missing values (None or NaN) produce no output
    if pd.isna(text):
        return ""

    # A dict keeps insertion order, so it de-duplicates and orders at once
    seen = {}
    for hit in re.finditer(pattern, text, re.MULTILINE | re.DOTALL):
        cleaned = re.sub(r'\s+', ' ', hit.group()).strip()
        cleaned = re.sub(r'[^\w\s.()%+-]', '', cleaned)
        seen.setdefault(cleaned, None)

    return ' '.join(seen)

def process_company_data(train_dir, json_output_dir, mode='TRAIN'):
    """Convert every company CSV under ``train_dir`` into one JSON file.

    Despite its name, ``train_dir`` is simply the directory to walk; the
    caller in this file passes the TEST directory as well.

    Each CSV is read for the columns ``company``, ``text`` and ``label``. The
    output is ``<json_output_dir>/<mode>/<company>분기.json``, where
    ``<company>`` is the first ``company`` value up to the first ``]`` with
    ``[`` removed.

    Args:
        train_dir: Directory searched recursively for ``.csv`` files.
        json_output_dir: Root of the JSON output tree.
        mode: Sub-directory name under ``json_output_dir``.

    Returns:
        ``True`` when the directory was walked (individual files may still
        have failed and been reported), ``False`` on an unexpected error.
    """
    try:
        # Create the output directory
        json_output_path = os.path.join(json_output_dir, mode)
        os.makedirs(json_output_path, exist_ok=True)

        # Collect the CSV files. The extension match ignores case and the
        # list is sorted so runs are reproducible.
        csv_files = []
        for root, _, files in os.walk(train_dir):
            for file in files:
                if file.lower().endswith('.csv'):
                    csv_files.append(os.path.join(root, file))
        csv_files.sort()

        print(f"발견된 CSV 파일 수: {len(csv_files)}")

        # Process each CSV file
        for csv_path in tqdm(csv_files, desc=f"Processing {mode} data"):
            try:
                df = pd.read_csv(csv_path)

                # The company name comes from the 'company' column
                company_name = df['company'].values[0]
                company_name = company_name.split(']')[0].replace('[', '').strip()
                company_name = f"{company_name}분기"

                # Join all text rows and extract the financial passages
                all_texts = ' '.join(df['text'].fillna('').astype(str))
                processed_text = extract_financial_data(all_texts)

                # Use the first label. NaN becomes '' and numpy scalars are
                # converted to plain Python values so the JSON stays valid.
                label = df['label'].iloc[0] if not df['label'].empty else ''
                if pd.isna(label):
                    label = ''
                elif hasattr(label, 'item'):
                    label = label.item()

                json_data = {
                    'company': company_name,
                    'credit_rating': label,
                    'financial_text': processed_text,
                    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                }

                json_filename = f"{company_name}.json"
                json_filepath = os.path.join(json_output_path, json_filename)
                print(json_filename)

                # Serialize before opening the file, so a serialization error
                # cannot leave a truncated JSON file behind.
                payload = json.dumps(json_data, ensure_ascii=False, indent=4)
                with open(json_filepath, 'w', encoding='utf-8') as f:
                    f.write(payload)

            except Exception as e:
                # Best effort: report the bad file and keep going.
                print(f"Error processing {csv_path}: {str(e)}")
                continue

        print(f"{mode} 데이터 변환 완료: {json_output_path}")
        return True

    except Exception as e:
        print(f"오류 발생: {str(e)}")
        return False

if __name__ == "__main__":
    # Example usage: convert the TRAIN split, then the TEST split.
    base_dir = "/content/drive/MyDrive/Colab Notebooks/딥러닝_프로젝트/OCR_results/CSV"  # root directory
    train_dir, test_dir = (os.path.join(base_dir, split) for split in ("TRAIN", "TEST"))
    json_output_dir = os.path.join(base_dir, "JSON")  # JSON output root

    for banner, source_dir, split in (
        ("TRAIN 데이터 처리 중...", train_dir, 'TRAIN'),
        ("\nTEST 데이터 처리 중...", test_dir, 'TEST'),
    ):
        print(banner)
        process_company_data(source_dir, json_output_dir, mode=split)

TRAIN 데이터 처리 중...
발견된 CSV 파일 수: 44


Processing TRAIN data:   5%|▍         | 2/44 [00:00<00:02, 15.81it/s]

동부건설분기.json
DB금융투자분기.json
HDC분기.json


Processing TRAIN data:   9%|▉         | 4/44 [00:00<00:03, 11.10it/s]

JB금융지주분기.json
JW중외제약분기.json


Processing TRAIN data:  14%|█▎        | 6/44 [00:00<00:02, 13.92it/s]

JW홀딩스분기.json
KB금융분기.json


Processing TRAIN data:  18%|█▊        | 8/44 [00:00<00:03, 11.37it/s]

SFA반도체분기.json
SK네트웍스분기.json


Processing TRAIN data:  23%|██▎       | 10/44 [00:00<00:02, 12.09it/s]

SK증권분기.json
매일홀딩스분기.json


Processing TRAIN data:  27%|██▋       | 12/44 [00:00<00:02, 13.55it/s]

부산은행분기.json
비씨카드분기.json
빙그레분기.json


Processing TRAIN data:  34%|███▍      | 15/44 [00:01<00:01, 15.89it/s]

삼성EA분기.json


Processing TRAIN data:  39%|███▊      | 17/44 [00:01<00:01, 14.40it/s]

삼성SDI분기.json
삼성증권분기.json
셀트리온헬스케어분기.json
송원산업분기.json


Processing TRAIN data:  45%|████▌     | 20/44 [00:01<00:01, 15.63it/s]

신세계푸드분기.json


Processing TRAIN data:  50%|█████     | 22/44 [00:01<00:01, 14.82it/s]

신한은행분기.json
아이비케이투자증권분기.json
아이엠뱅크분기.json


Processing TRAIN data:  55%|█████▍    | 24/44 [00:01<00:01, 13.13it/s]

우리은행분기.json
우리종합금융분기.json


Processing TRAIN data:  59%|█████▉    | 26/44 [00:01<00:01, 14.46it/s]

우리카드분기.json
유안타증권분기.json


Processing TRAIN data:  64%|██████▎   | 28/44 [00:02<00:01, 14.25it/s]

이수화학분기.json


Processing TRAIN data:  68%|██████▊   | 30/44 [00:02<00:00, 14.04it/s]

이지홀딩스분기.json
전북은행분기.json
제주은행분기.json


Processing TRAIN data:  73%|███████▎  | 32/44 [00:02<00:00, 13.62it/s]

케이비국민카드분기.json
케이비캐피탈분기.json
케이티스카이라이프분기.json


Processing TRAIN data:  80%|███████▉  | 35/44 [00:02<00:00, 14.94it/s]

키움증권분기.json


Processing TRAIN data:  84%|████████▍ | 37/44 [00:02<00:00, 14.48it/s]

하나은행분기.json
하나자산신탁분기.json


Processing TRAIN data:  89%|████████▊ | 39/44 [00:02<00:00, 14.70it/s]

한국스탠다드차타드은행분기.json
한국자산신탁분기.json


Processing TRAIN data:  93%|█████████▎| 41/44 [00:02<00:00, 15.48it/s]

한국증권금융분기.json
한전KPS분기.json


Processing TRAIN data:  98%|█████████▊| 43/44 [00:03<00:00, 15.56it/s]

한화엔진분기.json
한화투자증권분기.json


Processing TRAIN data: 100%|██████████| 44/44 [00:03<00:00, 14.37it/s]


형지엘리트분기.json
TRAIN 데이터 변환 완료: /content/drive/MyDrive/Colab Notebooks/딥러닝_프로젝트/OCR_results/CSV/JSON/TRAIN

TEST 데이터 처리 중...
발견된 CSV 파일 수: 10


Processing TEST data:  20%|██        | 2/10 [00:00<00:00, 19.20it/s]

경남은행분기.json
광동제약분기.json
광주은행분기.json


Processing TEST data:  40%|████      | 4/10 [00:00<00:00, 16.23it/s]

교보증권분기.json
국민은행분기.json


Processing TEST data:  60%|██████    | 6/10 [00:00<00:00, 11.46it/s]

기업은행분기.json
깨끗한나라분기.json


Processing TEST data:  90%|█████████ | 9/10 [00:00<00:00, 14.96it/s]

대상분기.json
동국산업분기.json


Processing TEST data: 100%|██████████| 10/10 [00:00<00:00, 14.34it/s]

동국홀딩스분기.json
TEST 데이터 변환 완료: /content/drive/MyDrive/Colab Notebooks/딥러닝_프로젝트/OCR_results/CSV/JSON/TEST



