In [None]:
# Mount Google Drive at /content/drive; the cells below read and write
# their CSV files under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Thai Medical Care Dataset Translator - Version 2.2
# Tuned for throughput on a Google Colab CPU runtime

!pip install -q datasets googletrans==4.0.0-rc1 tqdm requests deep-translator

import pandas as pd
from datasets import load_dataset
from googletrans import Translator
from deep_translator import GoogleTranslator
from tqdm import tqdm
import os
import time
import logging
import json
import requests
from typing import Optional, List, Dict, Any
import random
import hashlib
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
warnings.filterwarnings("ignore")

# Configure logging at INFO level with a compact "LEVEL:message" format and
# create the module-level logger used by the classes and main() below.
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s')
logger = logging.getLogger(__name__)

class RobustTranslator:
    """Translate English text to Thai through googletrans and deep-translator.

    Both back-ends are probed once at construction time. Every outgoing
    request is rate limited, successful translations are cached under a hash
    of the cleaned input, and a ``"[FAILED]: ..."`` placeholder string is
    returned when no back-end produces a usable result.
    """

    def __init__(self):
        """Set defaults and probe the translation back-ends.

        Raises:
            RuntimeError: if no back-end passes its probe translation.
        """
        # Entries look like {'name': <backend name>, 'obj': <translator object>}.
        self.translators = []
        # Maps cache key (see _get_cache_key) -> translated text.
        self.cache = {}
        # Inputs longer than this many characters are truncated before sending.
        self.max_text_length = 3500
        # Minimum number of seconds between two outgoing requests.
        self.rate_limit_delay = 1.5
        self.last_request_time = 0
        self.request_lock = threading.Lock()
        self._initialize_translators()

    def _initialize_translators(self):
        """Probe each back-end with a test translation and keep the working ones.

        Raises:
            RuntimeError: if neither back-end could be initialised.
        """
        try:
            gt = Translator()
            test_result = gt.translate("test", src='en', dest='th')
            if test_result and hasattr(test_result, 'text') and test_result.text:
                self.translators.append({'name': 'googletrans', 'obj': gt})
                logger.info("✅ googletrans ready")
            else:
                # Previously a silent skip; make the reason visible.
                logger.warning("googletrans failed: empty probe result")
        except Exception as e:
            logger.warning(f"googletrans failed: {str(e)[:50]}")

        try:
            dt = GoogleTranslator(source='en', target='th')
            test_result = dt.translate("test")
            if test_result and isinstance(test_result, str) and test_result.strip():
                self.translators.append({'name': 'deep_translator', 'obj': dt})
                logger.info("✅ deep-translator ready")
            else:
                logger.warning("deep-translator failed: empty probe result")
        except Exception as e:
            logger.warning(f"deep-translator failed: {str(e)[:50]}")

        if not self.translators:
            raise RuntimeError("❌ No translators initialized!")

        logger.info(f"✅ {len(self.translators)} translator(s) active")

    def _get_cache_key(self, text: str) -> str:
        """Return the first 16 hex characters of the MD5 digest of *text*."""
        return hashlib.md5(text.encode('utf-8')).hexdigest()[:16]

    def _apply_rate_limit(self):
        """Block until at least ``rate_limit_delay`` seconds separate requests.

        The sleep happens while holding ``request_lock``, so concurrent
        callers are serialised and spaced out as well.
        """
        with self.request_lock:
            elapsed = time.time() - self.last_request_time
            if elapsed < self.rate_limit_delay:
                # Small random jitter on top of the remaining wait.
                time.sleep(self.rate_limit_delay - elapsed + random.uniform(0.1, 0.3))
            self.last_request_time = time.time()

    def _clean_text(self, text: str) -> Optional[str]:
        """Normalise *text* for translation.

        Collapses all whitespace runs to single spaces and truncates the
        result to ``max_text_length`` characters. When truncating, the cut is
        moved back to the end of the last sentence (``". "``, ``"? "`` or
        ``"! "``) inside the limit, provided that boundary lies beyond 70% of
        the limit; otherwise the text is cut hard at the limit.

        Returns:
            The cleaned text, or ``None`` for empty, missing or placeholder
            input (``'nan'``, ``'None'``, ``'null'``).
        """
        if not text or pd.isna(text):
            return None

        text = str(text).strip()
        if not text or text in ['', 'nan', 'None', 'null']:
            return None

        text = ' '.join(text.split())

        if len(text) > self.max_text_length:
            threshold = self.max_text_length * 0.7
            # Consider every delimiter and keep the LATEST qualifying boundary;
            # stopping at the first delimiter type that qualifies could discard
            # a later sentence that still fits inside the limit.
            sentence_end = 0
            for delimiter in ('. ', '? ', '! '):
                pos = text.rfind(delimiter, 0, self.max_text_length)
                if pos > threshold:
                    sentence_end = max(sentence_end, pos + len(delimiter))
            truncate_pos = sentence_end or self.max_text_length
            text = text[:truncate_pos].strip()

        return text or None

    def _get_backend(self, name: str):
        """Return the registered translator object called *name*, or ``None``."""
        return next((t['obj'] for t in self.translators if t['name'] == name), None)

    @staticmethod
    def _usable(candidate, source: str) -> Optional[str]:
        """Return the stripped *candidate* if it is a real translation.

        Empty results, non-string results and results identical to *source*
        are rejected (``None``).
        """
        if not candidate or not isinstance(candidate, str):
            return None
        candidate = candidate.strip()
        if candidate and candidate != source:
            return candidate
        return None

    def _translate_googletrans(self, text: str) -> Optional[str]:
        """Translate *text* with googletrans; return ``None`` on any failure."""
        translator = self._get_backend('googletrans')
        if not translator:
            return None
        try:
            self._apply_rate_limit()
            result = translator.translate(text, src='en', dest='th')
        except Exception as e:
            logger.debug(f"GT failed: {str(e)[:30]}")
            return None
        return self._usable(getattr(result, 'text', None), text)

    def _translate_deep_translator(self, text: str) -> Optional[str]:
        """Translate *text* with deep-translator; return ``None`` on any failure."""
        translator = self._get_backend('deep_translator')
        if not translator:
            return None
        try:
            self._apply_rate_limit()
            result = translator.translate(text)
        except Exception as e:
            logger.debug(f"DT failed: {str(e)[:30]}")
            return None
        return self._usable(result, text)

    def translate_text(self, text: str, max_retries: int = 2) -> str:
        """Translate *text* to Thai, trying each back-end up to *max_retries* times.

        Args:
            text: source text; it is cleaned and possibly truncated first.
            max_retries: number of rounds over all back-ends.

        Returns:
            ``""`` when the input is empty after cleaning, the translation on
            success (also stored in the cache), or ``"[FAILED]: <prefix>"``
            containing the first 20 characters of the cleaned input when every
            attempt failed. A back-end result identical to the input counts
            as a failure.
        """
        cleaned_text = self._clean_text(text)
        if not cleaned_text:
            return ""

        cache_key = self._get_cache_key(cleaned_text)
        if cache_key in self.cache:
            return self.cache[cache_key]

        methods = [self._translate_googletrans, self._translate_deep_translator]

        for attempt in range(max_retries):
            for method in methods:
                try:
                    result = method(cleaned_text)
                except Exception as e:
                    # The back-end wrappers already swallow errors; this is a
                    # last line of defence so one bad call cannot abort a batch.
                    logger.debug(f"translate failed: {str(e)[:30]}")
                    continue
                if result and result.strip():
                    self.cache[cache_key] = result
                    return result

            # Back off briefly before the next round (not after the last one).
            if attempt < max_retries - 1:
                time.sleep(1 + random.uniform(0.2, 0.8))

        short_text = cleaned_text[:20] + "..." if len(cleaned_text) > 20 else cleaned_text
        return f"[FAILED]: {short_text}"

class DatasetTranslator:
    """Load Hugging Face datasets, translate selected text columns and save CSVs.

    Translated values are added to the DataFrame as new ``th_<column>``
    columns. Values that could not be translated are kept as
    ``"[FAILED]: ..."`` / ``"[ERROR]: ..."`` placeholder strings.
    """

    # Prefixes that mark a value as a failed translation rather than real output.
    _FAILURE_PREFIXES = ("[FAILED]", "[ERROR]")

    def __init__(self, max_workers: int = 2):
        """Create the underlying translator.

        Args:
            max_workers: stored on the instance; translate_batch itself
                processes items sequentially.
        """
        self.translator = RobustTranslator()
        self.max_workers = max_workers

    @classmethod
    def _is_failure(cls, value) -> bool:
        """Return True if *value* is a ``[FAILED]``/``[ERROR]`` placeholder."""
        return isinstance(value, str) and value.startswith(cls._FAILURE_PREFIXES)

    @staticmethod
    def _match_column(target_col: str, available: List[str]) -> Optional[str]:
        """Return the column in *available* that best matches *target_col*.

        An exact match wins. Otherwise the shortest column whose name contains
        or is contained in *target_col* (case-insensitive) is chosen; ties go
        to the column that comes first in *available*. Returns ``None`` when
        nothing matches.
        """
        if target_col in available:
            return target_col
        wanted = str(target_col).lower()
        matches = [col for col in available
                   if wanted in str(col).lower() or str(col).lower() in wanted]
        return min(matches, key=lambda col: len(str(col))) if matches else None

    def translate_batch(self, texts: List[str], column_name: str) -> List[str]:
        """Translate *texts* one by one and return results aligned with the input.

        Args:
            texts: source strings; empty/blank entries are skipped and map to ``''``.
            column_name: label used for the progress bar and log messages.

        Returns:
            A list the same length as *texts* holding the translation, ``''``
            for skipped entries, or a ``[FAILED]``/``[ERROR]`` placeholder.
        """
        if not texts:
            return []

        results = [''] * len(texts)
        valid_texts = [(i, text) for i, text in enumerate(texts) if text and str(text).strip()]
        total_valid = len(valid_texts)

        logger.info(f"Translating {column_name}: {total_valid} items")

        success_count = 0
        with tqdm(valid_texts, desc=f"{column_name}") as pbar:
            # `done` counts processed items; `i` is the row index in `texts`.
            # The two differ as soon as blank rows are skipped.
            for done, (i, text) in enumerate(pbar, start=1):
                try:
                    translated = self.translator.translate_text(str(text))
                    results[i] = translated
                    if translated and not self._is_failure(translated):
                        success_count += 1
                except Exception as e:
                    short_text = str(text)[:10] + "..."
                    logger.warning(f"{short_text}: {str(e)[:50]}")
                    results[i] = f"[ERROR]: {short_text}"

                if done % 10 == 0 or done == total_valid:
                    pbar.set_postfix({'OK': f"{success_count}/{done}"})

        failed_indices = [i for i, r in enumerate(results) if self._is_failure(r)]

        # Retry only when failures look sporadic (< 30% of the batch), and cap
        # the number of retried items at 10.
        if failed_indices and len(failed_indices) < total_valid * 0.3:
            logger.info(f"Retrying {len(failed_indices)} failed items...")

            for i in failed_indices[:10]:
                try:
                    time.sleep(2)
                    retranslated = self.translator.translate_text(str(texts[i]), max_retries=1)
                    if not self._is_failure(retranslated):
                        results[i] = retranslated
                except Exception as e:
                    logger.debug(f"Retry failed: {str(e)[:50]}")
                    continue

        total_processed = sum(1 for r in results if r)
        # Count by the explicit failure markers; a translation that merely
        # starts with "[" is still a success.
        successful = sum(1 for r in results if r and not self._is_failure(r))
        success_rate = (successful / total_processed * 100) if total_processed > 0 else 0

        logger.info(f"✅ {column_name}: {successful}/{total_processed} ({success_rate:.0f}%)")

        return results

    def _load_split(self, dataset_name: str, split: str):
        """Load one dataset split, trying trust_remote_code=True, then False."""
        for trust_remote in (True, False):
            try:
                return load_dataset(dataset_name, split=split, trust_remote_code=trust_remote)
            except Exception as e:
                # Errors mentioning "raise_Exception" are skipped without a log line.
                if "raise_Exception" not in str(e):
                    logger.warning(f"Load attempt failed: {str(e)[:50]}")
        return None

    def _to_dataframe(self, dataset, size: int) -> pd.DataFrame:
        """Convert the first *size* rows of *dataset* to a DataFrame."""
        try:
            subset = dataset if size == len(dataset) else dataset.select(range(size))
            return subset.to_pandas()
        except Exception as e:
            # Fall back to a column-by-column copy; unreadable columns are
            # filled with empty strings.
            logger.info("Using alternative conversion...")
            logger.debug(f"to_pandas failed: {str(e)[:50]}")
            sample_data = dataset.select(range(size))
            data_dict = {}
            for key in sample_data.features.keys():
                try:
                    data_dict[key] = [example[key] for example in sample_data]
                except Exception:
                    data_dict[key] = [""] * size
            return pd.DataFrame(data_dict)

    def _resolve_columns(self, targets: Optional[List[str]], available: List[str]) -> List[str]:
        """Map requested column names to distinct existing columns, in order."""
        resolved = []
        for target_col in targets or []:
            actual_col = self._match_column(target_col, available)
            if actual_col is None:
                logger.warning(f"⚠️ No column matching '{target_col}'")
                continue
            if actual_col != target_col:
                logger.info(f"Mapped '{target_col}' → '{actual_col}'")
            # Two targets may resolve to the same column; translate it once.
            if actual_col not in resolved:
                resolved.append(actual_col)
        return resolved

    def _translate_column(self, df: pd.DataFrame, actual_col: str) -> None:
        """Translate *actual_col* and store the result in ``th_<actual_col>``."""
        logger.info(f"🔄 Translating '{actual_col}'...")

        text_values = df[actual_col].fillna("").astype(str).tolist()
        non_empty_count = sum(
            1 for val in text_values if val and str(val).strip() not in ['', 'nan', 'None']
        )
        if non_empty_count == 0:
            logger.warning(f"⚠️ No valid text in '{actual_col}'")
            return

        translated_values = self.translate_batch(text_values, actual_col)
        translated_col_name = f'th_{actual_col}'
        df[translated_col_name] = translated_values

        successful_translations = sum(
            1 for val in translated_values if val and not self._is_failure(val)
        )
        logger.info(f"✅ Added '{translated_col_name}' with {successful_translations} translations")

    def process_dataset(self, dataset_name: str, split: str = 'train', sample_size: int = 450,
                        columns_to_translate: Optional[List[str]] = None) -> Optional[pd.DataFrame]:
        """Load a dataset sample and add Thai translations of the chosen columns.

        Args:
            dataset_name: Hugging Face dataset identifier.
            split: dataset split to load.
            sample_size: maximum number of leading rows to keep.
            columns_to_translate: requested column names; names that do not
                exist are matched to similar existing columns where possible.
                ``None`` or empty means no column is translated.

        Returns:
            The sampled DataFrame with added ``th_<column>`` columns, or
            ``None`` if loading or processing failed.
        """
        logger.info(f"📊 Processing: {dataset_name}")

        try:
            dataset = self._load_split(dataset_name, split)
            if dataset is None:
                logger.error(f"❌ Failed to load {dataset_name}")
                return None

            actual_size = min(sample_size, len(dataset))
            df = self._to_dataframe(dataset, actual_size)

            logger.info(f"✅ Loaded {len(df)} samples")
            logger.info(f"Columns: {list(df.columns)}")

            # Resolve against the column list (not a set) so that matching is
            # deterministic from run to run.
            for actual_col in self._resolve_columns(columns_to_translate, list(df.columns)):
                self._translate_column(df, actual_col)

            return df

        except Exception as e:
            logger.error(f"❌ Error processing {dataset_name}: {str(e)[:100]}")
            return None

    def save_dataset(self, df: pd.DataFrame, filename: str, drive_path: str) -> bool:
        """Write *df* to ``drive_path/filename`` as a UTF-8 (with BOM) CSV.

        Returns:
            True if the file exists after writing, False for an empty/missing
            DataFrame or any error while saving.
        """
        if df is None or df.empty:
            logger.error("❌ Cannot save empty dataset")
            return False

        try:
            os.makedirs(drive_path, exist_ok=True)
            file_path = os.path.join(drive_path, filename)

            df.to_csv(file_path, index=False, encoding='utf-8-sig', escapechar='\\')

            if os.path.exists(file_path):
                file_size = os.path.getsize(file_path)
                logger.info(f"💾 Saved {len(df)} samples ({file_size:,} bytes)")
                return True

            logger.error(f"❌ File not created: {filename}")
            return False

        except Exception as e:
            logger.error(f"❌ Save error: {str(e)[:50]}")
            return False

def main():
    """Translate each configured dataset and save the result as a CSV on Drive.

    Every dataset is processed independently: a failure is logged and
    recorded, and the run continues with the next dataset. A summary is
    logged at the end. Errors outside the per-dataset handling are logged
    and re-raised.
    """
    logger.info("🚀 Thai Medical Dataset Translation v2.1 - T4 Optimized")
    logger.info("="*60)

    try:
        pipeline = DatasetTranslator(max_workers=2)
        drive_path = "/content/drive/MyDrive/V89Technology/thai_medicalCare_dataset450"

        # One entry per dataset: source name, columns to translate, output file.
        datasets_config = [
            {"name": "Amod/mental_health_counseling_conversations",
             "columns": ['Context', 'Response'],
             "filename": "mental_health_thai_450.csv"},
            {"name": "lavita/ChatDoctor-HealthCareMagic-100k",
             "columns": ['instruction', 'input', 'output'],
             "filename": "healthcare_thai_450.csv"},
            {"name": "medalpaca/medical_meadow_pubmed_causal",
             "columns": ['input', 'output', 'instruction'],
             "filename": "pubmed_thai_450.csv"},
            {"name": "medalpaca/medical_meadow_mediqa",
             "columns": ['instruction', 'input', 'output'],
             "filename": "medical_qa_thai_450.csv"},
        ]
        total = len(datasets_config)

        ok_count = 0
        failures = []
        started_at = time.time()

        for position, cfg in enumerate(datasets_config, start=1):
            name = cfg["name"]
            logger.info(f"\n[{position}/{total}] {name}")
            logger.info("-"*50)

            try:
                df = pipeline.process_dataset(
                    name,
                    sample_size=450,
                    columns_to_translate=cfg["columns"],
                )

                if df is None or df.empty:
                    failures.append(name)
                    logger.error(f"❌ PROCESS FAILED: {name}")
                elif pipeline.save_dataset(df, cfg["filename"], drive_path):
                    ok_count += 1
                    logger.info(f"✅ SUCCESS: {name}")
                else:
                    failures.append(name)
                    logger.error(f"❌ SAVE FAILED: {name}")

            except Exception as e:
                failures.append(name)
                logger.error(f"❌ ERROR: {name}: {str(e)[:50]}")

            # Pause between datasets, but not after the last one.
            if position < total:
                time.sleep(10)

        # Final summary
        elapsed = time.time() - started_at
        success_rate = ok_count / total * 100

        logger.info(f"\n{'='*60}")
        logger.info("🏁 TRANSLATION COMPLETE")
        logger.info(f"⏱️ Total time: {elapsed/60:.1f} minutes")
        logger.info(f"✅ Success: {ok_count}/{total} ({success_rate:.0f}%)")

        if failures:
            logger.info(f"❌ Failed: {', '.join(failures)}")

        logger.info(f"📁 Output: {drive_path}")
        logger.info("="*60)

    except Exception as e:
        logger.error(f"❌ Fatal error: {str(e)[:100]}")
        raise

# Run the translation pipeline only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd

# Load the translated PubMed CSV from Google Drive and print a summary of
# its columns, non-null counts and dtypes.
drive_path = "/content/drive/MyDrive/V89Technology/thai_medicalCare_dataset450/pubmed_thai_450.csv"
df = pd.read_csv(drive_path)
df.info()

In [None]:
# Bare expression: in a notebook cell this displays the PubMed DataFrame.
df

In [None]:
# Load the translated mental-health counseling CSV and summarise its columns.
df2 = pd.read_csv ("/content/drive/MyDrive/V89Technology/thai_medicalCare_dataset450/mental_health_thai_450.csv")
df2.info()

In [None]:
# Load the translated medical QA CSV and summarise its columns.
df3 = pd.read_csv ("/content/drive/MyDrive/V89Technology/thai_medicalCare_dataset450/medical_qa_thai_450.csv")
df3.info()

In [None]:
# Load the translated healthcare (ChatDoctor) CSV and summarise its columns.
df4 =  pd.read_csv ("/content/drive/MyDrive/V89Technology/thai_medicalCare_dataset450/healthcare_thai_450.csv")
df4.info()

In [None]:
# Bare expression: in a notebook cell this displays the healthcare DataFrame.
df4

In [None]:
# List the column names to confirm which th_* translation columns exist.
print(df4.columns.tolist())


In [None]:
# Drop, in place, every row whose 'th_input' value is missing (NaN),
# then re-print the summary to check the remaining row count.
df4.dropna(subset=['th_input'], inplace=True)
df4.info()

In [None]:
# Save the filtered healthcare DataFrame (df4) to a new CSV file.
# NOTE: "[FAILED]: " / "[ERROR]: " placeholder values written by the translator
# are NOT removed or rewritten here; filter them separately if they must not
# appear in the output.
output_path = "/content/drive/MyDrive/V89Technology/thai_medicalCare_dataset450/healthcare_thai_449_clean.csv"
# Use the same 'utf-8-sig' encoding as DatasetTranslator.save_dataset so all
# generated CSVs are encoded consistently (the BOM helps spreadsheet tools
# detect UTF-8 for the Thai text).
df4.to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"✅ Cleaned CSV saved at: {output_path}")
