# In [None]:
import os
import re
import time
import emoji
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall
from collections import Counter
from datetime import datetime
import csv
import gc

# Transformers kütüphanesini import et
from transformers import AutoTokenizer, TFAutoModel
import warnings
warnings.filterwarnings('ignore')

# GPU kontrol
def check_gpu():
    """Report the GPUs TensorFlow can see and enable memory growth on each.

    Returns:
        bool: True when at least one GPU device is listed, False otherwise.
    """
    detected = tf.config.experimental.list_physical_devices('GPU')
    if not detected:
        print("GPU bulunamadı, işlemler CPU üzerinde gerçekleştirilecek")
        return False

    print(f"{len(detected)} GPU bulundu:")
    for device in detected:
        print(f"  {device}")

    # Separate pass so the full device listing is printed before any
    # memory-growth messages.
    for device in detected:
        try:
            tf.config.experimental.set_memory_growth(device, True)
        except RuntimeError as err:
            print(f"GPU bellek büyümesi {device} için ayarlanamadı: {err}")
        else:
            print(f"GPU bellek büyümesi {device} için etkinleştirildi")
    return True

# F1 metriği
class F1Score(tf.keras.metrics.Metric):
    """Macro-averaged F1 score for single-label, one-hot encoded targets.

    Both ``y_true`` and ``y_pred`` are reduced with ``argmax`` over axis 1,
    then per-class true-positive / false-positive / false-negative counts are
    accumulated. ``result()`` returns the unweighted mean of the per-class F1
    values; a class with no true or predicted samples contributes 0.

    Feeding the argmax class ids into binary precision/recall metrics (the
    previous approach) does not yield a multi-class F1, which is why the
    counts are kept per class here.

    Args:
        name: Metric name; it is also the key used in training logs.
        num_classes: Number of classes. When None it is inferred from the
            last dimension of ``y_pred`` on the first ``update_state`` call.
        **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, name='f1_score', num_classes=None, **kwargs):
        super(F1Score, self).__init__(name=name, **kwargs)
        self.num_classes = num_classes
        self._tp = None
        self._fp = None
        self._fn = None
        if num_classes is not None:
            self._create_counters(num_classes)

    def _create_counters(self, num_classes):
        """Allocate one accumulator slot per class for TP, FP and FN."""
        self.num_classes = int(num_classes)
        shape = (self.num_classes,)
        self._tp = self.add_weight(name='true_positives', shape=shape, initializer='zeros')
        self._fp = self.add_weight(name='false_positives', shape=shape, initializer='zeros')
        self._fn = self.add_weight(name='false_negatives', shape=shape, initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate per-class counts for one batch.

        Args:
            y_true: One-hot targets, indexed as (batch, classes).
            y_pred: Class scores/probabilities with the same layout.
            sample_weight: Optional per-sample weights; reshaped to a column
                and applied to every count.

        Raises:
            ValueError: If ``num_classes`` was not given and the last
                dimension of ``y_pred`` is not statically known.
        """
        if self._tp is None:
            inferred = y_pred.shape[-1]
            if inferred is None:
                raise ValueError(
                    "F1Score could not infer the number of classes from y_pred; "
                    "pass num_classes explicitly."
                )
            self._create_counters(inferred)

        true_onehot = tf.cast(
            tf.one_hot(tf.argmax(y_true, axis=1), self.num_classes), self.dtype)
        pred_onehot = tf.cast(
            tf.one_hot(tf.argmax(y_pred, axis=1), self.num_classes), self.dtype)

        # 1 where the predicted class equals the true class, 0 elsewhere.
        hits = true_onehot * pred_onehot
        false_pos = pred_onehot - hits
        false_neg = true_onehot - hits

        if sample_weight is not None:
            weights = tf.reshape(tf.cast(sample_weight, self.dtype), (-1, 1))
            hits = hits * weights
            false_pos = false_pos * weights
            false_neg = false_neg * weights

        self._tp.assign_add(tf.reduce_sum(hits, axis=0))
        self._fp.assign_add(tf.reduce_sum(false_pos, axis=0))
        self._fn.assign_add(tf.reduce_sum(false_neg, axis=0))

    def reset_state(self):
        """Zero all accumulated counts (no-op before the counters exist)."""
        for counter in (self._tp, self._fp, self._fn):
            if counter is not None:
                counter.assign(tf.zeros((self.num_classes,), dtype=self.dtype))

    def result(self):
        """Return the macro-averaged F1 over all classes as a scalar."""
        if self._tp is None:
            return tf.constant(0.0)
        # F1 = 2TP / (2TP + FP + FN); divide_no_nan maps 0/0 to 0.
        per_class_f1 = tf.math.divide_no_nan(
            2.0 * self._tp, 2.0 * self._tp + self._fp + self._fn)
        return tf.reduce_mean(per_class_f1)

    def get_config(self):
        """Include ``num_classes`` so the metric can be re-created from config."""
        config = super(F1Score, self).get_config()
        config['num_classes'] = self.num_classes
        return config

# Türkçe stopword temizliği
def load_turkish_stopwords(stopwords_path):
    """Read whitespace-separated stopwords from a UTF-8 text file.

    Blank lines and lines whose first non-blank character is ``#`` are
    skipped; every other line may hold several words.

    Args:
        stopwords_path: Path of the stopword file.

    Returns:
        list: The words in file order. If opening or reading fails, the error
        is printed and whatever was collected so far is returned.
    """
    collected = []
    try:
        with open(stopwords_path, "r", encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if not stripped or stripped.startswith("#"):
                    continue
                collected.extend(raw_line.split())
    except Exception as e:
        # Best effort: a missing or unreadable file must not stop the pipeline.
        print(f"Stopword dosyası yüklenirken hata: {e}")
    return collected

# Metin temizleme fonksiyonu
def clean_turkish_text(text, stoplist=None):
    """Normalize a Turkish text for modelling.

    Steps, in order: Turkish-aware lowercasing, newline -> space, removal of
    URLs (``http...`` / ``www...``), e-mail-like tokens, emoji, HTML-style
    tags and digits, replacement of every remaining non-word character with a
    space, optional stopword removal, and whitespace collapsing.

    Args:
        text: Input string.
        stoplist: Optional collection of lowercase stopwords to drop. Any
            iterable of strings works; it is converted to a set once per call
            so each word is checked in constant time.

    Returns:
        str: The cleaned text with single spaces between tokens.
    """
    # str.lower() is locale-independent: it turns 'I' into 'i' and 'İ' into
    # 'i' + a combining dot, which the punctuation step below would then
    # split off ("İstanbul" -> "i stanbul"). Map the two Turkish capitals
    # explicitly before lowercasing.
    text = text.replace('İ', 'i').replace('I', 'ı').lower()
    text = text.replace("\n", " ")
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = emoji.replace_emoji(text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)

    if stoplist:
        if isinstance(stoplist, (set, frozenset)):
            stop_set = stoplist
        else:
            stop_set = set(stoplist)
        text = ' '.join(word for word in text.split() if word not in stop_set)

    # Collapse any run of whitespace left behind by the substitutions.
    return ' '.join(text.split())

# Klasörleri oluştur
def create_directories():
    """Create the output folder tree (models, history, images) if missing."""
    for folder in ("outputs", "outputs/models", "outputs/history", "outputs/images"):
        os.makedirs(folder, exist_ok=True)
    print("Dizin yapısı hazırlandı")

# Veri setini yükleme ve ön işleme
def load_and_preprocess_data(data_path, text_column="Haber Gövdesi Cleaned",
                             label_column="Sınıf", stopwords_path=None, max_length=256):
    """Load the CSV dataset, encode labels and build a stratified split.

    Args:
        data_path: Path of the CSV file.
        text_column: Name of the column holding the texts.
        label_column: Name of the column holding the class labels.
        stopwords_path: Optional stopword file. It is loaded, but text
            cleaning is currently disabled in this function, so the list is
            not applied.
        max_length: Accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(X_train, X_test, y_train_onehot, y_test_onehot,
        class_weights, label_encoder, num_classes)`` where the X values are
        pandas Series with a fresh 0..n-1 index, the y values are one-hot
        arrays, and ``class_weights`` maps each class index to
        ``sqrt(total / (num_classes * count))`` computed on the training part.

    Raises:
        ValueError: If the file cannot be read or the columns are missing
            (the original exception is chained as the cause).
    """
    print(f"Veri seti yükleniyor: {data_path}")

    try:
        # Read exactly the requested columns so custom column names work.
        df = pd.read_csv(data_path, usecols=[text_column, label_column], encoding='utf-8')
        df = df[[text_column, label_column]].copy()
        df.columns = ['text', 'label']
    except Exception as e:
        raise ValueError(f"Veri yüklenirken hata oluştu: {str(e)}") from e

    stoplist = load_turkish_stopwords(stopwords_path) if stopwords_path else None

    print("Metinler temizleniyor...")
    # NOTE(review): cleaning is switched off here, so `stoplist` is unused;
    # presumably the input column is already cleaned -- confirm before re-enabling.
    #df['text'] = df['text'].apply(lambda x: clean_turkish_text(x, stoplist))

    # Drop rows without text BEFORE encoding so texts and labels stay aligned
    # by construction and the class list only reflects rows that are kept.
    nan_count = df['text'].isna().sum()
    df = df.dropna(subset=['text']).reset_index(drop=True)

    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(df['label'])
    num_classes = len(label_encoder.classes_)

    print(f"Sınıf sayısı: {num_classes}")
    print(f"Sınıflar: {label_encoder.classes_}")

    X = df['text']
    print(f'NaN sayısı: {nan_count}')

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=0.1, random_state=42, stratify=y_encoded
    )

    X_train = X_train.reset_index(drop=True)
    X_test = X_test.reset_index(drop=True)

    # One-hot encoding
    y_train_onehot = tf.keras.utils.to_categorical(y_train, num_classes)
    y_test_onehot = tf.keras.utils.to_categorical(y_test, num_classes)

    # Square-root-damped inverse-frequency class weights.
    class_counts = np.bincount(y_train)
    total = len(y_train)
    class_weights = {
        class_id: np.sqrt(total / (num_classes * count))
        for class_id, count in enumerate(class_counts)
    }

    print(f"Sınıf ağırlıkları: {class_weights}")
    print(f"Eğitim seti: {len(X_train)}, Test seti: {len(X_test)}")

    return X_train, X_test, y_train_onehot, y_test_onehot, class_weights, label_encoder, num_classes

# BERT tokenizasyonu
def tokenize_texts(texts, tokenizer, max_length=256):
    """Tokenize a batch of texts into fixed-width id and mask tensors.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    Padding to the longest sample instead would make the output width depend
    on the data, which breaks models that declare a fixed input length.

    Args:
        texts: Texts to encode; a pandas Series, NumPy array, list or any
            other iterable of strings.
        tokenizer: Callable tokenizer that accepts the keyword arguments
            passed below and returns a mapping with ``'input_ids'`` and
            ``'attention_mask'``.
        max_length: Target sequence length.

    Returns:
        tuple: ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    encodings = tokenizer(
        text_list,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='tf'
    )
    return encodings['input_ids'], encodings['attention_mask']

# YENİ YAKLAŞIM: Custom BERT Model Sınıfı
class BertClassifier(tf.keras.Model):
    """Frozen pretrained BERT encoder followed by a small dense classifier.

    The model accepts either an already tokenized ``(input_ids,
    attention_mask)`` pair or raw texts, which it tokenizes with its own
    tokenizer. Raw texts can only be tokenized eagerly, because the tokenizer
    is plain Python code.

    Args:
        bert_model_name: Name or path handed to ``from_pretrained`` for both
            the tokenizer and the encoder.
        num_classes: Width of the softmax output layer.
        max_length: Truncation length used when this model tokenizes texts.
    """

    def __init__(self, bert_model_name, num_classes, max_length=256):
        super(BertClassifier, self).__init__()
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
        self.bert = TFAutoModel.from_pretrained(bert_model_name)
        # Encoder weights are frozen; only the head below is trained.
        # Set this to True to fine-tune the encoder as well.
        self.bert.trainable = False

        # Classification head
        self.dropout1 = tf.keras.layers.Dropout(0.3)
        self.dense1 = tf.keras.layers.Dense(256, activation='relu')
        self.dropout2 = tf.keras.layers.Dropout(0.2)
        self.dense2 = tf.keras.layers.Dense(128, activation='relu')
        self.dropout3 = tf.keras.layers.Dropout(0.2)
        self.classifier = tf.keras.layers.Dense(num_classes, activation='softmax')

    @staticmethod
    def _is_tokenized_pair(inputs):
        """Return True for a 2-item list/tuple that is not a pair of raw texts."""
        if not isinstance(inputs, (list, tuple)) or len(inputs) != 2:
            return False
        # A batch of exactly two raw strings must not be read as (ids, mask).
        return not any(isinstance(item, (str, bytes)) for item in inputs)

    def _tokenize_raw_texts(self, texts):
        """Tokenize raw texts (str, bytes, sequence or eager string tensor).

        Raises:
            ValueError: If ``texts`` is a tensor without concrete values,
                e.g. inside a ``tf.function``.
        """
        if tf.is_tensor(texts):
            if not hasattr(texts, 'numpy'):
                raise ValueError(
                    "Ham metin girdisi graph modunda tokenize edilemez; "
                    "girdiyi önceden tokenize edip (input_ids, attention_mask) olarak verin."
                )
            texts = texts.numpy()

        if isinstance(texts, (str, bytes)):
            texts = [texts]
        elif hasattr(texts, 'tolist'):
            texts = texts.tolist()

        # String tensors hand back bytes; the tokenizer needs str.
        decoded = [t.decode('utf-8') if isinstance(t, bytes) else t for t in texts]

        encodings = self.tokenizer(
            decoded,
            max_length=self.max_length,
            padding=True,
            truncation=True,
            return_tensors='tf'
        )
        return encodings['input_ids'], encodings['attention_mask']

    def call(self, inputs, training=None):
        """Run the encoder and the classification head.

        Args:
            inputs: ``(input_ids, attention_mask)`` or raw texts.
            training: Standard Keras training flag, forwarded to the dropout
                layers of the head.

        Returns:
            Softmax class probabilities, one row per input.
        """
        if self._is_tokenized_pair(inputs):
            input_ids, attention_mask = inputs
        else:
            input_ids, attention_mask = self._tokenize_raw_texts(inputs)

        # BERT forward pass
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs.pooler_output

        # Classification head
        x = self.dropout1(pooled_output, training=training)
        x = self.dense1(x)
        x = self.dropout2(x, training=training)
        x = self.dense2(x)
        x = self.dropout3(x, training=training)
        return self.classifier(x)

    def predict_text(self, text):
        """Return class probabilities for a single text (inference mode)."""
        input_ids, attention_mask = self._tokenize_raw_texts([text])
        # Go through __call__ (not .call) so Keras builds and tracks the layers.
        return self((input_ids, attention_mask), training=False)

# Model oluşturma - YENİ YAKLAŞIM
def create_bert_model(num_classes, bert_model_name="dbmdz/bert-base-turkish-uncased", max_length=256):
    """Build the subclassed ``BertClassifier`` and return it with its tokenizer.

    Args:
        num_classes: Number of output classes.
        bert_model_name: Pretrained model name or path.
        max_length: Truncation length stored on the model.

    Returns:
        tuple: ``(model, tokenizer)``. The tokenizer is the instance the
        model already loaded, so it is not loaded a second time.
    """
    print(f"Custom BERT modeli oluşturuluyor: {bert_model_name}")

    model = BertClassifier(bert_model_name, num_classes, max_length)

    return model, model.tokenizer

# Alternatif: Functional API ile model
def create_bert_functional_model(num_classes, bert_model_name="dbmdz/bert-base-turkish-uncased", max_length=256):
    """Build the Functional-API variant: frozen BERT plus a dense head.

    Args:
        num_classes: Width of the softmax output layer.
        bert_model_name: Pretrained model name or path.
        max_length: Fixed length of the two integer inputs
            (``input_ids`` and ``attention_mask``).

    Returns:
        tuple: ``(model, tokenizer)``; the model is not compiled.
    """
    print(f"Functional BERT modeli oluşturuluyor: {bert_model_name}")

    tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

    layers = tf.keras.layers
    ids_in = layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    mask_in = layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

    # Pretrained encoder with frozen weights.
    encoder = TFAutoModel.from_pretrained(bert_model_name)
    encoder.trainable = False

    features = encoder(ids_in, attention_mask=mask_in).pooler_output

    # Head layers are created in the order they are applied.
    head = (
        layers.Dropout(0.3),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(num_classes, activation='softmax'),
    )
    for layer in head:
        features = layer(features)

    model = tf.keras.Model(inputs=[ids_in, mask_in], outputs=features)

    return model, tokenizer

# Model eğitim fonksiyonu
def _train_with_custom_loop(model, X_train, y_train, X_test, y_test, tokenizer,
                            checkpoint_path, max_length, batch_size, epochs):
    """Train a subclassed model with a manual GradientTape loop.

    Texts are tokenized once up front so the compiled train/test steps only
    receive tensors; the Python tokenizer cannot run inside ``tf.function``.
    """
    print("Custom model eğitimi için hazırlanıyor...")

    train_ids, train_mask = tokenize_texts(X_train, tokenizer, max_length)
    test_ids, test_mask = tokenize_texts(X_test, tokenizer, max_length)

    train_dataset = tf.data.Dataset.from_tensor_slices(((train_ids, train_mask), y_train))
    train_dataset = train_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    test_dataset = tf.data.Dataset.from_tensor_slices(((test_ids, test_mask), y_test))
    test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    # Optimizer and loss
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    loss_fn = tf.keras.losses.CategoricalCrossentropy()

    # Metrics; the loss metrics hold the mean of the per-batch losses.
    train_acc_metric = CategoricalAccuracy()
    val_acc_metric = CategoricalAccuracy()
    train_loss_metric = tf.keras.metrics.Mean()
    val_loss_metric = tf.keras.metrics.Mean()

    @tf.function
    def train_step(x_batch, y_batch):
        with tf.GradientTape() as tape:
            predictions = model(x_batch, training=True)
            loss = loss_fn(y_batch, predictions)
        gradients = tape.gradient(loss, model.trainable_weights)
        optimizer.apply_gradients(zip(gradients, model.trainable_weights))
        train_acc_metric.update_state(y_batch, predictions)
        train_loss_metric.update_state(loss)
        return loss

    @tf.function
    def test_step(x_batch, y_batch):
        predictions = model(x_batch, training=False)
        val_acc_metric.update_state(y_batch, predictions)
        val_loss_metric.update_state(loss_fn(y_batch, predictions))

    history = {'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': []}

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")

        # Training
        for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
            loss = train_step(x_batch_train, y_batch_train)
            if step % 100 == 0:
                print(f"Step {step}, Loss: {loss:.4f}")

        train_acc = train_acc_metric.result()
        print(f"Training acc: {train_acc:.4f}")

        # Validation
        for x_batch_val, y_batch_val in test_dataset:
            test_step(x_batch_val, y_batch_val)

        val_acc = val_acc_metric.result()
        print(f"Validation acc: {val_acc:.4f}")

        # Save history
        history['loss'].append(float(train_loss_metric.result()))
        history['accuracy'].append(float(train_acc))
        history['val_loss'].append(float(val_loss_metric.result()))
        history['val_accuracy'].append(float(val_acc))

        # Reset metrics for the next epoch
        for metric in (train_acc_metric, val_acc_metric, train_loss_metric, val_loss_metric):
            metric.reset_state()

    # Only the weights are stored for the subclassed model.
    model.save_weights(checkpoint_path.replace('.keras', '_weights.h5'))

    return history, {}, None, None, None, None


def _train_with_keras_fit(model, X_train, y_train, X_test, y_test, tokenizer, class_weights,
                          model_name, model_filename, checkpoint_path, max_length,
                          batch_size, epochs):
    """Train a compiled Keras model with ``fit``, then evaluate and report."""
    print("Standart model eğitimi başlıyor...")

    print("Eğitim metinleri tokenize ediliyor...")
    X_train_ids, X_train_mask = tokenize_texts(X_train, tokenizer, max_length)

    print("Test metinleri tokenize ediliyor...")
    X_test_ids, X_test_mask = tokenize_texts(X_test, tokenizer, max_length)

    checkpoint = ModelCheckpoint(
        checkpoint_path,
        monitor='val_f1_score',
        verbose=1,
        save_best_only=True,
        mode='max'
    )

    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=3,
        verbose=1,
        restore_best_weights=True
    )

    print(f"Model eğitimi başlıyor: {model_name}")

    fit_started = time.time()
    history = model.fit(
        [X_train_ids, X_train_mask], y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=([X_test_ids, X_test_mask], y_test),
        class_weight=class_weights,
        callbacks=[checkpoint, early_stop],
        verbose=1
    )
    model_train_time = time.time() - fit_started

    history_df = pd.DataFrame(history.history)
    history_path = f"outputs/history/{model_filename}_history.csv"
    history_df.to_csv(history_path, index=False)
    print(f"Eğitim geçmişi kaydedildi: {history_path}")

    # Evaluation
    print("Model değerlendiriliyor...")
    eval_started = time.time()
    evaluation = model.evaluate([X_test_ids, X_test_mask], y_test, verbose=1)
    model_test_time = time.time() - eval_started

    evaluation_results = dict(zip(model.metrics_names, evaluation))
    evaluation_results["Train Time"] = model_train_time
    evaluation_results["Test Time"] = model_test_time

    print(f"Değerlendirme sonuçları: {evaluation_results}")

    visualize_metrics(history, model_filename)
    create_confusion_matrix(model, X_test_ids, X_test_mask, y_test, model_filename)

    save_results_to_csv(model_name, evaluation_results)

    return history, evaluation_results, X_train_ids, X_train_mask, X_test_ids, X_test_mask


def train_model(model, X_train, y_train, X_test, y_test, tokenizer, class_weights,
                model_name, max_length=256, use_custom_model=False,
                batch_size=16, epochs=10):
    """Train the model, evaluate it and persist the artefacts.

    Args:
        model: Subclassed model (custom loop) or compiled Keras model (fit).
        X_train, X_test: Raw texts for training and for validation/testing.
        y_train, y_test: One-hot encoded targets.
        tokenizer: Tokenizer used to encode the texts.
        class_weights: Per-class weights handed to ``model.fit``.
            NOTE(review): the custom loop does not apply them.
        model_name: Prefix for every output file and the results row.
        max_length: Token length passed to ``tokenize_texts``.
        use_custom_model: True selects the manual loop, False ``model.fit``.
        batch_size: Mini-batch size for both paths.
        epochs: Maximum number of epochs for both paths.

    Returns:
        tuple: ``(history, evaluation_results, X_train_ids, X_train_mask,
        X_test_ids, X_test_mask)``. For the custom loop ``history`` is a dict
        of per-epoch lists, ``evaluation_results`` is empty and the four
        tensor slots are None.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_filename = f"{model_name}_{timestamp}"
    checkpoint_path = f"outputs/models/{model_filename}.keras"

    if use_custom_model:
        return _train_with_custom_loop(
            model, X_train, y_train, X_test, y_test, tokenizer,
            checkpoint_path, max_length, batch_size, epochs
        )

    return _train_with_keras_fit(
        model, X_train, y_train, X_test, y_test, tokenizer, class_weights,
        model_name, model_filename, checkpoint_path, max_length, batch_size, epochs
    )

# Diğer fonksiyonlar aynı kalacak...
def visualize_metrics(history, model_name):
    """Plot loss and accuracy curves and save each as a PNG.

    Args:
        history: Either a plain dict of per-epoch lists (custom loop) or an
            object exposing such a dict as ``.history`` (Keras History).
        model_name: Prefix of the image file names under ``outputs/images/``.
    """
    metrics_dict = history if isinstance(history, dict) else history.history

    print("Metrikler görselleştiriliyor...")

    for metric, title in (('loss', 'Model Loss'), ('accuracy', 'Accuracy')):
        if metric not in metrics_dict:
            continue

        plt.figure(figsize=(10, 6))
        plt.plot(metrics_dict[metric], label=f'Train {title}')

        # Overlay the validation curve when it was recorded.
        val_key = f'val_{metric}'
        if val_key in metrics_dict:
            plt.plot(metrics_dict[val_key], label=f'Validation {title}')

        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(title)
        plt.legend()
        plt.grid(True)
        plt.savefig(f"outputs/images/{model_name}_{metric}.png")
        plt.close()

    print(f"Metrik görselleştirmeleri 'outputs/images/' dizinine kaydedildi")

def create_confusion_matrix(model, X_test_ids, X_test_mask, y_test, model_name):
    """Predict on the test data and save the confusion matrix as a heatmap.

    Args:
        model: Model whose ``predict`` accepts ``[input_ids, attention_mask]``.
        X_test_ids: Token ids of the test texts.
        X_test_mask: Attention masks of the test texts.
        y_test: One-hot encoded true labels, indexed as (samples, classes).
        model_name: Prefix of the image file name under ``outputs/images/``.
    """
    print("Confusion Matrix oluşturuluyor...")

    y_pred = model.predict([X_test_ids, X_test_mask])
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    # Fix the matrix size from the one-hot width; otherwise it is inferred
    # from the largest label seen and shrinks when the last class is absent.
    num_classes = np.shape(y_test)[1]
    cm = tf.math.confusion_matrix(
        y_test_classes, y_pred_classes, num_classes=num_classes
    ).numpy()

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('Gerçek Değer')
    plt.xlabel('Tahmin')
    plt.savefig(f"outputs/images/{model_name}_confusion_matrix.png", bbox_inches='tight')
    plt.close()

    print(f"Confusion Matrix 'outputs/images/{model_name}_confusion_matrix.png' olarak kaydedildi")

def save_results_to_csv(model_name, eval_results):
    """Append one row of evaluation results to ``outputs/results.csv``.

    The output directory is created when missing, and the header row is
    written whenever the file does not exist yet or is still empty.

    Args:
        model_name: Value of the ``model_name`` column.
        eval_results: Mapping of metric name to value; each key becomes a
            column after ``model_name`` and ``timestamp``.
    """
    results_path = "outputs/results.csv"

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    result_data = {
        'model_name': model_name,
        'timestamp': timestamp,
        **eval_results
    }

    # The function may be called before the output tree has been created.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    needs_header = (
        not os.path.isfile(results_path) or os.path.getsize(results_path) == 0
    )

    with open(results_path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(result_data))

        if needs_header:
            writer.writeheader()

        writer.writerow(result_data)

    print(f"Sonuçlar '{results_path}' dosyasına kaydedildi")

# Ana fonksiyon
def train_news_classification_model(data_path, stopwords_path=None, model_name="Turkish_BERT_Classifier",
                                    use_functional=True,
                                    bert_model_name="dbmdz/bert-base-turkish-uncased",
                                    max_length=256):
    """Run the full BERT-based multi-class news classification pipeline.

    Creates the output folders, reports GPU availability, loads and splits
    the data, builds the selected model variant, trains it and prints a
    summary.

    Args:
        data_path: Path of the CSV dataset.
        stopwords_path: Optional stopword file passed to the data loader.
        model_name: Prefix for saved artefacts and the results row.
        use_functional: True builds and compiles the Functional-API model;
            False uses the subclassed model with the manual training loop.
        bert_model_name: Pretrained model name or path.
        max_length: Token length used for the model inputs and tokenization.

    Returns:
        tuple: ``(model, history, eval_results, label_encoder, tokenizer)``.
    """
    create_directories()
    check_gpu()

    # Load and preprocess the data
    X_train, X_test, y_train, y_test, class_weights, label_encoder, num_classes = \
        load_and_preprocess_data(data_path, stopwords_path=stopwords_path, max_length=max_length)

    use_custom_model = not use_functional

    if use_functional:
        model, tokenizer = create_bert_functional_model(
            num_classes=num_classes,
            bert_model_name=bert_model_name,
            max_length=max_length
        )
        print("Functional API modeli kullanılıyor")

        # Only the functional model goes through compile/fit.
        optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

        metrics = [
            CategoricalAccuracy(name='accuracy'),
            Precision(name='precision'),
            Recall(name='recall'),
            F1Score(name='f1_score')
        ]

        model.compile(
            optimizer=optimizer,
            loss='categorical_crossentropy',
            metrics=metrics
        )

        # summary() prints by itself and returns None, so it is not wrapped
        # in print().
        model.summary()
    else:
        model, tokenizer = create_bert_model(
            num_classes=num_classes,
            bert_model_name=bert_model_name,
            max_length=max_length
        )
        print("Custom model kullanılıyor")

    # Train; the tokenized tensors in the result are not needed here.
    history, eval_results, *_ = train_model(
        model, X_train, y_train, X_test, y_test, tokenizer,
        class_weights, model_name, max_length=max_length,
        use_custom_model=use_custom_model
    )

    # Free memory held by Keras/TensorFlow and Python
    tf.keras.backend.clear_session()
    gc.collect()

    print(f"\n{'='*50}")
    print(f"Model Eğitimi Tamamlandı!")
    print(f"Model Adı: {model_name}")
    print(f"Sınıf Sayısı: {num_classes}")
    # Labels may be non-string (e.g. integers), so convert before joining.
    print(f"Sınıflar: {', '.join(map(str, label_encoder.classes_))}")
    print(f"{'='*50}\n")

    return model, history, eval_results, label_encoder, tokenizer

# Örnek kullanım - Functional API ile çalıştır
if __name__ == "__main__":
    try:
        # Example run: train with the Functional-API model variant.
        model, history, eval_results, label_encoder, tokenizer = train_news_classification_model(
            data_path="/mnt/d/work2/turkish-news-classification/data/cleaned.csv",
            stopwords_path='./assets/stop-words.txt',
            model_name="BERT",
            use_functional=True,
        )

        print("Model başarıyla eğitildi!")

    except Exception as e:
        # Top-level boundary: report the failure together with its traceback.
        import traceback
        print(f"Hata oluştu: {str(e)}")
        print(traceback.format_exc())