In [18]:
import os, json,logging
from typing import Tuple, List, Dict, Any

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Logging ayarı
LOG_FMT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=LOG_FMT)
RND_SEED = 42

In [19]:
class beforeEDA:
    @staticmethod
    def ensureDir(path: str):
        """Klasör yoksa oluştur."""
        if not path:
            return
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)

    @staticmethod
    def saveJson(obj: Any, path: str):
        """JSON dosyası kaydet."""
        dirn = os.path.dirname(path) or "."
        beforeEDA.ensureDir(dirn)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(obj, f, ensure_ascii=False, indent=2,default=str)

    @staticmethod
    def readInput(inputPath: str, sheetName=0) -> pd.DataFrame:
        """Excel/CSV oku ve DataFrame döndür."""
        ext = os.path.splitext(inputPath)[1].lower()
        if ext in [".xls", ".xlsx"]:
            logging.info("Excel okundu: %s (sheet=%s)", inputPath, sheetName)
            df = pd.read_excel(inputPath, sheet_name=sheetName)
        elif ext == '.csv':
            logging.info("CSV okundu: %s", inputPath)
            df = pd.read_csv(inputPath)
        else:
            raise ValueError(f"Unsupported file extension: {ext}")
        return df

    @staticmethod
    def excelToCsv(excel_path: str, csv_path: str, sheet_name=0, encoding="utf-8-sig"):
        """Excel'i CSV'ye çevir."""
        df = pd.read_excel(excel_path, sheet_name=sheet_name)
        beforeEDA.ensureDir(os.path.dirname(csv_path) or ".")
        df.to_csv(csv_path, index=False, encoding=encoding)
        logging.info("Excel to CSV done: %s -> %s", excel_path, csv_path)
        return csv_path

    @staticmethod
    def saveJsonSummary(csv_path: str, jsonPath: str, sampleValues=5):
        """CSV özetini JSON olarak kaydet."""
        df = pd.read_csv(csv_path)
        summary = {}
        n = len(df)
        for col in df.columns:
            col_data = df[col]
            non_null = col_data.dropna()
            dtype = str(col_data.dtype)
            missing = col_data.isna().sum()
            unique = int(col_data.nunique(dropna=True))
            missing_pct = round(missing / n * 100, 3)
            sample = non_null.unique()[:sampleValues].tolist()
            col_summary = {
                "dtype": dtype,
                "missing": missing,
                "missing_pct": missing_pct,
                "unique": unique,
                "sample": sample
            }
            if pd.api.types.is_numeric_dtype(col_data):
                col_summary.update({
                    "mean": round(non_null.mean(), 3) if not non_null.empty else None,
                    "std": round(non_null.std(), 3) if not non_null.empty else None,
                    "min": round(non_null.min(), 3) if not non_null.empty else None,
                    "25%": round(non_null.quantile(0.25), 3) if not non_null.empty else None,
                    "50%": round(non_null.median(), 3) if not non_null.empty else None,
                    "75%": round(non_null.quantile(0.75), 3) if not non_null.empty else None,
                    "max": round(non_null.max(), 3) if not non_null.empty else None,
                })
            else:
                top_counts = non_null.value_counts().head(10).to_dict()
                col_summary["top_values"] = {str(k): int(v) for k, v in top_counts.items()}
            summary[col] = col_summary
        beforeEDA.saveJson(summary, jsonPath)
        logging.info("CSV özet JSON kaydedildi: %s", jsonPath)
        return df, summary

    @staticmethod
    def parseMultiLabelColumn(series: pd.Series, sep=",") -> List[List[str]]:
        """Multi-label kolonları liste haline getir."""
        def splitAndClean(x):
            if pd.isna(x): return []
            if not isinstance(x, str): x = str(x)
            parts = [p.strip() for p in x.split(sep) if p.strip() != ""]
            return parts
        return series.map(splitAndClean).tolist()


In [20]:
class EdaAnalysis:
    @staticmethod
    def eda_Analysis(df: pd.DataFrame, outDir: str, LastJsonPath: str, topNCat=10):
        beforeEDA.ensureDir(outDir)
        plotsDir = os.path.join(outDir, "Plots")
        beforeEDA.ensureDir(plotsDir)
        logging.info("EDA Başlıyor. ")
        nRows, nCols = df.shape
        numericCols = df.select_dtypes(include=[np.number]).columns.to_list()
        categoricalCols = df.select_dtypes(include=["object","category"]).columns.to_list()

        edaResult = {
            "shape": {"rows": nRows, "cols": nCols},
            "numeric_columns": numericCols,
            "categorical_columns": categoricalCols,
            "missing": df.isna().sum().to_dict(),
            "data_types": df.dtypes.astype(str).to_dict(),
        }

        # Numeric stats
        numStats = {}
        for col in numericCols:
            ser = df[col]
            stats = ser.describe().to_dict()
            stats["skew"] = float(ser.dropna().skew()) if ser.dropna().shape[0] > 2 else None
            numStats[col] = {k: (float(v) if pd.notna(v) else None) for k, v in stats.items()}

            # Histogram
            plt.figure(figsize=(6,4))
            sns.histplot(ser.dropna(), kde=True)
            plt.title(f"{col} Histogram")
            plt.tight_layout()
            plt.savefig(os.path.join(plotsDir, f"{col}_Hist.png"))
            plt.close()

            # Boxplot
            plt.figure(figsize=(6,4))
            sns.boxplot(x=ser.dropna())
            plt.title(f"{col} Boxplot")
            plt.tight_layout()
            plt.savefig(os.path.join(plotsDir, f"{col}_Box.png"))
            plt.close()

        edaResult["Numeric_Stats"] = numStats

        # Korelasyon
        try:
            corr = df[numericCols].corr()
            plt.figure(figsize=(10, 8))
            sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
            plt.title("Numerik Korelasyon Matrix")
            plt.tight_layout()
            plt.savefig(os.path.join(plotsDir, "korelasyonMatrix.png"))
            plt.close()
            edaResult["Korelasyon_Matrix"] = corr.to_dict()
        except Exception as e: 
            logging.warning("Korelasyon hesaplanamadı: %s", e)

        # Target analiz
        target = "TedaviSuresi"
        if target in df.columns:
            tser = df[target]
            edaResult["target"] = {
                "missing": int(tser.isna().sum()),
                "describe": tser.describe().to_dict() if pd.api.types.is_numeric_dtype(tser) else None
            }
            if pd.api.types.is_numeric_dtype(tser):
                plt.figure(figsize=(6,4))
                sns.histplot(tser.dropna(), kde=True)
                plt.title(f"Hedef ({target}) dağılımı")
                plt.tight_layout()
                plt.savefig(os.path.join(plotsDir, f"{target}_Hist.png"))
                plt.close()

        beforeEDA.saveJson(edaResult, LastJsonPath)
        logging.info("EDA Tamamlandı")
        return edaResult, plotsDir

    @staticmethod
    def CleanData(df: pd.DataFrame, drop_dupes_on: List[str]=None, drop_missing_threshold: float=0.5) -> pd.DataFrame:
        dfc = df.copy()
        for col in dfc.select_dtypes(include=["object", "category"]).columns:
            dfc[col] = dfc[col].astype(str).map(lambda x: x.strip() if pd.notna(x) else x)
            dfc.loc[dfc[col].isin(["", "nan", "None", "NoneType"]), col] = np.nan

        # Kolonları eksik veri yüzdesine göre düşür
        colMissingPct = dfc.isna().mean()
        dropCols = colMissingPct[colMissingPct > drop_missing_threshold].index.tolist()
        if dropCols:
            logging.info("Sütunlar siliniyor (yüksek eksik veri): %s", dropCols)
            dfc.drop(columns=dropCols, inplace=True)

        # Duplicates
        if drop_dupes_on:
            before = len(dfc)
            dfc.drop_duplicates(subset=drop_dupes_on, keep="first", inplace=True)
            after = len(dfc)
            logging.info("Duplicate (subset=%s) temizlendi: %d -> %d", drop_dupes_on, before, after)
        else:
            if "HastaNo" in dfc.columns:
                before = len(dfc)
                dfc.drop_duplicates(subset=["HastaNo"], keep="first", inplace=True)
                after = len(dfc)
                logging.info("Duplicate (HastaNo) temizlendi: %d -> %d", before, after)

        # Tip dönüşümleri
        for col in dfc.columns:
            try:
                pd.to_numeric(dfc[col].dropna().head(10))
                dfc[col] = pd.to_numeric(dfc[col], errors='ignore')
            except Exception:
                pass

        # Eksikleri doldur
        for col in dfc.select_dtypes(include=["object"]).columns:
            if dfc[col].isna().sum() > 0:
                try:
                    mode = dfc[col].mode(dropna=True)[0]
                    dfc[col].fillna(mode, inplace=True)
                except Exception:
                    pass
        for col in dfc.select_dtypes(include=[np.number]).columns:
            if dfc[col].isna().sum() > 0:
                med = dfc[col].median()
                dfc[col].fillna(med, inplace=True)

        return dfc



In [21]:
class Feature:
    @staticmethod
    def feature_engineering(df: pd.DataFrame, ml_columns: List[str]=None, top_k: int=20) -> Tuple[pd.DataFrame, Dict[str,List[str]]]:
        df_fe = df.copy()
        mlColFeatures = {}
        if not ml_columns:
            ml_columns = [c for c in df_fe.columns if c in ["KronikHastalik","Alerji"]]
        for col in ml_columns:
            if col not in df_fe.columns:
                continue
            lists = beforeEDA.parseMultiLabelColumn(df_fe[col], sep=",")
            flat = [item for sublist in lists for item in sublist]
            if not flat:
                continue
            topLabels = pd.Series(flat).value_counts().head(top_k).index.tolist()
            mlColFeatures[col] = topLabels

            mlb = MultiLabelBinarizer(classes=topLabels)
            binarized = mlb.fit_transform(lists)

            col_names = [f"{col}__{label.replace(' ', '_')}" for label in mlb.classes_]
            tmp = pd.DataFrame(binarized, columns=col_names, index=df_fe.index)
            df_fe = pd.concat([df_fe, tmp], axis=1)
            df_fe[f"{col}__count"] = [len(x) for x in lists]
        return df_fe, mlColFeatures


In [22]:
class afterEDA:
    @staticmethod
    def _get_ohe_feature_names(ohe: OneHotEncoder, cols: List[str]) -> List[str]:
        try:
            names = ohe.get_feature_names_out(cols).tolist()
        except Exception:
            names = []
            categories = getattr(ohe, "categories_", [])
            for i, col in enumerate(cols):
                cats = categories[i]
                for cat in cats:
                    names.append(f"{col}__{str(cat)}")
        return names

    @staticmethod
    def prepare_model_data(df: pd.DataFrame, target_col: str="TedaviSuresi", out_csv: str="model_ready.csv"):
        if target_col in df.columns:
            y = df[target_col].copy()
            X = df.drop(columns=[target_col])
        else:
            y = None
            X = df.copy()

        numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
        logging.info("Model hazırlık: numeric=%s, categorical=%s", numeric_cols, categorical_cols)

        numeric_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ])
        categorical_pipeline = Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ])
        transformers = []
        if numeric_cols:
            transformers.append(("num", numeric_pipeline, numeric_cols))
        if categorical_cols:
            transformers.append(("cat", categorical_pipeline, categorical_cols))

        preprocessor = ColumnTransformer(transformers=transformers, remainder="drop", sparse_threshold=1.0)
        X_trans = preprocessor.fit_transform(X)

        feature_names = []
        if numeric_cols:
            feature_names.extend(numeric_cols)
        if categorical_cols:
            ohe = preprocessor.named_transformers_["cat"].named_steps["ohe"]
            cat_names = afterEDA._get_ohe_feature_names(ohe, categorical_cols)
            feature_names.extend(cat_names)

        if hasattr(X_trans, "toarray"):
            X_arr = X_trans.toarray()
        else:
            X_arr = np.asarray(X_trans)

        model_df = pd.DataFrame(X_arr, columns=feature_names)
        if y is not None:
            model_df[target_col] = y.reset_index(drop=True)

        beforeEDA.ensureDir(os.path.dirname(out_csv) or ".")
        model_df.to_csv(out_csv, index=False, encoding="utf-8-sig")
        up_out = os.path.join(os.path.dirname(out_csv), "MODEL_READY.csv")
        model_df.to_csv(up_out, index=False, encoding="utf-8-sig")
        logging.info("Model-ready CSV kaydedildi: %s ve %s", out_csv, up_out)
        return model_df


In [23]:
def run_pipeline(input_path: str,
                 outDir: str = "outputs",
                 sheet_name=0,
                 drop_missing_threshold=0.5,
                 top_k_labels=20):
    beforeEDA.ensureDir(outDir)
    plots_dir = os.path.join(outDir, "plots")
    beforeEDA.ensureDir(plots_dir)

    ext = os.path.splitext(input_path)[1].lower()
    if ext in [".xls", ".xlsx"]:
        csv_path = os.path.join(outDir, "converted_from_excel.csv")
        beforeEDA.excelToCsv(input_path, csv_path, sheet_name=sheet_name)
    else:
        csv_path = input_path

    csv_summary_path = os.path.join(outDir, "summary.json")
    df, summary = beforeEDA.saveJsonSummary(csv_path, csv_summary_path)

    last_json_path = os.path.join(outDir, "last.json")
    eda_result, plots_dir = EdaAnalysis.eda_Analysis(df, outDir=outDir, LastJsonPath=last_json_path)

    df_clean = EdaAnalysis.CleanData(df, drop_dupes_on=["HastaNo"] if "HastaNo" in df.columns else None,
                                     drop_missing_threshold=drop_missing_threshold)
    cleaned_path = os.path.join(outDir, "cleaned.csv")
    df_clean.to_csv(cleaned_path, index=False, encoding="utf-8-sig")
    logging.info("cleaned.csv kaydedildi: %s", cleaned_path)

    ml_cols = [c for c in df_clean.columns if c in ["KronikHastalik", "Alerji", "Tanilar"]]
    if ml_cols:
        df_fe, ml_features = Feature.feature_engineering(df_clean, ml_columns=ml_cols, top_k=top_k_labels)
        logging.info("Feature engineering tamamlandı, eklenen çoklu etiket sütunları: %s", ml_features)
    else:
        df_fe = df_clean
        ml_features = {}

    model_ready_path = os.path.join(outDir, "model_ready.csv")
    model_df = afterEDA.prepare_model_data(df_fe, target_col="TedaviSuresi", out_csv=model_ready_path)

    final_notes = {
        "generated_files": {
            "summary_json": csv_summary_path,
            "eda_last_json": last_json_path,
            "cleaned_csv": cleaned_path,
            "model_ready_csv": model_ready_path,
            "model_ready_csv_upper": os.path.join(outDir, "MODEL_READY.csv"),
            "plots_dir": plots_dir
        },
        "multilabel_features": ml_features
    }

    try:
        with open(last_json_path, "r", encoding="utf-8") as f:
            last = json.load(f)
    except Exception:
        last = {}
    last.update({"pipeline_final": final_notes})
    beforeEDA.saveJson(last, last_json_path)
    logging.info("Pipeline tamamlandı. Tüm çıktılar %s içinde.", outDir)
    return final_notes

In [None]:
run_pipeline("Talent_Academy_Case_DT_2025.xlsx", outDir="OutputData")


2025-09-06 21:51:51,388 - INFO - Excel to CSV done: Talent_Academy_Case_DT_2025.xlsx -> Veysel\converted_from_excel.csv
2025-09-06 21:51:51,444 - INFO - CSV özet JSON kaydedildi: Veysel\summary.json
2025-09-06 21:51:51,450 - INFO - EDA Başlıyor. 
2025-09-06 21:51:52,107 - INFO - EDA Tamamlandı
2025-09-06 21:51:52,135 - INFO - Duplicate (subset=['HastaNo']) temizlendi: 2235 -> 404
  dfc[col] = pd.to_numeric(dfc[col], errors='ignore')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfc[col].fillna(mode, inplace=True)
2025-09-06 21:51:52,142 - INFO - cleaned.csv kaydedildi: Veysel\cleaned.csv
2025-09-06 21:51:52,158 - INFO - Feature engineering tamamlandı, ek

{'generated_files': {'summary_json': 'Veysel\\summary.json',
  'eda_last_json': 'Veysel\\last.json',
  'cleaned_csv': 'Veysel\\cleaned.csv',
  'model_ready_csv': 'Veysel\\model_ready.csv',
  'model_ready_csv_upper': 'Veysel\\MODEL_READY.csv',
  'plots_dir': 'Veysel\\Plots'},
 'multilabel_features': {'KronikHastalik': ['Myastenia gravis',
   'Hiportiroidizm',
   'Aritmi',
   'Limb-Girdle Musküler Distrofi',
   'Hipertiroidizm',
   'Astım',
   'Duchenne Musküler Distrofisi',
   'Kalp yetmezliği',
   'Fascioscapulohumeral Distrofi',
   'Hipertansiyon',
   'Becker Musküler Distrofisi',
   'Diyabet',
   'Polimiyozit',
   'Guatr',
   'Hipotirodizm'],
  'Alerji': ['Polen',
   'POLEN',
   'Toz',
   'TOZ',
   'NOVALGIN',
   'Sucuk',
   'CORASPIN',
   'ARVELES',
   'Yer Fıstığı',
   'Novalgin',
   'SUCUK',
   'GRİPİN',
   'VOLTAREN',
   'GRIPIN',
   'Voltaren',
   'Volteren'],
  'Tanilar': ['DORSALJİ',
   'DİĞER',
   'tanımlanmamış',
   'LUMBOSAKRAL BÖLGE',
   'İntervertebral disk bozuklukları',