In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from scipy.stats import skew
from pathlib import Path







In [4]:
file_path = input("Enter path to CSV (e.g. notebooks/sample_csv/your_file.csv): ")
df = pd.read_csv(file_path)

Enter path to CSV (e.g. notebooks/sample_csv/your_file.csv): /content/Titanic-Dataset.csv


In [5]:

# 2. Detect numeric columns (ignore binary/ID-like columns)
numeric_cols = df.select_dtypes(include=np.number).columns.tolist()

In [6]:
filtered_cols = []
for col in numeric_cols:
    unique_vals = df[col].nunique()
    if unique_vals > 2 and df[col].isnull().mean() < 0.5:
        filtered_cols.append(col)

In [7]:
# 3. Initializing final DataFrame
scaler_report = {}
scaled_df = pd.DataFrame(index=df.index)

# 4. Auto-scaling logic
for col in filtered_cols:
    col_data = df[col]
    notna_mask = col_data.notna()

    scalers = {
        'StandardScaler': StandardScaler(),
        'MinMaxScaler': MinMaxScaler(),
        'RobustScaler': RobustScaler()
    }

    best_scaler = None
    best_skewness = float('inf')
    best_scaled_data = None

    for name, scaler in scalers.items():
        try:
            scaled = scaler.fit_transform(col_data[notna_mask].values.reshape(-1, 1))
            skewness = abs(skew(scaled.flatten()))
        except Exception as e:
            print(f"Skipping scaler {name} on column {col} due to error: {e}")
            continue

        if skewness < best_skewness:
            best_skewness = skewness
            best_scaler = name
            best_scaled_data = scaled.flatten()

    # Reconstructing full col
    full_scaled_col = pd.Series(np.nan, index=col_data.index)
    full_scaled_col[notna_mask] = best_scaled_data

    # Storing results
    scaled_df[col] = full_scaled_col
    scaler_report[col] = best_scaler

In [8]:

# 5. Saving output to CSV
scaled_df.to_csv("autoEDA_scaled_output.csv", index=False)
print("\n✅ Scaled Data Saved to: autoEDA_scaled_output.csv")

# 6. Report which scaler was used
print("\n📋 Scaler Report (column : best scaler):")
print(scaler_report)



✅ Scaled Data Saved to: autoEDA_scaled_output.csv

📋 Scaler Report (column : best scaler):
{'PassengerId': 'RobustScaler', 'Pclass': 'MinMaxScaler', 'Age': 'StandardScaler', 'SibSp': 'StandardScaler', 'Parch': 'RobustScaler', 'Fare': 'StandardScaler'}
