# In [None]:
import numpy as np
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor


# Streamlit EDA dashboard script.
#
# Flow: upload a dataset -> preview it -> optionally report/fill nulls,
# count IQR outliers and normalize numeric columns -> plot distributions ->
# offer the processed frame as a CSV download. Sidebar checkboxes control
# which steps run; all cleaning is applied to a copy of the uploaded data.


def _read_uploaded_file(uploaded) -> pd.DataFrame:
    """Parse an uploaded file into a DataFrame, choosing the reader by extension.

    ``.xlsx`` uploads are read with ``pd.read_excel`` (pandas needs an Excel
    engine such as openpyxl installed for this); every other accepted upload
    is parsed as delimited text with ``pd.read_csv``.
    """
    file_name = getattr(uploaded, "name", "") or ""
    if file_name.lower().endswith(".xlsx"):
        return pd.read_excel(uploaded)
    return pd.read_csv(uploaded)


def _fill_missing_values(frame: pd.DataFrame) -> list:
    """Fill nulls in ``frame`` in place and return one status message per column.

    Numeric columns are filled with their median, object columns with the
    literal ``"Unknown"``. Columns are reassigned (``frame[col] = ...``)
    rather than filled through ``inplace=True`` on a column selection, because
    that chained form does not reliably write back to the parent frame.
    """
    messages = []
    for col in frame.select_dtypes(include=np.number).columns.tolist():
        if frame[col].isnull().sum() > 0:
            median = frame[col].median()
            if pd.isna(median):
                # Entirely-null column: there is no median to fill with.
                messages.append(
                    f"Skipped numeric col {col}: no non-null values to compute a median"
                )
                continue
            frame[col] = frame[col].fillna(median)
            messages.append(f"Filled numeric cols {col} with Median")
    for col in frame.select_dtypes(include='object').columns.tolist():
        if frame[col].isnull().sum() > 0:
            frame[col] = frame[col].fillna("Unknown")
            messages.append(f"Filled categorical cols {col} with Unknown")
    return messages


def _count_iqr_outliers(frame: pd.DataFrame) -> dict:
    """Return ``{column: count}`` for numeric columns having values outside 1.5 * IQR.

    A value is counted when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. Columns without any such value are omitted.
    """
    counts = {}
    for col in frame.select_dtypes(include=np.number).columns.tolist():
        q1 = frame[col].quantile(0.25)
        q3 = frame[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        flagged = int(((frame[col] < lower) | (frame[col] > upper)).sum())
        if flagged > 0:
            counts[col] = flagged
    return counts


st.set_page_config(page_title="EDA Dashboard", layout="wide")
st.title("📊 Automated EDA Dashboard for Basic EDA")

# Upload dataset
uploaded_file = st.file_uploader("Upload a CSV, TXT, or XLSX file", type=["csv", "txt", "xlsx"])

df = None
if uploaded_file is None:
    st.info("Upload a CSV, TXT, or XLSX file to start exploring.")
else:
    # Loading is guarded on its own so that only genuine parse failures are
    # reported as loading errors.
    try:
        df = _read_uploaded_file(uploaded_file)
    except Exception as e:
        st.error("Error loading dataset: " + str(e))

if df is not None:
    # Top-level boundary for the dashboard: surface any processing failure in
    # the UI instead of crashing the app.
    try:
        st.subheader("Dataset Preview")
        st.dataframe(df.head(10))

        # Sidebar options
        st.sidebar.header("EDA & Preprocessing Options")
        show_null_count = st.sidebar.checkbox("Show Null Value Counts", True)
        fix_nulls = st.sidebar.checkbox("Auto-Fix Nulls", False)
        normalize_data = st.sidebar.checkbox("Normalize Numeric Columns", False)
        normalization_method = st.sidebar.selectbox("Normalization Method", ["Min-Max", "Standard"], index=0)
        detect_outliers = st.sidebar.checkbox("Detect Outliers (IQR Method)", False)

        show_histograms = st.sidebar.checkbox("Show Histogram of Numeric columns", True)
        show_countplots = st.sidebar.checkbox("Show Count Plots for catagorical columns", True)

        # Work on a copy so the uploaded frame itself is never mutated.
        df_clean = df.copy()

        # Null values
        if show_null_count:
            st.subheader("Null Count per column")
            null_count = df_clean.isnull().sum()
            null_count = null_count[null_count > 0].sort_values(ascending=False)
            if null_count.empty:
                st.write("No missing values found.")
            else:
                st.dataframe(null_count)
        if fix_nulls:
            st.subheader("Auto-Fix Nulls")
            for message in _fill_missing_values(df_clean):
                st.write(message)

        # Outlier detection
        if detect_outliers:
            st.subheader("Potential Outliers (IQR Method)")
            outlier_dict = _count_iqr_outliers(df_clean)
            if outlier_dict:
                st.write("Number of outliers detected per column:")
                st.json(outlier_dict)
            else:
                st.write("No significant outliers detected.")

        # Data normalization
        if normalize_data:
            numeric_cols = df_clean.select_dtypes(include=np.number).columns.tolist()
            st.subheader("Normalizing Numeric Columns")
            if not numeric_cols:
                # The scalers raise on a zero-column input, which would abort
                # the rest of the dashboard (plots and download).
                st.write("No numeric columns to normalize.")
            else:
                if normalization_method == "Min-Max":
                    scaler = MinMaxScaler()
                else:
                    scaler = StandardScaler()
                df_clean[numeric_cols] = scaler.fit_transform(df_clean[numeric_cols])
                st.write(f"Normalized columns: {numeric_cols} using {normalization_method} scaling")

        # Visualization (column lists are recomputed after all cleaning steps)
        numeric_cols = df_clean.select_dtypes(include=np.number).columns.tolist()
        categorical_cols = df_clean.select_dtypes(include='object').columns.tolist()

        if show_histograms and numeric_cols:
            st.subheader("Histograms for Numeric Columns")
            for col in numeric_cols:
                fig, ax = plt.subplots()
                sns.histplot(df_clean[col], kde=True, ax=ax)
                ax.set_title(f"{col} Distribution")
                st.pyplot(fig)
                # Release the figure; otherwise figures pile up on every rerun.
                plt.close(fig)

        if show_countplots and categorical_cols:
            st.subheader("Count Plots for Categorical Columns")
            for col in categorical_cols:
                fig, ax = plt.subplots()
                sns.countplot(y=df_clean[col], order=df_clean[col].value_counts().index, ax=ax)
                ax.set_title(f"{col} Counts")
                st.pyplot(fig)
                plt.close(fig)

        # ---------- Download Cleaned Dataset ----------
        st.subheader("Download Cleaned / Processed Dataset")
        csv = df_clean.to_csv(index=False)
        st.download_button(label="Download CSV", data=csv, file_name="cleaned_dataset.csv", mime="text/csv")

    except Exception as e:
        st.error("Error processing dataset: " + str(e))
