In [1]:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import pandas as pd
import numpy as np
import os

from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
from matplotlib.figure import Figure

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression

# Optional AutoML backends: each flag records whether the matching import worked.
try:
    from autosklearn.classification import AutoSklearnClassifier
    from autosklearn.regression import AutoSklearnRegressor
except Exception:
    AUTO_ML_AVAILABLE = False
else:
    AUTO_ML_AVAILABLE = True

try:
    from tpot import TPOTClassifier, TPOTRegressor
except Exception:
    TPOT_AVAILABLE = False
else:
    TPOT_AVAILABLE = True

class BusinessAnalystApp(tk.Tk):
    def __init__(self):
        """Create the main window, the four-tab notebook and the widgets of every tab."""
        super().__init__()
        self.title("Business Analyst GUI")
        self.geometry("1100x700")

        # Data state; all None until a file is opened.
        self.df = None
        self.filtered_df = None
        self.target_col = None

        # Notebook
        self.nb = ttk.Notebook(self)
        self.nb.pack(fill="both", expand=True)

        # Tabs: (attribute name, caption), in display order.
        tab_specs = (
            ("tab_data", "Data"),
            ("tab_clean", "Clean/Transform"),
            ("tab_viz", "Visualize"),
            ("tab_model", "Model"),
        )
        for attr_name, _caption in tab_specs:
            setattr(self, attr_name, ttk.Frame(self.nb))
        for attr_name, caption in tab_specs:
            self.nb.add(getattr(self, attr_name), text=caption)

        # Populate the tabs in the same order they are shown.
        for build in (
            self.build_data_tab,
            self.build_clean_tab,
            self.build_viz_tab,
            self.build_model_tab,
        ):
            build()

    # === Data Tab ===
    def build_data_tab(self):
        """Build the Data tab: a toolbar (open, save, status label) above a scrollable preview table."""
        top = ttk.Frame(self.tab_data)
        top.pack(fill="x", pady=5, padx=5)

        ttk.Button(top, text="Open CSV/Excel", command=self.open_file).pack(side="left")
        ttk.Button(top, text="Save Current View", command=self.save_current).pack(side="left", padx=8)
        self.info_lbl = ttk.Label(top, text="No file loaded")
        self.info_lbl.pack(side="left", padx=12)

        # Treeview for preview
        frame = ttk.Frame(self.tab_data)
        frame.pack(fill="both", expand=True, padx=5, pady=5)
        self.tree = ttk.Treeview(frame, show="headings")
        vsb = ttk.Scrollbar(frame, orient="vertical", command=self.tree.yview)
        hsb = ttk.Scrollbar(frame, orient="horizontal", command=self.tree.xview)
        self.tree.configure(yscrollcommand=vsb.set, xscrollcommand=hsb.set)
        # Pack order matters: the packer hands out space in call order. The
        # scrollbars must claim the bottom and right edges before the expanding
        # tree takes the rest; packed after the tree, the horizontal bar is
        # squeezed into a narrow leftover column.
        hsb.pack(side="bottom", fill="x")
        vsb.pack(side="right", fill="y")
        self.tree.pack(side="left", fill="both", expand=True)

    def open_file(self):
        path = filedialog.askopenfilename(title="Select data file",
                                          filetypes=[("CSV", "*.csv"), ("Excel", "*.xlsx *.xls"), ("All files","*.*")])
        if not path:
            return
        try:
            if path.lower().endswith(".csv"):
                self.df = pd.read_csv(path)
            else:
                self.df = pd.read_excel(path)
            self.filtered_df = self.df.copy()
            self.info_lbl.config(text=f"Loaded: {os.path.basename(path)} | rows={len(self.df)} cols={self.df.shape[1]}")
            self.refresh_table(self.filtered_df.head(500))
            self.refresh_column_controls()
        except Exception as e:
            messagebox.showerror("Load error", str(e))

    def save_current(self):
        if self.filtered_df is None:
            messagebox.showwarning("No data", "Load data first")
            return
        path = filedialog.asksaveasfilename(defaultextension=".csv",
                                            filetypes=[("CSV", "*.csv")])
        if not path:
            return
        try:
            self.filtered_df.to_csv(path, index=False)
            messagebox.showinfo("Saved", f"Saved to {path}")
        except Exception as e:
            messagebox.showerror("Save error", str(e))

    def refresh_table(self, df):
        # clear
        self.tree.delete(*self.tree.get_children())
        self.tree["columns"] = list(df.columns)
        for c in df.columns:
            self.tree.heading(c, text=c)
            self.tree.column(c, width=150, anchor="w")
        for _, row in df.iterrows():
            self.tree.insert("", "end", values=[row[c] for c in df.columns])

    # === Clean/Transform Tab ===
    def build_clean_tab(self):
        """Build the Clean/Transform tab: controls on the left, preview table on the right.

        The left pane holds the cleaning actions (drop nulls, fill NA, drop
        column, IQR filter) and the transform actions (row filter, computed
        column, group-by). The right pane shows ``self.filtered_df`` through
        ``self.clean_tree``.
        """
        pane = ttk.Panedwindow(self.tab_clean, orient="horizontal")
        pane.pack(fill="both", expand=True, padx=5, pady=5)

        left = ttk.Frame(pane, width=360)
        right = ttk.Frame(pane)
        pane.add(left, weight=1)
        pane.add(right, weight=3)

        # Cleaning controls
        ttk.Label(left, text="Cleaning").pack(anchor="w", pady=(0,6))
        self.dropna_btn = ttk.Button(left, text="Drop rows with nulls", command=self.dropna_rows)
        self.dropna_btn.pack(fill="x", pady=2)

        # Fill-NA row: method chooser plus the value used by the "constant" method.
        frm_fill = ttk.Frame(left)
        frm_fill.pack(fill="x", pady=2)
        ttk.Label(frm_fill, text="Fill NA by:").pack(side="left")
        self.fill_method = tk.StringVar(value="mean")
        ttk.Combobox(frm_fill, textvariable=self.fill_method, values=["mean","median","mode","constant"]).pack(side="left", padx=6)
        self.fill_constant = tk.StringVar()
        ttk.Entry(frm_fill, textvariable=self.fill_constant, width=8).pack(side="left")
        ttk.Button(left, text="Apply Fill", command=self.fill_na).pack(fill="x", pady=2)

        ttk.Label(left, text="Drop Column").pack(anchor="w", pady=(8,2))
        self.drop_col = tk.StringVar()
        self.drop_col_cb = ttk.Combobox(left, textvariable=self.drop_col)
        self.drop_col_cb.pack(fill="x")
        ttk.Button(left, text="Drop", command=self.drop_column).pack(fill="x", pady=2)

        ttk.Label(left, text="Remove Outliers (IQR) on column").pack(anchor="w", pady=(8,2))
        self.iqr_col = tk.StringVar()
        self.iqr_col_cb = ttk.Combobox(left, textvariable=self.iqr_col)
        self.iqr_col_cb.pack(fill="x")
        ttk.Button(left, text="Apply IQR Filter", command=self.apply_iqr).pack(fill="x", pady=2)

        ttk.Separator(left).pack(fill="x", pady=8)

        # Row filter: the expression is handed to DataFrame.query by apply_filter.
        ttk.Label(left, text="Transform").pack(anchor="w", pady=(0,6))
        self.filter_expr = tk.StringVar()
        ttk.Entry(left, textvariable=self.filter_expr).pack(fill="x")
        ttk.Label(left, text="Example: Sales > 1000 and Region == 'East'").pack(anchor="w")
        ttk.Button(left, text="Apply Row Filter", command=self.apply_filter).pack(fill="x", pady=2)

        # Computed column: first entry is the new column's name, second the
        # expression; both are pre-filled with example text.
        self.newcol_name = tk.StringVar()
        self.newcol_expr = tk.StringVar()
        ttk.Entry(left, textvariable=self.newcol_name).pack(fill="x")
        self.newcol_name.set("NewColumnName")
        ttk.Entry(left, textvariable=self.newcol_expr).pack(fill="x")
        self.newcol_expr.set("Quantity * Price")
        ttk.Button(left, text="Create Computed Column", command=self.create_column).pack(fill="x", pady=2)

        # Group-by: first entry is a comma-separated key list, second a
        # comma-separated list of "column:function" pairs (see run_groupby).
        ttk.Label(left, text="Groupby Agg").pack(anchor="w", pady=(8,2))
        self.groupby_cols = tk.StringVar()
        self.groupby_aggs = tk.StringVar()
        ttk.Entry(left, textvariable=self.groupby_cols).pack(fill="x")
        self.groupby_cols.set("Region, Category")
        ttk.Entry(left, textvariable=self.groupby_aggs).pack(fill="x")
        self.groupby_aggs.set("Sales:sum, Quantity:mean")
        ttk.Button(left, text="Run Groupby", command=self.run_groupby).pack(fill="x", pady=2)

        # Right preview table
        self.clean_tree = ttk.Treeview(right, show="headings")
        vsb = ttk.Scrollbar(right, orient="vertical", command=self.clean_tree.yview)
        hsb = ttk.Scrollbar(right, orient="horizontal", command=self.clean_tree.xview)
        self.clean_tree.configure(yscrollcommand=vsb.set, xscrollcommand=hsb.set)
        # Pack order matters: the scrollbars must claim the bottom and right
        # edges before the expanding tree takes the rest; packed after the tree,
        # the horizontal bar is squeezed into a narrow leftover column.
        hsb.pack(side="bottom", fill="x")
        vsb.pack(side="right", fill="y")
        self.clean_tree.pack(side="left", fill="both", expand=True)

    def refresh_clean_preview(self):
        if self.filtered_df is not None:
            self._refresh_tree(self.clean_tree, self.filtered_df.head(500))

    def _refresh_tree(self, tree, df):
        tree.delete(*tree.get_children())
        tree["columns"] = list(df.columns)
        for c in df.columns:
            tree.heading(c, text=c)
            tree.column(c, width=150, anchor="w")
        for _, row in df.iterrows():
            tree.insert("", "end", values=[row[c] for c in df.columns])

    def refresh_column_controls(self):
        if self.filtered_df is None:
            return
        cols = list(self.filtered_df.columns)
        self.drop_col_cb["values"] = cols
        self.iqr_col_cb["values"] = cols
        self.viz_x_cb["values"] = cols
        self.viz_y_cb["values"] = cols
        self.model_target_cb["values"] = cols

    def dropna_rows(self):
        if self.filtered_df is None: return
        self.filtered_df = self.filtered_df.dropna()
        self.refresh_clean_preview()

    def fill_na(self):
        if self.filtered_df is None: return
        method = self.fill_method.get()
        if method in ("mean","median"):
            nums = self.filtered_df.select_dtypes(include=np.number).columns
            if method == "mean":
                self.filtered_df[nums] = self.filtered_df[nums].fillna(self.filtered_df[nums].mean())
            else:
                self.filtered_df[nums] = self.filtered_df[nums].fillna(self.filtered_df[nums].median())
        elif method == "mode":
            self.filtered_df = self.filtered_df.fillna(self.filtered_df.mode().iloc[0])
        else:
            val = self.fill_constant.get()
            self.filtered_df = self.filtered_df.fillna(val)
        self.refresh_clean_preview()

    def drop_column(self):
        if self.filtered_df is None: return
        col = self.drop_col.get()
        if col and col in self.filtered_df.columns:
            self.filtered_df = self.filtered_df.drop(columns=[col])
            self.refresh_column_controls()
            self.refresh_clean_preview()

    def apply_iqr(self):
        if self.filtered_df is None: return
        col = self.iqr_col.get()
        if col not in self.filtered_df.columns:
            return
        q1 = self.filtered_df[col].quantile(0.25)
        q3 = self.filtered_df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5*iqr
        upper = q3 + 1.5*iqr
        self.filtered_df = self.filtered_df[(self.filtered_df[col] >= lower) & (self.filtered_df[col] <= upper)]
        self.refresh_clean_preview()

    def apply_filter(self):
        if self.filtered_df is None: return
        expr = self.filter_expr.get().strip()
        if not expr:
            return
        try:
            self.filtered_df = self.filtered_df.query(expr)
            self.refresh_clean_preview()
        except Exception as e:
            messagebox.showerror("Filter error", str(e))

    def create_column(self):
        if self.filtered_df is None: return
        name = self.newcol_name.get().strip()
        expr = self.newcol_expr.get().strip()
        if not name or not expr:
            return
        try:
            self.filtered_df[name] = self.filtered_df.eval(expr)
            self.refresh_column_controls()
            self.refresh_clean_preview()
        except Exception as e:
            messagebox.showerror("Expression error", str(e))

    def run_groupby(self):
        if self.filtered_df is None: return
        try:
            cols = [c.strip() for c in self.groupby_cols.get().split(",") if c.strip()]
            agg_pairs = [p.strip() for p in self.groupby_aggs.get().split(",") if p.strip()]
            agg_map = {}
            for p in agg_pairs:
                col, fn = p.split(":")
                agg_map[col.strip()] = fn.strip()
            g = self.filtered_df.groupby(cols).agg(agg_map).reset_index()
            self.filtered_df = g
            self.refresh_column_controls()
            self.refresh_clean_preview()
        except Exception as e:
            messagebox.showerror("Groupby error", str(e))

    # === Visualization Tab ===
    def build_viz_tab(self):
        """Build the Visualize tab: chart controls on the left, embedded matplotlib figure on the right."""
        # Two side-by-side frames; widgets are packed in creation order, so the
        # statement order below defines the layout.
        left = ttk.Frame(self.tab_viz, width=260)
        left.pack(side="left", fill="y", padx=5, pady=5)
        right = ttk.Frame(self.tab_viz)
        right.pack(side="left", fill="both", expand=True, padx=5, pady=5)

        ttk.Label(left, text="Chart Type").pack(anchor="w")
        self.viz_type = tk.StringVar(value="bar")
        ttk.Combobox(left, textvariable=self.viz_type, values=["bar","line","hist","scatter"]).pack(fill="x", pady=2)

        # The X/Y choosers get their values from refresh_column_controls.
        ttk.Label(left, text="X Column").pack(anchor="w", pady=(6,0))
        self.viz_x = tk.StringVar()
        self.viz_x_cb = ttk.Combobox(left, textvariable=self.viz_x)
        self.viz_x_cb.pack(fill="x")

        ttk.Label(left, text="Y Column").pack(anchor="w", pady=(6,0))
        self.viz_y = tk.StringVar()
        self.viz_y_cb = ttk.Combobox(left, textvariable=self.viz_y)
        self.viz_y_cb.pack(fill="x")

        # Bin count, read by plot_chart for the "hist" chart type.
        ttk.Label(left, text="Bins (hist)").pack(anchor="w", pady=(6,0))
        self.viz_bins = tk.IntVar(value=20)
        ttk.Entry(left, textvariable=self.viz_bins).pack(fill="x")

        ttk.Button(left, text="Plot", command=self.plot_chart).pack(fill="x", pady=8)

        # Matplotlib figure embedded
        # A single Axes (self.ax) is created once and reused by plot_chart.
        self.fig = Figure(figsize=(7,5), dpi=100)
        self.ax = self.fig.add_subplot(111)
        self.canvas = FigureCanvasTkAgg(self.fig, master=right)
        self.canvas.draw()
        self.canvas.get_tk_widget().pack(fill="both", expand=True)
        self.toolbar = NavigationToolbar2Tk(self.canvas, right)
        self.toolbar.update()

    def plot_chart(self):
        if self.filtered_df is None:
            messagebox.showwarning("No data", "Load data first")
            return
        self.ax.clear()
        chart = self.viz_type.get()
        x = self.viz_x.get()
        y = self.viz_y.get()

        try:
            if chart == "hist":
                if y and y in self.filtered_df.columns:
                    self.ax.hist(self.filtered_df[y].dropna(), bins=self.viz_bins.get(), color="#4e79a7")
                else:
                    # try all numeric
                    nums = self.filtered_df.select_dtypes(include=np.number)
                    if nums.shape[1] == 0:
                        messagebox.showwarning("No numeric", "No numeric columns for histogram")
                        return
                    self.ax.hist(nums.iloc[:,0].dropna(), bins=self.viz_bins.get(), color="#4e79a7")
                self.ax.set_title("Histogram")
            elif chart == "scatter":
                if x in self.filtered_df.columns and y in self.filtered_df.columns:
                    self.ax.scatter(self.filtered_df[x], self.filtered_df[y], alpha=0.7, color="#59a14f")
                    self.ax.set_xlabel(x); self.ax.set_ylabel(y)
                    self.ax.set_title(f"Scatter: {x} vs {y}")
                else:
                    messagebox.showwarning("Columns", "Select valid X and Y columns")
            elif chart == "bar":
                if x and y and x in self.filtered_df.columns and y in self.filtered_df.columns:
                    group = self.filtered_df.groupby(x)[y].sum().sort_values(ascending=False).head(20)
                    self.ax.bar(group.index.astype(str), group.values, color="#e15759")
                    self.ax.set_xticklabels(group.index.astype(str), rotation=45, ha="right")
                    self.ax.set_title(f"Bar: {y} by {x}")
                else:
                    messagebox.showwarning("Columns", "Select valid X and Y columns")
            else:  # line
                if x and y and x in self.filtered_df.columns and y in self.filtered_df.columns:
                    self.ax.plot(self.filtered_df[x], self.filtered_df[y], color="#f28e2b")
                    self.ax.set_xlabel(x); self.ax.set_ylabel(y)
                    self.ax.set_title(f"Line: {y} over {x}")
                else:
                    messagebox.showwarning("Columns", "Select valid X and Y columns")

            self.fig.tight_layout()
            self.canvas.draw()
        except Exception as e:
            messagebox.showerror("Plot error", str(e))

    # === Modeling Tab ===
    def build_model_tab(self):
        """Build the Model tab: target/task choosers and train buttons on the left, a log box on the right."""
        # Two side-by-side frames; widgets are packed in creation order.
        left = ttk.Frame(self.tab_model, width=260)
        left.pack(side="left", fill="y", padx=5, pady=5)
        right = ttk.Frame(self.tab_model)
        right.pack(side="left", fill="both", expand=True, padx=5, pady=5)

        # Target chooser; its values are filled by refresh_column_controls.
        ttk.Label(left, text="Target Column").pack(anchor="w")
        self.model_target = tk.StringVar()
        self.model_target_cb = ttk.Combobox(left, textvariable=self.model_target)
        self.model_target_cb.pack(fill="x", pady=2)

        # "auto" makes the training methods call infer_task on the target column.
        ttk.Label(left, text="Task").pack(anchor="w", pady=(6,0))
        self.task_type = tk.StringVar(value="auto")
        ttk.Combobox(left, textvariable=self.task_type, values=["auto","classification","regression"]).pack(fill="x")

        ttk.Button(left, text="Train Baseline Model", command=self.train_model).pack(fill="x", pady=8)
        ttk.Button(left, text="Run AutoML (if available)", command=self.train_automl).pack(fill="x", pady=4)

        # Text box that self.log() appends status lines to.
        self.model_status = tk.Text(right, height=20)
        self.model_status.pack(fill="both", expand=True)

    def log(self, msg):
        self.model_status.insert("end", msg + "\n")
        self.model_status.see("end")
        self.update_idletasks()

    def infer_task(self, y):
        # heuristic: numeric -> regression, else classification
        return "regression" if pd.api.types.is_numeric_dtype(y) else "classification"

    def train_model(self):
        """Train and score a baseline model on the working frame.

        Rows with a missing target are dropped. Numeric features are
        median-imputed; all other features are imputed with the most frequent
        value and one-hot encoded. Classification uses ``LogisticRegression``
        scored with accuracy; regression uses ``LinearRegression`` scored with
        R^2. The score is computed on a 20% hold-out split and written to the
        log box. Failures during splitting, fitting or scoring are shown in a
        message box and logged.
        """
        if self.filtered_df is None:
            messagebox.showwarning("No data", "Load and prepare data")
            return
        target = self.model_target.get()
        if target not in self.filtered_df.columns:
            messagebox.showwarning("Target", "Select a valid target")
            return
        df = self.filtered_df.dropna(subset=[target]).copy()
        y = df[target]
        X = df.drop(columns=[target])

        task = self.task_type.get()
        if task == "auto":
            task = self.infer_task(y)

        # Preprocess: numeric impute + categorical one-hot
        num_cols = X.select_dtypes(include=np.number).columns
        cat_cols = X.select_dtypes(exclude=np.number).columns

        num_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="median"))])
        cat_pipe = Pipeline(steps=[("impute", SimpleImputer(strategy="most_frequent")),
                                   ("onehot", OneHotEncoder(handle_unknown="ignore"))])
        pre = ColumnTransformer([
            ("num", num_pipe, num_cols),
            ("cat", cat_pipe, cat_cols)
        ])

        if task == "classification":
            model = LogisticRegression(max_iter=1000)
            metric_fn = accuracy_score
        else:
            model = LinearRegression()
            metric_fn = r2_score

        pipe = Pipeline([("pre", pre), ("model", model)])

        try:
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            self.log("Training baseline model...")
            pipe.fit(X_train, y_train)
            preds = pipe.predict(X_test)
            # predict() already returns class labels (or regression values), so
            # score them directly. Thresholding labels with "> 0.5" fails for
            # string labels and corrupts multi-class or non-0/1 labels.
            score = metric_fn(y_test, preds)
        except Exception as e:
            messagebox.showerror("Training error", str(e))
            self.log(f"Training failed: {e}")
            return
        self.log(f"Task: {task} | Score: {score:.4f}")
        self.log("Done.\n")

    def train_automl(self):
        """Run an AutoML search (auto-sklearn if importable, otherwise TPOT) and log the hold-out score.

        Rows with a missing target are dropped. The data is split 80/20 first
        and the preprocessing (median imputation for numeric columns;
        most-frequent imputation plus one-hot encoding for the rest) is fitted
        on the training part only, so no statistics from the test rows leak
        into training. When neither backend could be imported only a hint is
        logged. Failures are shown in a message box and logged.
        """
        if self.filtered_df is None:
            messagebox.showwarning("No data", "Load and prepare data")
            return
        target = self.model_target.get()
        if target not in self.filtered_df.columns:
            messagebox.showwarning("Target", "Select a valid target")
            return

        if not (AUTO_ML_AVAILABLE or TPOT_AVAILABLE):
            # Nothing to run; skip the preprocessing work entirely.
            self.log("AutoML not available. Install auto-sklearn or tpot.")
            self.log("Done.\n")
            return

        df = self.filtered_df.dropna(subset=[target]).copy()
        y = df[target]
        X = df.drop(columns=[target])

        task = self.task_type.get()
        if task == "auto":
            task = self.infer_task(y)

        # Basic preprocessing similar to baseline
        num_cols = X.select_dtypes(include=np.number).columns
        cat_cols = X.select_dtypes(exclude=np.number).columns

        pre = ColumnTransformer([
            ("num", SimpleImputer(strategy="median"), num_cols),
            ("cat", Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                              ("oh", OneHotEncoder(handle_unknown="ignore"))]), cat_cols)
        ])

        try:
            # Split before fitting the preprocessor: fit on train, transform test.
            X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            X_train = pre.fit_transform(X_train_raw)
            X_test = pre.transform(X_test_raw)

            if AUTO_ML_AVAILABLE:
                self.log("AutoML: auto-sklearn running...")
                if task == "classification":
                    automl = AutoSklearnClassifier(time_left_for_this_task=120, per_run_time_limit=30)
                else:
                    automl = AutoSklearnRegressor(time_left_for_this_task=120, per_run_time_limit=30)
                automl.fit(X_train, y_train)
                preds = automl.predict(X_test)
                score = (accuracy_score(y_test, preds) if task == "classification" else r2_score(y_test, preds))
                self.log(f"AutoML score: {score:.4f}")
            else:
                self.log("AutoML: TPOT running...")
                if task == "classification":
                    automl = TPOTClassifier(generations=5, population_size=20, verbosity=2, max_time_mins=2)
                else:
                    automl = TPOTRegressor(generations=5, population_size=20, verbosity=2, max_time_mins=2)
                automl.fit(X_train, y_train)
                preds = automl.predict(X_test)
                score = (accuracy_score(y_test, preds) if task == "classification" else r2_score(y_test, preds))
                self.log(f"TPOT score: {score:.4f}")
                try:
                    automl.export("automl_pipeline.py")
                    self.log("Exported pipeline to automl_pipeline.py")
                except Exception as export_err:
                    # Export is best-effort, but say so instead of failing silently.
                    self.log(f"Could not export pipeline: {export_err}")
        except Exception as e:
            messagebox.showerror("AutoML error", str(e))
            self.log(f"AutoML failed: {e}")
            return
        self.log("Done.\n")

# Entry point: build the application window and run the Tk event loop.
# mainloop() blocks until the window is closed.
if __name__ == "__main__":
    app = BusinessAnalystApp()
    app.mainloop()


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\UDIT\anaconda3\envs\sklearn\Lib\tkinter\__init__.py", line 2074, in __call__
    return self.func(*args)
           ~~~~~~~~~^^^^^^^
  File "C:\Users\UDIT\AppData\Local\Temp\ipykernel_8960\3024515263.py", line 249, in apply_iqr
    q1 = self.filtered_df[col].quantile(0.25)
  File "C:\Users\UDIT\anaconda3\envs\sklearn\Lib\site-packages\pandas\core\series.py", line 2898, in quantile
    result = df.quantile(q=q, interpolation=interpolation, numeric_only=False)
  File "C:\Users\UDIT\anaconda3\envs\sklearn\Lib\site-packages\pandas\core\frame.py", line 12153, in quantile
    res_df = self.quantile(
        [q],  # type: ignore[list-item]
    ...<3 lines>...
        method=method,
    )
  File "C:\Users\UDIT\anaconda3\envs\sklearn\Lib\site-packages\pandas\core\frame.py", line 12198, in quantile
    res = data._mgr.quantile(qs=q, interpolation=interpolation)
  File "C:\Users\UDIT\anaconda3\envs\sklearn\Lib\site-

In [5]:
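# pip install auto-sklearn  # NOTE: the build fails on Windows (see the output below); auto-sklearn needs a Linux environment such as WSL, otherwise the app falls back to TPOT.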

Collecting auto-sklearn
  Using cached auto-sklearn-0.15.0.tar.gz (6.5 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  Getting requirements to build wheel did not run successfully.
  exit code: 1
  
  [23 lines of output]
  Traceback (most recent call last):
    File [35m"C:\Users\UDIT\anaconda3\envs\sklearn\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py"[0m, line [35m389[0m, in [35m<module>[0m
      [31mmain[0m[1;31m()[0m
      [31m~~~~[0m[1;31m^^[0m
    File [35m"C:\Users\UDIT\anaconda3\envs\sklearn\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py"[0m, line [35m373[0m, in [35mmain[0m
      json_out["return_val"] = [31mhook[0m[1;31m(**hook_input["kwargs"])[0m
                               [31m~~~~[0m[1;31m^^^^^^^^^^^^^^^^^^^^^^^^[0m
    File [35m"C:\Users\UDIT\anaconda3\envs\sklearn\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py"[0m, line [35m143[0m, in [35mget_requires_for_build_wheel[0m
      return hook(config_settings)
    File [35m"C:\Users\UDI

In [6]:
# # business_analyst_agent.py
# import os, time, json, re, warnings
# warnings.filterwarnings("ignore")

# import pandas as pd
# import numpy as np
# from pathlib import Path

# # EDA / Profiling
# from ydata_profiling import ProfileReport  # pip install ydata-profiling

# # Visualizations
# import matplotlib.pyplot as plt
# import seaborn as sns

# # ML / AutoML
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score, r2_score
# import joblib

# # Optional AutoML (classification/regression)
# try:
#     import autosklearn.classification as ask_cls
#     import autosklearn.regression as ask_reg
#     AUTOSKLEARN_AVAILABLE = True
# except Exception:
#     AUTOSKLEARN_AVAILABLE = False

# # Web mining (ethical scraping)
# import requests
# from urllib.parse import urljoin, urlparse
# from bs4 import BeautifulSoup

# # ------------------ Utility ------------------

# def ensure_dir(path: str):
#     Path(path).mkdir(parents=True, exist_ok=True)
#     return path

# def safe_filename(name: str):
#     return re.sub(r"[^a-zA-Z0-9_.-]", "_", name)

# # ------------------ Data Ingestion ------------------

# def load_data(input_path_or_url: str) -> pd.DataFrame:
#     if re.match(r"^https?://", input_path_or_url, flags=re.I):
#         # Try CSV first
#         try:
#             return pd.read_csv(input_path_or_url)
#         except Exception:
#             pass
#         # Try HTML tables
#         try:
#             tables = pd.read_html(input_path_or_url)
#             if tables:
#                 return tables[0]
#         except Exception:
#             pass
#         raise ValueError("Unsupported URL or no table/CSV found at URL.")
#     else:
#         ext = Path(input_path_or_url).suffix.lower()
#         if ext in [".csv"]:
#             return pd.read_csv(input_path_or_url)
#         if ext in [".xlsx", ".xls"]:
#             return pd.read_excel(input_path_or_url)
#         raise ValueError("Unsupported file format. Use CSV/XLSX or a URL to a CSV/table.")

# # ------------------ Profiling & Cleaning ------------------

# def profile_and_clean(df: pd.DataFrame, out_dir="outputs/eda", title="EDA Report") -> pd.DataFrame:
#     ensure_dir(out_dir)
#     # Profiling report
#     profile = ProfileReport(df, title=title, explorative=True)
#     profile_path = os.path.join(out_dir, safe_filename(title) + ".html")
#     profile.to_file(profile_path)

#     # Basic cleaning
#     df = df.copy()
#     # Trim strings
#     for col in df.select_dtypes(include=["object"]).columns:
#         df[col] = df[col].astype(str).str.strip()
#     # Drop duplicate rows
#     df.drop_duplicates(inplace=True)

#     # Type inference
#     for col in df.columns:
#         # Convert to numeric where possible
#         if df[col].dtype == object:
#             df[col] = pd.to_numeric(df[col], errors="ignore")
#         # Parse dates if looks like date
#         if df[col].dtype == object and df[col].str.contains(r"\d{4}-\d{2}-\d{2}", regex=True).any():
#             try:
#                 df[col] = pd.to_datetime(df[col], errors="coerce")
#             except Exception:
#                 pass

#     # Missing value strategy: numeric -> median, categorical -> mode
#     for col in df.columns:
#         if df[col].dtype.kind in "biufc":
#             df[col] = df[col].fillna(df[col].median())
#         else:
#             mode = df[col].mode(dropna=True)
#             df[col] = df[col].fillna(mode.iloc[0] if not mode.empty else "")

#     # Outlier capping (IQR) for numeric columns
#     for col in df.select_dtypes(include=[np.number]).columns:
#         q1, q3 = df[col].quantile([0.25, 0.75])
#         iqr = q3 - q1
#         if iqr > 0:
#             low, high = q1 - 1.5 * iqr, q3 + 1.5 * iqr
#             df[col] = df[col].clip(lower=low, upper=high)

#     # Save cleaned
#     cleaned_path = os.path.join(out_dir, "cleaned_data.csv")
#     df.to_csv(cleaned_path, index=False)

#     return df

# # ------------------ Relationships ------------------

# def compute_relationships(df: pd.DataFrame, out_dir="outputs/relations"):
#     ensure_dir(out_dir)
#     rel = {}

#     # Correlations for numeric
#     num_df = df.select_dtypes(include=[np.number])
#     if not num_df.empty:
#         corr = num_df.corr(numeric_only=True)
#         corr_path = os.path.join(out_dir, "correlation_matrix.csv")
#         corr.to_csv(corr_path)
#         rel["correlation_matrix_path"] = corr_path

#     # Mutual information to a target if present (auto-pick last column candidate)
#     target = None
#     if df.columns.size > 1:
#         target = df.columns[-1]
#     rel["target_column_used"] = target

#     # Save a JSON summary
#     with open(os.path.join(out_dir, "relations_summary.json"), "w") as f:
#         json.dump(rel, f, indent=2)

#     # Quick heatmap visualization
#     if not num_df.empty:
#         plt.figure(figsize=(10, 8))
#         sns.heatmap(num_df.corr(numeric_only=True), cmap="coolwarm", annot=False)
#         plt.title("Correlation Heatmap")
#         plt.tight_layout()
#         heatmap_path = os.path.join(out_dir, "correlation_heatmap.png")
#         plt.savefig(heatmap_path, dpi=150)
#         plt.close()

#     return rel

# # ------------------ Visualizations ------------------

# def auto_visualize(df: pd.DataFrame, out_dir="outputs/visuals"):
#     ensure_dir(out_dir)
#     # Histograms for numerics
#     num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
#     for col in num_cols[:12]:
#         plt.figure()
#         sns.histplot(df[col].dropna(), kde=True)
#         plt.title(f"Distribution: {col}")
#         plt.tight_layout()
#         plt.savefig(os.path.join(out_dir, f"dist_{safe_filename(col)}.png"), dpi=150)
#         plt.close()

#     # Bar plots for top categorical
#     cat_cols = df.select_dtypes(exclude=[np.number, "datetime64[ns]"]).columns.tolist()
#     for col in cat_cols[:6]:
#         plt.figure(figsize=(8,4))
#         df[col].value_counts().head(20).plot(kind="bar")
#         plt.title(f"Top categories: {col}")
#         plt.tight_layout()
#         plt.savefig(os.path.join(out_dir, f"bar_{safe_filename(col)}.png"), dpi=150)
#         plt.close()

# # ------------------ AutoML Training ------------------

# def detect_task(df: pd.DataFrame, target: str):
#     y = df[target]
#     # Heuristic: classification if few unique values or non-numeric
#     if y.dtype.kind not in "biufc" or y.nunique() <= max(20, int(0.05 * len(y))):
#         return "classification"
#     return "regression"

# def train_automl(df: pd.DataFrame, target: str, out_dir="outputs/model", time_limit_sec=300):
#     ensure_dir(out_dir)
#     X = df.drop(columns=[target])
#     y = df[target]

#     # Basic encoding for categoricals
#     X_enc = pd.get_dummies(X, drop_first=True)

#     X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2, random_state=42)
#     task = detect_task(df, target)

#     best_score = None
#     model = None

#     if AUTOSKLEARN_AVAILABLE:
#         if task == "classification":
#             model = ask_cls.AutoSklearnClassifier(time_left_for_this_task=time_limit_sec, per_run_time_limit=60)
#         else:
#             model = ask_reg.AutoSklearnRegressor(time_left_for_this_task=time_limit_sec, per_run_time_limit=60)
#         model.fit(X_train, y_train)
#         preds = model.predict(X_test)
#         score = accuracy_score(y_test, preds) if task == "classification" else r2_score(y_test, preds)
#         best_score = score
#     else:
#         # Fallback simple model
#         from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
#         if task == "classification":
#             model = RandomForestClassifier(n_estimators=200, random_state=42)
#             model.fit(X_train, y_train)
#             preds = model.predict(X_test)
#             best_score = accuracy_score(y_test, preds)
#         else:
#             model = RandomForestRegressor(n_estimators=300, random_state=42)
#             model.fit(X_train, y_train)
#             preds = model.predict(X_test)
#             best_score = r2_score(y_test, preds)

#     joblib.dump({"model": model, "columns": X_enc.columns.tolist(), "task": task, "target": target}, os.path.join(out_dir, "model.joblib"))
#     with open(os.path.join(out_dir, "metrics.json"), "w") as f:
#         json.dump({"task": task, "score": best_score}, f, indent=2)
#     return {"task": task, "score": best_score}

# def predict_with_saved_model(model_dir: str, new_df: pd.DataFrame):
#     # Load the bundle written by train_automl (model, training column list,
#     # task, target name), one-hot encode `new_df`, align its columns to the
#     # training columns and return the model's predictions.
#     # NOTE(review): joblib.load unpickles arbitrary objects; only point
#     # model_dir at directories you trust.
#     bundle = joblib.load(os.path.join(model_dir, "model.joblib"))
#     model, cols, task, target = bundle["model"], bundle["columns"], bundle["task"], bundle["target"]
#     X = new_df.drop(columns=[target], errors="ignore")
#     # NOTE(review): drop_first=True drops the first level *present in new_df*,
#     # which can differ from the level dropped at training time (e.g. when the
#     # training baseline category is absent here). The surviving level's column
#     # is then zero-filled below and those rows are scored as the baseline.
#     # Encode without drop_first and rely on the column alignment instead.
#     X_enc = pd.get_dummies(X, drop_first=True)
#     # align
#     # (equivalent one-liner: X_enc = X_enc.reindex(columns=cols, fill_value=0))
#     for c in cols:
#         if c not in X_enc.columns:
#             X_enc[c] = 0
#     X_enc = X_enc[cols]
#     return model.predict(X_enc)

# # ------------------ Ethical Web Mining ------------------

# def allowed_by_robots(base_url: str, path: str="/"):
#     try:
#         robots = requests.get(urljoin(base_url, "/robots.txt"), timeout=10)
#         if robots.status_code != 200:
#             return True  # default allow if no robots
#         from urllib import robotparser
#         rp = robotparser.RobotFileParser()
#         rp.parse(robots.text.splitlines())
#         return rp.can_fetch("*", urljoin(base_url, path))
#     except Exception:
#         return False

# def mine_public_table(start_url: str, max_rows=500, rate_sec=1.0):
#     # Fetch `start_url` (after a robots.txt check and a `rate_sec` pause) and
#     # return at most `max_rows` rows from the first <table> on the page, or,
#     # when there is no table, from the first linked ".csv" file.
#     # Raises PermissionError when robots.txt disallows a path and ValueError
#     # when neither a table nor a CSV link is found.
#     # Respect robots.txt; fetch a table from the page if present
#     base = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(start_url))
#     if not allowed_by_robots(base, urlparse(start_url).path):
#         raise PermissionError("robots.txt disallows scraping this path.")
#     time.sleep(rate_sec)
#     r = requests.get(start_url, headers={"User-Agent": "BusinessAnalystAgent/1.0"}, timeout=15)
#     r.raise_for_status()
#     soup = BeautifulSoup(r.text, "html.parser")
#     table = soup.find("table")
#     if table is None:
#         # Try to find links to CSV
#         links = [a["href"] for a in soup.find_all("a", href=True)]
#         csv_links = [l for l in links if l.lower().endswith(".csv")]
#         if csv_links:
#             csv_url = urljoin(start_url, csv_links[0])
#             # NOTE(review): `base` is the host of start_url; an absolute href
#             # can point at another host, whose robots.txt is never consulted.
#             # Derive the base from csv_url before this check.
#             if not allowed_by_robots(base, urlparse(csv_url).path):
#                 raise PermissionError("robots.txt disallows CSV path.")
#             time.sleep(rate_sec)
#             # NOTE(review): this downloads the whole file without the
#             # User-Agent header or timeout used above and only then truncates;
#             # pd.read_csv(csv_url, nrows=max_rows) bounds the parse.
#             return pd.read_csv(csv_url).head(max_rows)
#         raise ValueError("No HTML table or CSV link found.")
#     # Parse HTML table
#     # NOTE(review): passing literal HTML to pd.read_html is deprecated in
#     # recent pandas; wrap it as io.StringIO(str(table)).
#     df = pd.read_html(str(table))[0]
#     return df.head(max_rows)

# def save_mined_csv(df: pd.DataFrame, out_dir="outputs/mined", name="mined_data.csv"):
#     ensure_dir(out_dir)
#     p = os.path.join(out_dir, safe_filename(name))
#     df.to_csv(p, index=False)
#     return p

# # ------------------ Orchestration ------------------

# def run_agent(
#     data_path_or_url: str | None = None,
#     target: str | None = None,
#     web_seed_url: str | None = None,
#     time_limit_sec: int = 300
# ):
#     # Step 0: get or mine data
#     if data_path_or_url:
#         df_raw = load_data(data_path_or_url)
#     else:
#         if not web_seed_url:
#             raise ValueError("No data provided. Provide web_seed_url to mine public data.")
#         df_raw = mine_public_table(web_seed_url)

#     # Step 1: profile + clean
#     df_clean = profile_and_clean(df_raw, out_dir="outputs/eda", title="Business Analyst EDA Report")

#     # Step 2: relationships
#     rel = compute_relationships(df_clean, out_dir="outputs/relations")

#     # Step 3: visuals
#     auto_visualize(df_clean, out_dir="outputs/visuals")

#     # Step 4: train
#     metrics = None
#     if target and target in df_clean.columns:
#         metrics = train_automl(df_clean, target=target, out_dir="outputs/model", time_limit_sec=time_limit_sec)

#     # Return pointers
#     return {
#         "cleaned_csv": "outputs/eda/cleaned_data.csv",
#         "eda_report_html": "outputs/eda/Business_Analyst_EDA_Report.html",
#         "relations_summary": "outputs/relations/relations_summary.json",
#         "visuals_dir": "outputs/visuals",
#         "model_metrics": metrics,
#         "model_dir": "outputs/model" if metrics else None
#     }

# if __name__ == "__main__":
#     # Example usage:
#     # 1) With local or remote CSV
#     # results = run_agent(data_path_or_url="data.csv", target="SalePrice")
#     # 2) Without data: ethically mine a public table
#     # results = run_agent(web_seed_url="https://www.worldometers.info/world-population/population-by-country/")
#     pass


ModuleNotFoundError: No module named 'ydata_profiling'

In [14]:
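# !pip install ydata-profiling
# NOTE(review): per the pip output recorded below, the listed ydata-profiling
# releases require Python < 3.13 (or older), so the install is rejected on this
# interpreter; use a Python <= 3.12 environment or drop the dependency.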



ERROR: Ignored the following versions that require a different python version: 4.0.0 Requires-Python >=3.7,<3.11; 4.1.0 Requires-Python >=3.7,<3.12; 4.1.1 Requires-Python >=3.7,<3.12; 4.1.2 Requires-Python >=3.7,<3.12; 4.10.0 Requires-Python >=3.7,<3.13; 4.11.0 Requires-Python >=3.7,<3.13; 4.12.0 Requires-Python >=3.7,<3.13; 4.12.1 Requires-Python >=3.7,<3.13; 4.12.2 Requires-Python >=3.7,<3.13; 4.13.0 Requires-Python >=3.7,<3.13; 4.14.0 Requires-Python >=3.7,<3.13; 4.15.0 Requires-Python >=3.7,<3.13; 4.15.1 Requires-Python >=3.7,<3.13; 4.16.0 Requires-Python >=3.7,<3.13; 4.16.1 Requires-Python >=3.7,<3.13; 4.2.0 Requires-Python >=3.7,<3.12; 4.3.0 Requires-Python >=3.7,<3.12; 4.3.1 Requires-Python >=3.7,<3.12; 4.3.2 Requires-Python >=3.7,<3.12; 4.4.0 Requires-Python >=3.7,<3.12; 4.5.0 Requires-Python >=3.7,<3.12; 4.5.1 Requires-Python >=3.7,<3.12; 4.6.0 Requires-Python >=3.7,<3.12; 4.6.1 Requires-Python >=3.7,<3.12; 4.6.2 Requires-Python >=3.7,<3.12; 4.6.3 Requires-Python >=3.7,<3.12; 

In [6]:

# pip install streamlit







SyntaxError: invalid syntax (2339512383.py, line 1)

In [7]:
# pip install streamlit

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting altair!=5.4.0,!=5.4.1,<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit)
  Downloading blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting cachetools<7,>=4.0 (from streamlit)
  Downloading cachetools-6.2.0-py3-none-any.whl.metadata (5.4 kB)
Collecting click<9,>=7.0 (from streamlit)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting protobuf<7,>=3.20 (from streamlit)
  Downloading protobuf-6.32.1-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting pyarrow>=7.0 (from streamlit)
  Downloading pyarrow-21.0.0-cp313-cp313-win_amd64.whl.metadata (3.4 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting 

In [4]:
# # app.py
# import streamlit as st
# import pandas as pd
# import numpy as np
# import plotly.express as px
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, f1_score
# from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
# from sklearn.feature_selection import chi2
# from sklearn.linear_model import SGDClassifier, SGDRegressor
# from sklearn.utils.multiclass import type_of_target
# from category_encoders.target_encoder import TargetEncoder
# import shap
# import joblib
# import os

# st.set_page_config(page_title="Business Analyst Auto-App", layout="wide")

# st.title("Business Analyst Auto-App (No Auto-Sklearn / No YData-Profiling)")

# # ---------- Helpers ----------
# def infer_types(df):
#     # Return a copy of `df` in which each object-dtype column is converted to
#     # numeric when pd.to_numeric accepts it, otherwise to datetime when
#     # pd.to_datetime accepts it; columns that fit neither stay unchanged.
#     # NOTE(review): the bare `except:` clauses below also swallow
#     # KeyboardInterrupt/SystemExit; catch (ValueError, TypeError) instead if
#     # this cell is revived.
#     df_copy = df.copy()
#     for col in df_copy.columns:
#         # try numeric conversion where possible
#         if df_copy[col].dtype == object:
#             try:
#                 df_copy[col] = pd.to_numeric(df_copy[col])
#             except:
#                 pass
#         # try datetime
#         # (only reached as a conversion when the numeric attempt left the
#         # column as object dtype)
#         if df_copy[col].dtype == object:
#             try:
#                 df_copy[col] = pd.to_datetime(df_copy[col])
#             except:
#                 pass
#     return df_copy

# def basic_clean(df):
#     # Light cleanup on a copy of `df`: drop exact duplicate rows, drop columns
#     # that are entirely null, strip whitespace in object columns. Returns the
#     # cleaned copy; the caller's frame is not modified.
#     df = df.copy()
#     # Drop exact duplicate rows
#     df = df.drop_duplicates()
#     # Remove columns with all nulls
#     df = df.dropna(axis=1, how='all')
#     # Trim whitespace in object columns
#     # NOTE(review): astype(str) turns missing values into the literal string
#     # "nan" (or "None"), so they no longer count as missing for the imputers
#     # in build_preprocessor and show up as a real category in value counts.
#     # To keep them missing, cast only the non-null cells, e.g.
#     #     df[c] = df[c].where(df[c].isna(), df[c].astype(str).str.strip())
#     for c in df.select_dtypes(include=['object']).columns:
#         df[c] = df[c].astype(str).str.strip()
#     return df

# def split_cols(df):
#     num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
#     cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
#     # treat datetimes as categorical for quick EDA; could extract parts
#     dt_cols = df.select_dtypes(include=['datetime64[ns]']).columns.tolist()
#     return num_cols, cat_cols, dt_cols

# def encode_datetime(df, dt_cols):
#     # expand datetime to parts
#     df = df.copy()
#     for c in dt_cols:
#         df[c+"_year"] = df[c].dt.year
#         df[c+"_month"] = df[c].dt.month
#         df[c+"_day"] = df[c].dt.day
#         df[c+"_dow"] = df[c].dt.dayofweek
#         df[c+"_hour"] = df[c].dt.hour
#     df = df.drop(columns=dt_cols)
#     return df

# def build_preprocessor(num_cols, cat_cols, use_target_encoder=False):
#     num_pipe = Pipeline(steps=[
#         ("impute", SimpleImputer(strategy="median")),
#         ("scale", StandardScaler())
#     ])
#     if use_target_encoder:
#         cat_pipe = Pipeline(steps=[
#             ("impute", SimpleImputer(strategy="most_frequent")),
#             ("encode", TargetEncoder())
#         ])
#     else:
#         cat_pipe = Pipeline(steps=[
#             ("impute", SimpleImputer(strategy="most_frequent")),
#             ("encode", OneHotEncoder(handle_unknown="ignore"))
#         ])
#     pre = ColumnTransformer(
#         transformers=[
#             ("num", num_pipe, num_cols),
#             ("cat", cat_pipe, cat_cols)
#         ],
#         remainder="drop"
#     )
#     return pre

# def detect_task_type(y):
#     # Map sklearn's type_of_target(y) onto the two tasks this app handles:
#     # "binary"/"multiclass" -> "classification", anything else -> "regression".
#     # NOTE(review): type_of_target labels integer-valued numeric targets
#     # (counts, whole-number prices) as "multiclass", so such regression targets
#     # are routed to the classifier; consider adding a unique-value threshold.
#     t = type_of_target(y)
#     if t in ["binary", "multiclass"]:
#         return "classification"
#     return "regression"

# def compute_relationships(df, target=None):
#     # Return (corr, mi): `corr` is the Spearman correlation matrix of the
#     # numeric columns (None with fewer than two of them); `mi` is a Series of
#     # mutual-information scores of every non-datetime feature against
#     # `target`, sorted descending (None when no valid target is given).
#     # Correlations (numeric)
#     num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
#     corr = None
#     if len(num_cols) >= 2:
#         corr = df[num_cols].corr(method="spearman")
#     # MI
#     mi = None
#     if target and target in df.columns:
#         y = df[target]
#         X = df.drop(columns=[target])
#         # quick simplification: drop datetimes if any
#         X = X.select_dtypes(exclude=["datetime64[ns]"]).copy()
#         y_type = detect_task_type(y)
#         # coerce non-numeric categorical to codes for MI
#         # (missing values become code -1, i.e. their own pseudo-category)
#         for c in X.select_dtypes(include=['object', 'category']).columns:
#             X[c] = X[c].astype('category').cat.codes
#         # NOTE(review): numeric columns are passed through unimputed; the
#         # sklearn MI estimators presumably reject NaN in X, so any missing
#         # numeric value would make this call raise -- confirm and impute/drop.
#         if y_type == "classification":
#             if y.dtype.kind not in ['i','u']:
#                 y = y.astype('category').cat.codes
#             mi_scores = mutual_info_classif(X, y, discrete_features='auto', random_state=42)
#         else:
#             # NOTE(review): errors="coerce" can introduce NaN into y, which is
#             # then handed straight to the estimator.
#             y = pd.to_numeric(y, errors="coerce")
#             mi_scores = mutual_info_regression(X, y, random_state=42)
#         mi = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
#     return corr, mi

# def train_quick_model(df, target):
#     y = df[target]
#     X = df.drop(columns=[target])
#     # keep simple: expand datetimes
#     _, _, dt_cols = split_cols(X)
#     X = encode_datetime(X, dt_cols)
#     num_cols, cat_cols, _ = split_cols(X)
#     task = detect_task_type(y)
#     pre = build_preprocessor(num_cols, cat_cols, use_target_encoder=(task=="regression"))
#     if task == "classification":
#         model = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
#     else:
#         model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
#     pipe = Pipeline(steps=[("pre", pre), ("model", model)])
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y if task=="classification" else None)
#     pipe.fit(X_train, y_train)
#     preds = pipe.predict(X_test)
#     metrics = {}
#     if task == "classification":
#         metrics["accuracy"] = accuracy_score(y_test, preds)
#         metrics["f1"] = f1_score(y_test, preds, average="weighted")
#     else:
#         metrics["r2"] = r2_score(y_test, preds)
#         metrics["mae"] = mean_absolute_error(y_test, preds)
#     return pipe, metrics, task

# def shap_importance(pipe, X_sample, max_display=10):
#     # Return the `max_display` largest mean-|SHAP| values of the fitted
#     # pipeline's "model" step on `X_sample`, as a Series indexed by the
#     # transformed feature names. Any failure yields an empty float Series.
#     try:
#         # Extract underlying estimator if in Pipeline
#         model = pipe.named_steps["model"]
#         pre = pipe.named_steps["pre"]
#         X_trans = pre.transform(X_sample)
#         explainer = shap.TreeExplainer(model)
#         shap_values = explainer.shap_values(X_trans)
#         # Feature names after transform
#         try:
#             feature_names = pre.get_feature_names_out()
#         except:
#             feature_names = [f"f_{i}" for i in range(X_trans.shape[1])]
#         # NOTE(review): for a per-class list only class 0 is ranked; averaging
#         # |values| across classes would be more representative. Newer shap
#         # releases presumably return one 3-D array instead of a list, which
#         # the else-branch would not reduce to one value per feature -- confirm.
#         if isinstance(shap_values, list):  # classification list per class
#             vals = np.abs(shap_values[0]).mean(axis=0)
#         else:
#             vals = np.abs(shap_values).mean(axis=0)
#         imp = pd.Series(vals, index=feature_names).sort_values(ascending=False).head(max_display)
#         return imp
#     # NOTE(review): `e` is never used, so every failure (column mismatch,
#     # unsupported model, shape error) silently becomes "no SHAP output".
#     # Surface the message to the caller or log it.
#     except Exception as e:
#         return pd.Series(dtype=float)

# def incremental_fit(df, target, cache_dir="models"):
#     # Update an SGD model with `df` via partial_fit and persist the pipeline
#     # to <cache_dir>/online.pkl, reusing a previously saved pipeline when one
#     # can be loaded. Returns the status string "Model updated".
#     os.makedirs(cache_dir, exist_ok=True)
#     y = df[target]
#     X = df.drop(columns=[target])
#     _, _, dt_cols = split_cols(X)
#     X = encode_datetime(X, dt_cols)
#     num_cols, cat_cols, _ = split_cols(X)
#     task = detect_task_type(y)
#     if task == "classification":
#         base = SGDClassifier(loss="log_loss", random_state=42)
#     else:
#         base = SGDRegressor(random_state=42)
#     pre = build_preprocessor(num_cols, cat_cols, use_target_encoder=(task=="regression"))
#     pipe = Pipeline(steps=[("pre", pre), ("model", base)])
#     # NOTE(review): when the load succeeds, `pipe` is the cached pipeline but
#     # `pre` is still the fresh, unfitted preprocessor built above. The code
#     # below refits that fresh `pre` on the new batch, so the cached model is
#     # updated with features whose scaling/one-hot layout can differ from what
#     # it saw before, and the preprocessor saved inside `pipe` never matches.
#     # Use pipe.named_steps["pre"] (transform only) once a cache exists.
#     # The bare `except:` also hides corrupt-file errors; catch
#     # FileNotFoundError. joblib.load unpickles arbitrary objects, so the cache
#     # directory must be trusted.
#     try:
#         pipe = joblib.load(os.path.join(cache_dir, "online.pkl"))
#     except:
#         pass
#     # Simple partial fit loop (convert y for classification if needed)
#     # NOTE(review): category codes are recomputed from this batch alone, so
#     # the label -> code mapping (and `classes`) can change between calls.
#     if task == "classification" and y.dtype.kind not in ['i','u']:
#         y_codes = y.astype('category').cat.codes
#         classes = np.unique(y_codes)
#         pipe.named_steps["model"].partial_fit(pre.fit_transform(X), y_codes, classes=classes)
#     else:
#         # NOTE(review): classification with integer labels also lands here,
#         # where partial_fit is called without `classes`; SGDClassifier
#         # requires `classes` on its first partial_fit call.
#         y_num = pd.to_numeric(y, errors="coerce")
#         m = pipe.named_steps["model"]
#         # NOTE(review): for regression `pre` contains TargetEncoder, a
#         # supervised encoder that presumably needs y at fit time -- confirm;
#         # if so this call must be pre.fit_transform(X, y_num).
#         Xt = pre.fit_transform(X)
#         # mini-batch updates
#         for i in range(0, Xt.shape[0], 512):
#             m.partial_fit(Xt[i:i+512], y_num.iloc[i:i+512])
#     joblib.dump(pipe, os.path.join(cache_dir, "online.pkl"))
#     return "Model updated"

# # ---------- UI ----------
# uploaded = st.file_uploader("Upload a CSV", type=["csv"])
# target = st.text_input("Optional: enter target column for modeling (for SHAP and suggestions)")

# if uploaded is not None:
#     df_raw = pd.read_csv(uploaded)
#     st.subheader("Raw preview")
#     st.dataframe(df_raw.head(20))

#     df1 = infer_types(df_raw)
#     df2 = basic_clean(df1)
#     st.subheader("After type inference and basic cleaning")
#     st.write(f"Rows: {df2.shape[0]}, Cols: {df2.shape[1]}")
#     st.dataframe(df2.head(20))

#     # Datetime expansion for EDA charts
#     num_cols, cat_cols, dt_cols = split_cols(df2)
#     df_eda = encode_datetime(df2, dt_cols)

#     # Relationships
#     st.subheader("Relationships")
#     corr, mi = compute_relationships(df2, target=target if target in df2.columns else None)
#     if corr is not None:
#         st.write("Spearman correlation (numeric):")
#         st.dataframe(corr.round(3))
#         if len(corr.columns) > 1:
#             fig = px.imshow(corr, color_continuous_scale="Blues", title="Correlation Heatmap")
#             st.plotly_chart(fig, use_container_width=True)
#     if mi is not None:
#         st.write("Mutual Information with target:")
#         st.dataframe(mi.to_frame("MI").round(4))

#     # Visualizations
#     st.subheader("Quick Visuals")
#     # Numeric hist
#     if len(num_cols) > 0:
#         sel_num = st.selectbox("Numeric column for histogram", options=num_cols)
#         fig = px.histogram(df2, x=sel_num, nbins=40)
#         st.plotly_chart(fig, use_container_width=True)
#     # Category count
#     if len(cat_cols) > 0:
#         sel_cat = st.selectbox("Categorical column for counts", options=cat_cols)
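#         # NOTE(review): on pandas >= 2.0, value_counts().reset_index() yields the
#         # columns [sel_cat, "count"], so x="index" no longer exists and this call
#         # fails; use x=sel_cat, y="count" there.
#         fig = px.bar(df2[sel_cat].value_counts().reset_index(), x="index", y=sel_cat)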
#         st.plotly_chart(fig, use_container_width=True)

#     # Modeling + SHAP
#     if target and target in df2.columns:
#         st.subheader("Auto Model + SHAP")
#         pipe, metrics, task = train_quick_model(df2, target)
#         st.write(f"Task: {task}, Metrics: {metrics}")
#         # SHAP top features
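#         # NOTE(review): train_quick_model fits the pipeline on datetime-expanded
#         # features (encode_datetime), but this sample is taken from df2 as-is.
#         # When df2 has datetime columns the transform inside shap_importance
#         # fails, the error is swallowed there, and no SHAP table is shown.
#         # Apply encode_datetime to X_sample before passing it on.
#         X_sample = df2.drop(columns=[target]).sample(min(500, df2.shape[0]), random_state=42)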
#         imp = shap_importance(pipe, X_sample)
#         if not imp.empty:
#             st.write("Top features by SHAP (mean |value|):")
#             st.dataframe(imp.to_frame("importance").round(5))
#             fig = px.bar(imp.sort_values(ascending=True), orientation="h", title="SHAP mean |importance|")
#             st.plotly_chart(fig, use_container_width=True)

#         # Suggestions (rule + model-driven)
#         st.subheader("Decision Suggestions")
#         suggestions = []
#         # Example heuristic: if MI shows strong drivers, suggest focusing
#         if mi is not None and not mi.empty:
#             top_driver = mi.index[0]
#             suggestions.append(f"Investigate segments by '{top_driver}' for targeted campaigns or pricing, as it shows highest dependency with the target.")
#         # If classification and low accuracy, suggest data improvements
#         if task == "classification" and metrics.get("accuracy", 1) < 0.8:
#             suggestions.append("Model accuracy is below 0.8; consider collecting more labeled data, balancing classes, or enriching features.")
#         # If regression and low R2
#         if task == "regression" and metrics.get("r2", 1) < 0.6:
#             suggestions.append("Predictive power is modest; explore additional business drivers, lag features, seasonality, or external benchmarks.")
#         # If categorical cardinality high
#         high_card = [c for c in cat_cols if df2[c].nunique() > 100]
#         if high_card:
#             suggestions.append(f"High-cardinality categories detected ({', '.join(high_card[:3])}). Consider grouping or target encoding for better generalization.")
#         # General BI actions
#         suggestions.append("Build segment dashboards for top 3 drivers and monitor weekly KPIs to detect drift and outliers.")
#         suggestions.append("Run A/B tests on offers or pricing in high-impact segments indicated by SHAP to validate uplift.")
#         st.write("- " + "\n- ".join(suggestions))

#         # Online learning button
#         if st.button("Incrementally train for future data (online)"):
#             msg = incremental_fit(df2, target)
#             st.success(msg)

#     else:
#         st.info("Enter a valid target column to enable modeling, SHAP, and decision suggestions.")




In [3]:
# pip install --upgrade ipywidgets jupyter

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Collecting notebook (from jupyter)
  Downloading notebook-7.4.5-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter)
  Downloading fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.11.0->jupyter-server<3,>=2.4.0->jupyterlab->jupyter)
  Downloading isoduration-20.11

In [2]:
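# streamlit run app.py
# NOTE(review): this is a shell command, not Python; run as a bare cell it
# produced the SyntaxError recorded below. Run it from a terminal, or prefix it
# with "!" in a notebook cell (!streamlit run app.py).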


SyntaxError: invalid syntax (507122745.py, line 1)