In [2]:
!mkdir smart_data_analyzer
%cd smart_data_analyzer

/content/smart_data_analyzer


In [4]:
!./venv/bin/pip install package_name

/bin/bash: line 1: ./venv/bin/pip: No such file or directory


In [5]:
!./venv/bin/python your_script.py

/bin/bash: line 1: ./venv/bin/python: No such file or directory


In [8]:
!./venv/bin/pip install pandas numpy scikit-learn matplotlib joblib

/bin/bash: line 1: ./venv/bin/pip: No such file or directory


In [14]:
"""
smart_data_analyzer.py

Usage:
  - Default (creates synthetic data):
      python smart_data_analyzer.py

  - Run on your CSV:
      python smart_data_analyzer.py --input path/to/your.csv --target target_column_name --outdir ./output

Outputs (in outdir):
  - sample or processed csv
  - plots: age_histogram.png, income_vs_avgspend.png, target_distribution.png, roc_curve.png
  - model artifacts: preprocessor.joblib, selector.joblib, model.joblib
"""
import os
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
import joblib

def create_synthetic(n=800, seed=42):
    rng = np.random.default_rng(seed)
    age = rng.integers(18, 75, size=n)
    income = (age * rng.normal(800, 120, size=n) / 10 + rng.normal(2000, 1000, size=n)).astype(int)
    gender = rng.choice(['Male', 'Female'], size=n, p=[0.48, 0.52])
    city = rng.choice(['CityA', 'CityB', 'CityC', 'CityD'], size=n, p=[0.4,0.3,0.2,0.1])
    visits_last_month = rng.poisson(3, size=n)
    avg_spend = np.round(np.abs(rng.normal(300, 120, size=n)), 2)
    target = ((income > np.percentile(income, 60)).astype(int) & (visits_last_month >= 3)).astype(int)

    df = pd.DataFrame({
        'age': age,
        'income': income.astype(float),
        'gender': gender,
        'city': city,
        'visits_last_month': visits_last_month,
        'avg_spend': avg_spend,
        'target_high_value': target
    })

    # inject a few missing values for demo
    for col in ['income', 'avg_spend']:
        idx = rng.choice(df.index, size=int(0.03 * n), replace=False)
        df.loc[idx, col] = np.nan
    return df

def save_plot_hist(series, path, title="Histogram", xlabel=None):
    plt.figure()
    plt.hist(series.dropna(), bins=12)
    plt.title(title)
    if xlabel: plt.xlabel(xlabel)
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig(path)
    plt.close()

def save_plot_scatter(x, y, path, title="Scatter", xlabel=None, ylabel=None):
    plt.figure()
    plt.scatter(x.fillna(x.median()), y.fillna(y.median()), alpha=0.6)
    plt.title(title)
    if xlabel: plt.xlabel(xlabel)
    if ylabel: plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(path)
    plt.close()

def save_plot_pie(series, path, title="Distribution"):
    plt.figure()
    vc = series.value_counts()
    plt.pie(vc, labels=vc.index.astype(str), autopct='%1.1f%%')
    plt.title(title)
    plt.tight_layout()
    plt.savefig(path)
    plt.close()

def ensure_dir(d):
    if not os.path.exists(d):
        os.makedirs(d, exist_ok=True)

def main(args):
    outdir = args.outdir or "./output"
    ensure_dir(outdir)

    # 1) Load or create data
    if args.input:
        print(f"Loading input CSV: {args.input}")
        df = pd.read_csv(args.input)
        if args.target is None:
            raise SystemExit("When providing --input, you must provide --target target_column_name")
        target_col = args.target
    else:
        print("No input provided — creating synthetic dataset.")
        df = create_synthetic()
        target_col = 'target_high_value'

    print("Data shape:", df.shape)
    sample_csv = os.path.join(outdir, "data_sample.csv")
    df.head(100).to_csv(sample_csv, index=False)
    print("Saved sample rows to", sample_csv)

    # Quick EDA: dtypes and missing values
    print("\n>>> Quick EDA")
    print(df.dtypes)
    print("Missing values:\n", df.isnull().sum())

    # Make some helpful plots if relevant columns exist
    # heuristics: use 'age', 'income', 'avg_spend' if present
    if 'age' in df.columns:
        p = os.path.join(outdir, 'age_histogram.png')
        save_plot_hist(df['age'], p, title="Age distribution", xlabel="Age")
        print("Saved", p)
    if 'income' in df.columns and 'avg_spend' in df.columns:
        p = os.path.join(outdir, 'income_vs_avgspend.png')
        save_plot_scatter(df['income'], df['avg_spend'], p, title="Income vs Avg Spend", xlabel="Income", ylabel="Avg Spend")
        print("Saved", p)

    if target_col in df.columns:
        p = os.path.join(outdir, 'target_distribution.png')
        save_plot_pie(df[target_col], p, title=f"{target_col} distribution")
        print("Saved", p)
    else:
        print(f"Warning: target column '{target_col}' not in dataset; skipping distribution plot.")

    # 2) Preprocessing - separate X,y
    if target_col not in df.columns:
        raise SystemExit(f"Target column '{target_col}' not found in data.")
    X = df.drop(columns=[target_col])
    y = df[target_col]

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
    print("Numeric cols:", numeric_cols)
    print("Categorical cols:", cat_cols)

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

    # 3) Split and train baseline RandomForest (for importance)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
    print("Split sizes:", X_train.shape, X_test.shape)

    X_train_prep = preprocessor.fit_transform(X_train)
    X_test_prep = preprocessor.transform(X_test)
    print("Preprocessing completed. Shapes:", X_train_prep.shape, X_test_prep.shape)

    rf = RandomForestClassifier(n_estimators=200, random_state=42)
    rf.fit(X_train_prep, y_train)
    train_acc = accuracy_score(y_train, rf.predict(X_train_prep))
    test_acc = accuracy_score(y_test, rf.predict(X_test_prep))
    print(f"RandomForest train acc: {train_acc:.4f}, test acc: {test_acc:.4f}")
    print("Classification report (test):")
    print(classification_report(y_test, rf.predict(X_test_prep)))

    # Map importances back to names
    try:
        ohe = preprocessor.named_transformers_['cat'].named_steps['onehot']
        ohe_features = list(ohe.get_feature_names_out(cat_cols))
    except Exception:
        ohe_features = []
    feature_names = numeric_cols + ohe_features
    importances = rf.feature_importances_
    fi_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values('importance', ascending=False)
    fi_path = os.path.join(outdir, 'feature_importances.csv')
    fi_df.to_csv(fi_path, index=False)
    print("Saved feature importances to", fi_path)

    # 4) Select features and retrain lightweight model
    selector = SelectFromModel(rf, threshold='median', prefit=True)
    X_train_sel = selector.transform(X_train_prep)
    X_test_sel = selector.transform(X_test_prep)
    print("Selected feature shapes:", X_train_sel.shape, X_test_sel.shape)

    logreg = LogisticRegression(max_iter=1000)
    logreg.fit(X_train_sel, y_train)
    pred = logreg.predict(X_test_sel)
    probs = logreg.predict_proba(X_test_sel)[:,1] if hasattr(logreg, 'predict_proba') else None
    print("Logistic Regression test accuracy:", accuracy_score(y_test, pred))
    print(classification_report(y_test, pred))
    if probs is not None:
        try:
            auc = roc_auc_score(y_test, probs)
            print("ROC AUC:", auc)
            fpr, tpr, _ = roc_curve(y_test, probs)
            plt.figure()
            plt.plot(fpr, tpr)
            plt.plot([0,1], [0,1], linestyle='--')
            plt.title('ROC Curve (LogReg)')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            roc_path = os.path.join(outdir, 'roc_curve.png')
            plt.tight_layout()
            plt.savefig(roc_path)
            plt.close()
            print("Saved ROC curve to", roc_path)
        except Exception as e:
            print("Could not compute ROC AUC:", e)

    # 5) Save processed dataset & artifacts
    # Build final feature names for the full preprocessor transform
    X_all_prep = preprocessor.transform(X)
    try:
        final_feature_names = numeric_cols + list(preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(cat_cols))
    except Exception:
        final_feature_names = [f"f{i}" for i in range(X_all_prep.shape[1])]
    processed_csv = os.path.join(outdir, 'processed_features.csv')
    pd.DataFrame(X_all_prep, columns=final_feature_names).to_csv(processed_csv, index=False)
    print("Saved processed features to", processed_csv)

    # Save artifacts
    joblib.dump(preprocessor, os.path.join(outdir, 'preprocessor.joblib'))
    joblib.dump(selector, os.path.join(outdir, 'selector.joblib'))
    joblib.dump(logreg, os.path.join(outdir, 'model.joblib'))
    print("Saved artifacts to", outdir)

    # Report selected features
    try:
        mask = selector.get_support()
        selected_feats = [fn for fn, m in zip(final_feature_names, mask) if m]
        print("Selected features after thresholding:", selected_feats)
    except Exception:
        print("Could not list selected features.")

    print("\nDone. See outputs in:", outdir)

# Define a simple object to mimic the argparse Namespace
class Args:
    def __init__(self, input=None, target=None, outdir='./output'):
        self.input = input
        self.target = target
        self.outdir = outdir

# Create an instance of Args and call main
# To use synthetic data:
main(Args())

# To use your own CSV, uncomment the following line and replace the placeholders:
# main(Args(input='path/to/your.csv', target='target_column_name', outdir='./output'))

No input provided — creating synthetic dataset.
Data shape: (800, 7)
Saved sample rows to ./output/data_sample.csv

>>> Quick EDA
age                    int64
income               float64
gender                object
city                  object
visits_last_month      int64
avg_spend            float64
target_high_value      int64
dtype: object
Missing values:
 age                   0
income               24
gender                0
city                  0
visits_last_month     0
avg_spend            24
target_high_value     0
dtype: int64
Saved ./output/age_histogram.png
Saved ./output/income_vs_avgspend.png
Saved ./output/target_distribution.png
Numeric cols: ['age', 'income', 'visits_last_month', 'avg_spend']
Categorical cols: ['gender', 'city']
Split sizes: (640, 6) (160, 6)
Preprocessing completed. Shapes: (640, 10) (160, 10)
RandomForest train acc: 1.0000, test acc: 1.0000
Classification report (test):
              precision    recall  f1-score   support

           0       1.00 

In [16]:
!python smart_data_analyzer.py

python3: can't open file '/content/smart_data_analyzer/smart_data_analyzer.py': [Errno 2] No such file or directory


In [18]:
!python smart_data_analyzer.py --input path/to/your_file.csv --target your_target_column --outdir ./my_analysis

python3: can't open file '/content/smart_data_analyzer/smart_data_analyzer.py': [Errno 2] No such file or directory
