In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor
import os
import pickle
from sklearn.datasets import load_digits


# 1. Dataset Input Options
def load_dataset(choice):
    """Return ``(df, task_type)`` for the dataset selected by ``choice``.

    ``'1'``, ``'2'`` and ``'3'`` load scikit-learn's Iris, Wine and California
    Housing datasets and place the label in a column named ``'target'``.
    ``'4'`` prompts for a URL and ``'5'`` for a local path; both are read with
    ``pd.read_csv``. The task type comes from ``detect_task_type``.

    Raises:
        ValueError: if ``choice`` is not one of the five menu options.
    """
    def _with_target(bunch):
        # Feature columns and the label (renamed to 'target') side by side.
        return pd.concat([bunch.data, bunch.target.rename('target')], axis=1)

    if choice in ('1', '2', '3'):
        # Import only the loader that is actually needed.
        if choice == '1':
            from sklearn.datasets import load_iris as loader
        elif choice == '2':
            from sklearn.datasets import load_wine as loader
        else:
            from sklearn.datasets import fetch_california_housing as loader
        frame = _with_target(loader(as_frame=True))
    elif choice == '4':
        frame = pd.read_csv(input("Enter the URL to the dataset: "))
    elif choice == '5':
        frame = pd.read_csv(input("Enter the path to the dataset file: "))
    else:
        raise ValueError("Invalid dataset choice")

    return frame, detect_task_type(frame)

# 2. Data Preprocessing
def preprocess_data(X, strategy_num, strategy_cat, scale_option):
    """Impute missing values, one-hot encode categoricals, and optionally scale.

    Works on a copy, so the caller's frame is left untouched. All transformers
    are fit on every row passed in.

    Args:
        X: feature DataFrame.
        strategy_num: menu key for numeric imputation: '1' mean, '2' median,
            '3' most frequent, '4' constant -1.
        strategy_cat: menu key for categorical imputation: '1' most frequent,
            '2' constant 'missing'. Only consulted when X has non-numeric
            columns.
        scale_option: 'yes' to scale the numeric columns; the scaler is then
            chosen interactively ('1' StandardScaler, anything else
            MinMaxScaler).

    Returns:
        A new DataFrame with the numeric columns followed by the one-hot
        columns, indexed like ``X``.

    Raises:
        ValueError: if a required strategy key is not one of the menu options.
    """
    numeric_strategies = {
        '1': {'strategy': 'mean'},
        '2': {'strategy': 'median'},
        '3': {'strategy': 'most_frequent'},
        '4': {'strategy': 'constant', 'fill_value': -1},
    }
    categorical_strategies = {
        '1': {'strategy': 'most_frequent'},
        '2': {'strategy': 'constant', 'fill_value': 'missing'},
    }

    # Fail with a clear message instead of an UnboundLocalError further down.
    if strategy_num not in numeric_strategies:
        raise ValueError(
            f"Invalid numerical imputation strategy: {strategy_num!r} "
            f"(expected one of {sorted(numeric_strategies)})"
        )

    X = X.copy()
    num_cols = X.select_dtypes(include=[np.number]).columns
    cat_cols = X.select_dtypes(exclude=[np.number]).columns

    if len(cat_cols) > 0 and strategy_cat not in categorical_strategies:
        raise ValueError(
            f"Invalid categorical imputation strategy: {strategy_cat!r} "
            f"(expected one of {sorted(categorical_strategies)})"
        )

    # Impute missing values for numerical columns
    if len(num_cols) > 0:
        imputer_num = SimpleImputer(**numeric_strategies[strategy_num])
        X[num_cols] = imputer_num.fit_transform(X[num_cols])

    parts = [X[num_cols]]

    if len(cat_cols) > 0:
        # Impute missing values for categorical columns
        imputer_cat = SimpleImputer(**categorical_strategies[strategy_cat])
        X[cat_cols] = imputer_cat.fit_transform(X[cat_cols])

        # Encode categorical columns. Newer scikit-learn renamed `sparse` to
        # `sparse_output` and `get_feature_names` to `get_feature_names_out`;
        # fall back so the old releases keep working.
        try:
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(X[cat_cols])
        if hasattr(encoder, 'get_feature_names_out'):
            encoded_names = encoder.get_feature_names_out(cat_cols)
        else:
            encoded_names = encoder.get_feature_names(cat_cols)
        # Reuse X's index so the column-wise concat lines rows up correctly.
        parts.append(pd.DataFrame(encoded, columns=encoded_names, index=X.index))

    X = pd.concat(parts, axis=1)

    # Scale numeric features
    if scale_option == 'yes':
        if len(num_cols) > 0:
            print("Scaling features...")
            scaler = StandardScaler() if input("Choose scaling method (1 for StandardScaler, 2 for MinMaxScaler): ") == '1' else MinMaxScaler()
            X[num_cols] = scaler.fit_transform(X[num_cols])
        else:
            print("No numeric features to scale; skipping scaling.")

    return X

# 3. Target Detection
def detect_task_type(df):
    """Classify the learning task from the target column of ``df``.

    The target column is the one returned by ``detect_target_column``.

    Returns:
        "regression" when the target is numeric (any integer or float dtype,
        not only int64/float64) and has more than 10 distinct values;
        otherwise "classification".
    """
    target = df[detect_target_column(df)]
    # is_numeric_dtype also covers int32, float32 and pandas nullable dtypes,
    # which a literal comparison against 'float64'/'int64' would miss.
    if pd.api.types.is_numeric_dtype(target) and target.nunique() > 10:
        return "regression"
    return "classification"

def detect_target_column(df):
    """Return the name of the column to treat as the prediction target.

    A column literally named 'target' wins; otherwise a notice is printed and
    the right-most column of ``df`` is returned.
    """
    if 'target' in df.columns:
        return 'target'

    print("No explicitly named 'target' column found. Assuming the last column on the right is the target variable.")
    fallback = df.columns[-1]
    return fallback

# 4. Feature Engineering
def feature_selection(X, y):
    """Show each feature's absolute correlation with ``y`` and drop the ones the user picks.

    Every column of ``X`` is listed with a 1-based number. Features (or a
    target) that are not numeric/boolean are listed with ``n/a`` instead of a
    correlation. The user then either enters 'S' to skip or a comma-separated
    list of numbers to drop.

    Args:
        X: feature DataFrame.
        y: target values, assigned as a column next to ``X`` for the
            correlation computation.

    Returns:
        ``X`` unchanged when nothing valid was selected, otherwise a new
        DataFrame without the selected columns.
    """
    feature_list = list(X.columns)

    # Use a column name that cannot collide with an existing feature, so a
    # feature that happens to be called 'target' is not overwritten.
    target_key = 'target'
    while target_key in X.columns:
        target_key = '_' + target_key

    corr_input = X.copy()
    corr_input[target_key] = y
    corr = corr_input.select_dtypes(include=['number', 'bool']).corr().abs()
    target_is_numeric = target_key in corr.columns

    # Display correlation with target column, excluding the target column itself
    print("\nCorrelation with target column:")
    for position, col in enumerate(feature_list, start=1):
        if target_is_numeric and col in corr.index:
            print(f"{position}: {col} - {corr.loc[col, target_key]:.2f}")
        else:
            print(f"{position}: {col} - n/a")

    # Ask user which columns to drop based on correlation
    drop_columns_input = input("\nPress 'S' to skip or enter the numbers (comma-separated) of columns to drop: ").strip()

    if drop_columns_input.lower() == 's':
        print("Skipping feature selection.")
        return X

    drop_columns = []
    ignored = []
    for token in drop_columns_input.split(','):
        token = token.strip()
        if not token:
            continue
        # Only 1..len(feature_list) is valid: 0 would otherwise wrap around to
        # the last feature and larger numbers would raise IndexError.
        if token.isdecimal() and 1 <= int(token) <= len(feature_list):
            name = feature_list[int(token) - 1]
            if name not in drop_columns:
                drop_columns.append(name)
        else:
            ignored.append(token)

    if ignored:
        print(f"Ignoring invalid selections: {ignored}")

    if drop_columns:
        X = X.drop(columns=drop_columns)
        print(f"Dropped columns: {drop_columns}")

    return X

# 5. Model Selection
def select_model(task_type):
    """Print the model menu for ``task_type`` and return the estimator the user picks.

    Supports "classification" and "regression"; any other task type returns
    None without prompting. An answer that is not one of the menu keys raises
    KeyError.
    """
    if task_type == "classification":
        heading = "\nAvailable models for classification:"
        menu = "1: RandomForestClassifier\n2: XGBoostClassifier\n3: LogisticRegression\n4: KNeighborsClassifier"

        def build_catalog():
            return {
                '1': RandomForestClassifier(random_state=42),
                '2': XGBClassifier(random_state=42),
                '3': LogisticRegression(max_iter=200),
                '4': KNeighborsClassifier()
            }
    elif task_type == "regression":
        heading = "\nAvailable models for regression:"
        menu = "1: RandomForestRegressor\n2: XGBoostRegressor\n3: LinearRegression\n4: DecisionTreeRegressor"

        def build_catalog():
            return {
                '1': RandomForestRegressor(random_state=42),
                '2': XGBRegressor(random_state=42),
                '3': LinearRegression(),
                '4': DecisionTreeRegressor()
            }
    else:
        return None

    print(heading)
    print(menu)
    picked = input("Select model: ")
    # The estimators are instantiated only after the user has answered.
    return build_catalog()[picked]

# 6. Model Evaluation
def evaluate_model(model, task_type, X_test, y_test, y_pred):
    """Print evaluation metrics comparing ``y_pred`` with ``y_test``.

    For "classification": accuracy (as a percentage), the confusion matrix and
    the classification report. For "regression": MSE, MAE and R². Any other
    task type prints nothing. ``model`` and ``X_test`` are accepted but not
    used here.
    """
    if task_type == "regression":
        squared_err = mean_squared_error(y_test, y_pred)
        absolute_err = mean_absolute_error(y_test, y_pred)
        r_squared = r2_score(y_test, y_pred)
        print(f"Mean Squared Error: {squared_err:.2f}")
        print(f"Mean Absolute Error: {absolute_err:.2f}")
        print(f"R² Score: {r_squared:.2f}")
        return

    if task_type != "classification":
        return

    hit_rate = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {hit_rate * 100:.2f}%")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

# Main function
def main():
    """Run the interactive pipeline end to end.

    Steps: choose and load a dataset, preprocess the features, optionally drop
    features, split 70/30, pick and fit a model, print its evaluation, and
    optionally save the model ('trained_model.pkl') and the test predictions
    ('predictions.csv') in the current working directory.
    """
    print("Select a dataset:")
    print("1: Iris dataset\n2: Wine dataset\n3: California Housing dataset\n4: Load dataset from URL\n5: Load dataset from local file")
    choice = input("Enter the number of the option you want to use: ")
    df, task_type = load_dataset(choice)

    # Preprocessing
    # Resolve the target column once so the "no 'target' column" notice is not
    # printed again for every lookup.
    target_column = detect_target_column(df)
    X = df.drop(columns=[target_column])
    y = df[target_column]
    print("Choose strategy for numerical missing values:\n1: Mean\n2: Median\n3: Mode\n4: Fill with -1")
    strategy_num = input("Enter your choice: ")
    print("Choose strategy for categorical missing values:\n1: Most frequent\n2: Fill with 'missing'")
    strategy_cat = input("Enter your choice: ")
    # Normalize so answers like "Yes" or " yes " are honoured as well.
    scale_option = input("Would you like to scale numerical features? (yes/no): ").strip().lower()
    X_cleaned = preprocess_data(X, strategy_num, strategy_cat, scale_option)

    # Feature Selection
    X_cleaned = feature_selection(X_cleaned, y)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y, test_size=0.3, random_state=42)

    # Model Selection
    model = select_model(task_type)

    # Model Training
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Model Evaluation
    evaluate_model(model, task_type, X_test, y_test, y_pred)

    # Save Model
    if input("Would you like to save the model? (yes/no): ").strip().lower() == 'yes':
        with open('trained_model.pkl', 'wb') as f:
            pickle.dump(model, f)
        print("Model saved as 'trained_model.pkl'.")

    # Save Predictions
    if input("Would you like to save the predictions? (yes/no): ").strip().lower() == 'yes':
        predictions_df = pd.DataFrame({"Actual": y_test.values, "Predicted": y_pred})
        predictions_df.to_csv("predictions.csv", index=False)
        print("Predictions saved as 'predictions.csv'.")

# Run the interactive pipeline only when this file is executed directly;
# importing it as a module defines the functions without prompting for input.
if __name__ == "__main__":
    main()

0.23.1
Select a dataset:
1: Iris dataset
2: Wine dataset
3: California Housing dataset
4: Load dataset from URL
5: Load dataset from local file
Enter the number of the option you want to use: 2
Choose strategy for numerical missing values:
1: Mean
2: Median
3: Mode
4: Fill with -1
Enter your choice: 3
Choose strategy for categorical missing values:
1: Most frequent
2: Fill with 'missing'
Enter your choice: 1
Would you like to scale numerical features? (yes/no): yes
Scaling features...
Choose scaling method (1 for StandardScaler, 2 for MinMaxScaler): 2

Correlation with target column:
1: alcohol - 0.33
2: malic_acid - 0.44
3: ash - 0.05
4: alcalinity_of_ash - 0.52
5: magnesium - 0.21
6: total_phenols - 0.72
7: flavanoids - 0.85
8: nonflavanoid_phenols - 0.49
9: proanthocyanins - 0.50
10: color_intensity - 0.27
11: hue - 0.62
12: od280/od315_of_diluted_wines - 0.79
13: proline - 0.63

Press 'S' to skip or enter the numbers (comma-separated) of columns to drop: 11
Dropped columns: ['hue']