# ML Analyzer: Predict & Classify Any Dataset

This notebook implements a machine learning analyzer that can perform classification, regression, and clustering tasks on datasets. The implementation includes preprocessing, model training, evaluation, and visualization.

## 1. Import Libraries

First, we import all the necessary libraries for data manipulation, visualization, and machine learning.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
)
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans

# Notebook-wide plotting defaults: whitegrid style sheet and a 10x6 default figure size
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 2. Data Loading and Exploration

This section handles loading the dataset and exploring its basic properties.

In [None]:
# Function to load dataset
def load_dataset(file_path):
    """
    Read the CSV at ``file_path`` into a pandas DataFrame.

    On success the DataFrame's shape is printed and the DataFrame returned.
    Any exception raised while reading is caught, reported with ``print``,
    and signalled to the caller by returning None.
    """
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        # Best-effort loader: report the problem and let the caller test for None
        print(f"Error loading dataset: {str(err)}")
        return None
    else:
        print(f"Dataset loaded successfully with shape: {frame.shape}")
        return frame

In [None]:
# Load your dataset here
# Replace 'your_dataset.csv' with your actual file path
df = load_dataset('your_dataset.csv')

# Display basic information about the dataset
if df is not None:
    print("\nDataset Preview:")
    display(df.head())

    print("\nDataset Information:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() would only append a stray "None" to the output.
    df.info()

    print("\nDataset Statistics:")
    display(df.describe())

## 3. Data Preprocessing

This section preprocesses the data: it fills in missing values and encodes categorical features as integers.

In [None]:
def preprocess_data(df):
    """
    Impute missing values and label-encode non-numeric columns.

    The input is copied first, so the caller's DataFrame is not modified.

    Missing values:
        * float columns (any width) are filled with the column mean;
        * every other column is filled with its most frequent value (mode);
        * a column with no non-missing values has neither a mean nor a mode
          and is left unchanged, with a warning printed.

    Encoding:
        Columns of object, string or categorical dtype are replaced by the
        integer codes of a ``LabelEncoder`` fitted on that column.

    Args:
        df: Input DataFrame, or None.

    Returns:
        Tuple ``(processed_df, label_encoders)``; ``label_encoders`` maps each
        encoded column name to its fitted ``LabelEncoder``. Returns
        ``(None, {})`` when ``df`` is None.
    """
    if df is None:
        return None, {}

    # Make a copy to avoid modifying the original
    processed_df = df.copy()
    label_encoders = {}

    # Handle missing values
    for col in processed_df.columns:
        series = processed_df[col]
        if not series.isna().any():
            continue

        if pd.api.types.is_float_dtype(series):
            # Dtype-family check (not a literal 'float64' comparison) so that
            # float32 and nullable float columns are mean-filled as well.
            fill_value = series.mean()
        else:
            modes = series.mode()
            fill_value = modes.iloc[0] if not modes.empty else None

        if pd.isna(fill_value):
            # Entirely-missing column: nothing sensible to impute with
            print(f"Warning: column '{col}' has no non-missing values; left unchanged")
            continue

        processed_df[col] = series.fillna(fill_value)

    # Label encode categorical columns
    for col in processed_df.columns:
        dtype = processed_df[col].dtype
        needs_encoding = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if needs_encoding:
            le = LabelEncoder()
            processed_df[col] = le.fit_transform(processed_df[col])
            label_encoders[col] = le
            print(f"Encoded column '{col}' with {len(le.classes_)} unique values")

    return processed_df, label_encoders

In [None]:
# Preprocess the dataset
# preprocess_data() returns (None, {}) when df is None, so calling it
# unconditionally guarantees that processed_df and label_encoders are always
# defined for the later cells that test `processed_df is not None`.
processed_df, label_encoders = preprocess_data(df)

if processed_df is not None:
    print("\nPreprocessed Dataset Preview:")
    display(processed_df.head())

## 4. Feature and Target Selection

This section handles selecting features and target variables for different tasks.

In [None]:
def select_features_target(df, target_col=None, task="classification"):
    """
    Split a DataFrame into a feature table and a target column.

    For the "clustering" task every column is a feature and the target is
    None. For any other task ``target_col`` must name an existing column: it
    becomes the target and all remaining columns become the features.

    Returns ``(None, None)`` when ``df`` is None, or (after printing an error)
    when a target is required but ``target_col`` is missing or unknown.
    """
    if df is None:
        return None, None

    all_columns = list(df.columns)

    # Unsupervised task: nothing to separate out
    if task == "clustering":
        return df[all_columns], None

    # Supervised task: the target must be one of the existing columns
    has_valid_target = target_col is not None and target_col in df.columns
    if not has_valid_target:
        print("Error: Please specify a valid target column")
        return None, None

    predictors = [name for name in all_columns if name != target_col]
    return df[predictors], df[target_col]

In [None]:
# Set your task and target column
# Options for task: "classification", "regression", "clustering"
task = "classification"  # Change this to your desired task
target_col = "Subscribed"  # Change this to your target column name

# Define X and y up front so the `X is not None` / `y is not None` guards in
# the later cells are valid even when feature selection is skipped.
X, y = None, None

if processed_df is not None:
    X, y = select_features_target(processed_df, target_col, task)

    if X is not None:
        print(f"Selected {len(X.columns)} features for {task} task")
        if task != "clustering":
            print(f"Target column: {target_col}")
            if task == "classification":
                print(f"Target distribution:\n{y.value_counts()}")

## 5. Classification Models

This section implements and evaluates classification models.

In [None]:
def train_classification_model(X, y, algorithm="KNN"):
    """
    Train a classifier on an 80/20 train/test split and report its quality.

    Args:
        X: Feature table (rows are samples).
        y: Target labels aligned with ``X``.
        algorithm: "KNN" (3 neighbours) or "SVM" (RBF kernel); any other
            value selects a decision tree (entropy criterion, max depth 3).

    Returns:
        Tuple ``(model, X_test, y_test, y_pred)``. When ``X`` or ``y`` is None
        nothing is trained and ``(None, None, None, None)`` is returned, so
        callers can always unpack four values.

    Side effects:
        Prints the split shapes, accuracy, weighted precision/recall/F1 and
        the confusion matrix, and shows the confusion matrix as a heatmap.
    """
    if X is None or y is None:
        # Same arity as the success path; a bare None would break callers
        # that unpack the result.
        return None, None, None, None

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

    # Train model based on selected algorithm
    if algorithm == "KNN":
        model = KNeighborsClassifier(n_neighbors=3)
        print("Using KNN classifier with n_neighbors=3")
    elif algorithm == "SVM":
        model = svm.SVC(kernel='rbf')
        print("Using SVM classifier with rbf kernel")
    else:  # Decision Tree
        model = DecisionTreeClassifier(criterion="entropy", max_depth=3)
        print("Using Decision Tree classifier with entropy criterion and max_depth=3")

    # Fit model
    model.fit(X_train, y_train)
    print("Model trained successfully")

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Fix the label order explicitly so the matrix rows/columns and the tick
    # labels drawn below are guaranteed to agree.
    class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Display metrics
    print("\nClassification Metrics:")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1 Score: {f1:.3f}")
    print(f"\nConfusion Matrix:\n{cm}")

    # Plot confusion matrix heatmap
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.colorbar()

    # Label each row/column with the class it stands for
    tick_positions = np.arange(len(class_labels))
    plt.xticks(tick_positions, class_labels)
    plt.yticks(tick_positions, class_labels)

    # Add text annotations; switch to white text on dark cells for legibility
    threshold = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            text_color = "white" if cm[i, j] > threshold else "black"
            plt.text(j, i, str(cm[i, j]), ha="center", va="center", color=text_color)

    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()

    return model, X_test, y_test, y_pred

In [None]:
# Run the classification workflow only when that task is selected and both
# the features and the target are available
if task == "classification":
    if X is not None and y is not None:
        # Accepted names: "KNN", "SVM", or "Decision Tree"
        algorithm = "Decision Tree"  # Change this to your desired algorithm

        classification_model, X_test, y_test, y_pred = train_classification_model(X, y, algorithm)

## 6. Regression Models

This section implements and evaluates regression models.

In [None]:
def train_regression_model(X, y):
    """
    Fit a linear regression on an 80/20 train/test split and report its quality.

    Args:
        X: Feature table (rows are samples).
        y: Numeric target aligned with ``X``.

    Returns:
        Tuple ``(model, X_test, y_test, y_pred)``. When ``X`` or ``y`` is None
        nothing is trained and ``(None, None, None, None)`` is returned, so
        callers can always unpack four values.

    Side effects:
        Prints the split shapes, MAE, RMSE and R², displays the first rows of
        an actual-vs-predicted table and shows an actual-vs-predicted scatter
        plot with the identity line for reference.
    """
    if X is None or y is None:
        # Same arity as the success path; a bare None would break callers
        # that unpack the result.
        return None, None, None, None

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(f"Training data shape: {X_train.shape}, Testing data shape: {X_test.shape}")

    # Train model
    model = LinearRegression()
    print("Using Linear Regression model")

    model.fit(X_train, y_train)
    print("Model trained successfully")

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Display metrics
    print("\nRegression Metrics:")
    print(f"MAE: {mae:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print(f"R² Score: {r2:.3f}")

    # Create comparison dataframe
    df_compare = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    print("\nActual vs Predicted (first 5 rows):")
    display(df_compare.head())

    # Plot scatter plot; the dashed red diagonal marks perfect predictions
    lowest, highest = y_test.min(), y_test.max()
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.6)
    plt.plot([lowest, highest], [lowest, highest], 'r--')
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title("Actual vs Predicted Scatter Plot")
    plt.tight_layout()
    plt.show()

    return model, X_test, y_test, y_pred

In [None]:
# Run the regression workflow only when that task is selected and both the
# features and the target are available
if task == "regression":
    if X is not None and y is not None:
        regression_model, X_test, y_test, y_pred = train_regression_model(X, y)

## 7. Clustering Models

This section implements and evaluates clustering models.

In [None]:
def train_clustering_model(X, n_clusters=3):
    """
    Fit KMeans on the first two numeric columns of ``X`` and plot the result.

    Only two columns are used so that the clusters can be drawn directly in a
    2-D scatter plot; the returned model is therefore fitted on those two
    columns only.

    Args:
        X: Feature DataFrame.
        n_clusters: Number of clusters to fit.

    Returns:
        Tuple ``(kmeans, clusters, X_cluster)``: the fitted model, the cluster
        index of every row, and the two-column DataFrame that was clustered.
        Returns ``(None, None, None)`` when ``X`` is None or has fewer than
        two numeric columns, so callers can always unpack three values.

    Side effects:
        Prints the chosen columns, the cluster centres and the cluster sizes,
        and shows a scatter plot of the clusters with their centroids.
    """
    if X is None:
        return None, None, None

    # Select first two numeric columns for visualization. "number" covers
    # every integer/float width, not just int64/float64.
    numeric_cols = X.select_dtypes(include="number").columns[:2]
    if len(numeric_cols) < 2:
        print("Error: Need at least 2 numeric columns for clustering visualization")
        return None, None, None

    X_cluster = X[numeric_cols]
    print(f"Using columns {numeric_cols[0]} and {numeric_cols[1]} for clustering visualization")

    # Train KMeans model
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X_cluster)
    print("KMeans model trained successfully")

    # Display info
    print("\nKMeans Clustering:")
    print(f"Number of clusters: {n_clusters}")
    print(f"Cluster centers:\n{kmeans.cluster_centers_}")

    # Count samples in each cluster
    unique, counts = np.unique(clusters, return_counts=True)
    for cluster, count in zip(unique, counts):
        print(f"Cluster {cluster}: {count} samples")

    # Plot clusters
    plt.figure(figsize=(10, 6))

    # Plot each cluster
    for i in range(n_clusters):
        cluster_points = X_cluster[clusters == i]
        plt.scatter(cluster_points.iloc[:, 0], cluster_points.iloc[:, 1], label=f'Cluster {i}')

    # Plot centroids
    plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
                s=200, marker='*', c='red', label='Centroids')

    plt.xlabel(numeric_cols[0])
    plt.ylabel(numeric_cols[1])
    plt.title("KMeans Clustering")
    plt.legend()
    plt.tight_layout()
    plt.show()

    return kmeans, clusters, X_cluster

In [None]:
# Train clustering model if task is clustering
if task == "clustering" and X is not None:
    n_clusters = 3  # Change this to your desired number of clusters

    # Training can fail (e.g. fewer than two numeric columns), in which case
    # the result may be None rather than a 3-tuple; only unpack a real result.
    clustering_result = train_clustering_model(X, n_clusters)
    if clustering_result is not None:
        clustering_model, clusters, X_cluster = clustering_result

## 8. Random Data Generation and Prediction

This section generates a single random sample, drawing each numeric feature from the range observed in the dataset, and passes it to the trained model to obtain a prediction.

In [None]:
def generate_random_data(df, feature_cols, label_encoders):
    """
    Build one random sample covering the numeric feature columns of ``df``.

    For each name in ``feature_cols`` that is a numeric column of ``df``, a
    value is drawn uniformly between that column's minimum and maximum
    (a whole number for integer columns, a float otherwise). Names that are
    not columns of ``df``, or whose column is not numeric, are skipped.

    Args:
        df: DataFrame whose column ranges bound the random values.
        feature_cols: Iterable of feature names (list, pandas Index, ...).
        label_encoders: Currently unused; kept so existing callers keep working.

    Returns:
        Dict mapping feature name to generated value, or None when there is
        no dataset / no feature names or when generation fails (the error is
        printed).
    """
    # Materialise the names first: a pandas Index (e.g. ``X.columns``) raises
    # ValueError when its truth value is tested directly.
    features = [] if feature_cols is None else list(feature_cols)
    if df is None or not features:
        print("Error: No dataset or feature columns available")
        return None

    try:
        # Generate random data based on feature columns
        random_data = {}

        for feature in features:
            if feature not in df.columns:
                continue
            column = df[feature]
            if not pd.api.types.is_numeric_dtype(column):
                continue

            min_val = column.min()
            max_val = column.max()

            if pd.api.types.is_integer_dtype(column):
                # randint's upper bound is exclusive, hence the + 1
                value = np.random.randint(int(min_val), int(max_val) + 1)
            else:
                value = np.random.uniform(min_val, max_val)

            random_data[feature] = value

        # Display random data
        print("Generated Random Data:")
        for feature, value in random_data.items():
            print(f"{feature}: {value}")

        return random_data

    except Exception as e:
        print(f"Error generating random data: {str(e)}")
        return None

In [None]:
# Generate random data
# Start from None so the prediction cell's `random_data is not None` check is
# valid even when no sample could be generated.
random_data = None

if processed_df is not None and X is not None:
    # A pandas Index has no unambiguous truth value, so hand over a plain list
    # of names; emptiness checks on it then behave as expected.
    feature_cols = list(X.columns)
    random_data = generate_random_data(processed_df, feature_cols, label_encoders)

In [None]:
def predict(model, random_data, task, label_encoder=None):
    """
    Run a trained model on one generated sample and print the outcome.

    Args:
        model: Fitted estimator exposing ``predict``.
        random_data: Dict mapping feature name to value (a single sample).
        task: "classification" or "regression"; any other value is treated
            as clustering.
        label_encoder: Optional fitted encoder for the target column. When
            given, a classification result is decoded with its
            ``inverse_transform`` before printing.

    Returns:
        The raw prediction for the sample, or None when the model or the
        sample is missing or when prediction fails (the error is printed).
    """
    if model is None or random_data is None:
        print("Error: No model or random data available")
        return None

    try:
        # Create input dataframe from random data
        input_df = pd.DataFrame([random_data])

        # If the model recorded the column names it was fitted on, pass exactly
        # those columns in that order. The clustering model, for instance, is
        # fitted on two columns only, while the sample holds every feature.
        expected_cols = getattr(model, "feature_names_in_", None)
        if expected_cols is not None:
            input_df = input_df[list(expected_cols)]

        prediction = model.predict(input_df)[0]

        if task == "classification":
            classes = getattr(model, "classes_", None)
            if label_encoder is not None:
                result = label_encoder.inverse_transform([prediction])[0]
            elif classes is None or set(classes) <= {0, 1}:
                # Binary 0/1 target: keep the simplified Yes/No wording
                result = "Yes" if prediction == 1 else "No"
            else:
                # Multi-class or non-0/1 labels: Yes/No would be misleading
                result = prediction

            print(f"\nPrediction: {result}")

        elif task == "regression":
            print(f"\nPredicted value: {prediction:.2f}")

        else:  # clustering
            print(f"\nPredicted cluster: {prediction}")

        return prediction

    except Exception as e:
        print(f"Error making prediction: {str(e)}")
        return None

In [None]:
# Predict on the generated sample with whichever model matches the selected
# task, provided that model was actually created by an earlier cell
if random_data is not None:
    if task == "classification":
        if 'classification_model' in locals():
            predict(classification_model, random_data, task)
    elif task == "regression":
        if 'regression_model' in locals():
            predict(regression_model, random_data, task)
    elif task == "clustering":
        if 'clustering_model' in locals():
            predict(clustering_model, random_data, task)

## 9. Summary and Conclusion

This notebook has implemented a complete ML Analyzer that can:

1. Load and preprocess datasets
2. Handle missing values and encode categorical features
3. Perform classification tasks with KNN, SVM, or Decision Tree algorithms
4. Perform regression tasks with Linear Regression
5. Perform clustering tasks with KMeans
6. Generate random data for prediction
7. Make predictions and visualize results

To use this notebook with your own dataset:
1. Update the file path in the data loading section
2. Select your target column and task type
3. Choose the appropriate algorithm
4. Run the cells in sequence

The notebook provides a modular approach to machine learning tasks, making it easy to adapt to different datasets and requirements.