In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# 1. Load Data
def load_data(filepath, **read_csv_kwargs):
    """
    Load CSV data into a pandas DataFrame

    Parameters:
    -----------
    filepath : str, path object or file-like object
        Location of the CSV data; handed unchanged to ``pd.read_csv``
    **read_csv_kwargs
        Optional keyword arguments forwarded to ``pd.read_csv``
        (for example ``sep``, ``encoding`` or ``parse_dates``)

    Returns:
    --------
    DataFrame
        Parsed contents of the CSV source
    """
    return pd.read_csv(filepath, **read_csv_kwargs)

In [None]:
# 2. Exploratory Data Analysis
def perform_eda(df):
    """
    Perform basic exploratory data analysis

    Prints the shape, the ``df.info()`` summary, descriptive statistics and
    per-column missing-value counts. Then shows histograms for up to six
    numeric columns, a correlation heatmap of the numeric columns and bar
    charts for up to four object/category columns.

    Parameters:
    -----------
    df : DataFrame
        Data to explore

    Returns:
    --------
    DataFrame
        The same ``df`` object that was passed in
    """
    print("Data shape:", df.shape)
    print("\nData info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    print("\nDescriptive statistics:")
    print(df.describe())

    print("\nMissing values:")
    print(df.isnull().sum())

    # 'number' matches every numeric width (int32, float32, ...), not only
    # int64/float64. Timedelta columns are excluded explicitly so that only
    # plain numeric data reaches the histogram and correlation plots.
    numerical_cols = df.select_dtypes(include=['number'], exclude=['timedelta']).columns

    # Histogram for each numerical feature (first six only, to fit the 2x3 grid).
    # The figure is opened only when there is something to draw.
    hist_cols = numerical_cols[:6]
    if len(hist_cols) > 0:
        plt.figure(figsize=(12, 8))
        for i, col in enumerate(hist_cols):
            plt.subplot(2, 3, i + 1)
            sns.histplot(df[col], kde=True)
            plt.title(f'Distribution of {col}')
        plt.tight_layout()
        plt.show()

    # Correlation matrix: a correlation needs at least two numeric columns,
    # so the heatmap is skipped otherwise.
    if len(numerical_cols) >= 2:
        plt.figure(figsize=(10, 8))
        corr_matrix = df[numerical_cols].corr()
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('Correlation Matrix')
        plt.tight_layout()
        plt.show()

    # Bar plots for categorical features (first four only)
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_cols[:4]:
        plt.figure(figsize=(10, 6))
        df[col].value_counts().plot(kind='bar')
        plt.title(f'Count of {col}')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    return df

In [None]:
# 3. Data Preprocessing
def create_preprocessing_pipeline(numerical_features, categorical_features, ordinal_features=None, ordinal_categories=None):
    """
    Create a column transformer for preprocessing different feature types

    Numerical features are standardized, categorical features are one-hot
    encoded (unknown categories are ignored at transform time) and ordinal
    features are ordinal-encoded. A feature group that is empty or None is
    left out of the transformer.

    Parameters:
    -----------
    numerical_features : list
        List of numerical feature names
    categorical_features : list
        List of categorical feature names
    ordinal_features : list, optional
        List of ordinal feature names
    ordinal_categories : list of lists, optional
        List of category orders for each ordinal feature, in the same order
        as ``ordinal_features``. When omitted, the encoder is created with
        ``categories='auto'`` so the ordinal features are still encoded
        rather than silently dropped; pass explicit orders whenever the
        ranking of the categories matters.

    Returns:
    --------
    ColumnTransformer
        Preprocessor for the features

    Raises:
    -------
    ValueError
        If ``ordinal_categories`` is given but does not contain exactly one
        category list per ordinal feature
    """
    transformers = []

    # Add numerical preprocessor
    if numerical_features:
        numerical_transformer = Pipeline(steps=[
            ('scaler', StandardScaler())
        ])
        transformers.append(('num', numerical_transformer, numerical_features))

    # Add categorical preprocessor
    if categorical_features:
        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ])
        transformers.append(('cat', categorical_transformer, categorical_features))

    # Add ordinal preprocessor
    if ordinal_features:
        if not ordinal_categories:
            # No explicit orders supplied: let the encoder determine categories
            categories = 'auto'
        else:
            # Fail early with a clear message instead of at fit time
            if len(ordinal_categories) != len(ordinal_features):
                raise ValueError(
                    "ordinal_categories must contain one category list per ordinal feature "
                    f"(got {len(ordinal_categories)} lists for {len(ordinal_features)} features)"
                )
            categories = ordinal_categories
        ordinal_transformer = Pipeline(steps=[
            ('ordinal', OrdinalEncoder(categories=categories))
        ])
        transformers.append(('ord', ordinal_transformer, ordinal_features))

    # Create column transformer
    preprocessor = ColumnTransformer(transformers=transformers)

    return preprocessor

In [None]:
# 4. Model Building and Evaluation
def build_and_evaluate_model(X_train, X_test, y_train, y_test, preprocessor, model_type='random_forest'):
    """
    Build a model pipeline, fit it to the training data, and evaluate it

    The pipeline chains ``preprocessor`` with the selected regressor, is
    fitted on the training split and scored on the test split. MSE, RMSE
    and R² are printed and an actual-vs-predicted scatter plot is shown.

    Parameters:
    -----------
    X_train, X_test : DataFrame
        Training and test feature sets
    y_train, y_test : Series
        Training and test target values
    preprocessor : ColumnTransformer
        Feature preprocessor
    model_type : str, optional
        Type of model to use ('linear' or 'random_forest')

    Returns:
    --------
    model : Pipeline
        Fitted model pipeline

    Raises:
    -------
    ValueError
        If ``model_type`` is neither 'linear' nor 'random_forest'
    """
    # Select model. An unrecognised name is rejected rather than silently
    # falling back to a random forest, so a typo cannot train the wrong model.
    if model_type == 'linear':
        model = LinearRegression()
    elif model_type == 'random_forest':
        model = RandomForestRegressor(random_state=42)
    else:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'linear' or 'random_forest'"
        )

    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Fit model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Evaluate model
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print("\nModel Evaluation:")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}")

    # Plot predicted vs actual. The y = x reference line spans the combined
    # range of actual and predicted values so it covers every plotted point.
    lower = min(np.min(y_test), np.min(y_pred))
    upper = max(np.max(y_test), np.max(y_pred))
    plt.figure(figsize=(8, 8))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([lower, upper], [lower, upper], 'r--')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title('Actual vs Predicted Values')
    plt.tight_layout()
    plt.show()

    return pipeline

In [None]:
# 5. Hyperparameter Tuning
def tune_hyperparameters(X_train, y_train, preprocessor, model_type='random_forest', param_grid=None, cv=5):
    """
    Perform hyperparameter tuning with GridSearchCV

    A pipeline of ``preprocessor`` followed by the selected regressor is
    searched over ``param_grid`` using negative mean squared error as the
    score. The best parameters and best score are printed.

    Parameters:
    -----------
    X_train : DataFrame
        Training features
    y_train : Series
        Training target values
    preprocessor : ColumnTransformer
        Feature preprocessor
    model_type : str, optional
        Type of model to use ('linear' or 'random_forest')
    param_grid : dict, optional
        Grid to search instead of the built-in default for ``model_type``.
        Keys address pipeline steps, e.g. ``'model__n_estimators'``.
    cv : int, optional
        Value passed to GridSearchCV as ``cv`` (default 5)

    Returns:
    --------
    best_model : GridSearchCV
        Fitted grid search object

    Raises:
    -------
    ValueError
        If ``model_type`` is neither 'linear' nor 'random_forest'
    """
    # Pick the estimator and its default grid. An unrecognised name is
    # rejected rather than silently tuning a random forest.
    if model_type == 'linear':
        model = LinearRegression()
        default_grid = {
            # Linear regression doesn't have many hyperparameters to tune
            'model__fit_intercept': [True, False],
            'model__positive': [True, False]
        }
    elif model_type == 'random_forest':
        model = RandomForestRegressor(random_state=42)
        default_grid = {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [None, 10, 20, 30],
            'model__min_samples_split': [2, 5, 10],
            'model__min_samples_leaf': [1, 2, 4]
        }
    else:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'linear' or 'random_forest'"
        )

    # A caller-supplied grid takes precedence over the default one
    if param_grid is None:
        param_grid = default_grid

    # Create pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Create grid search
    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1
    )

    # Fit grid search
    grid_search.fit(X_train, y_train)

    print("\nHyperparameter Tuning Results:")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Best Score (Neg MSE): {grid_search.best_score_:.4f}")

    return grid_search

In [None]:
# Main function to tie everything together
def main():
    """
    Run the whole workflow end to end.

    Steps: load the CSV, run the exploratory analysis, split the data into
    train / validation / test parts (70% / 15% / 15%), fit and evaluate a
    baseline random forest on the validation part, grid-search its
    hyperparameters on the training part and report the tuned model's
    metrics on the test part.
    """
    # ---- Placeholders: adapt these to your own dataset ----
    csv_path = "your_data.csv"  # REPLACE WITH YOUR CSV PATH
    target_column = "target"  # REPLACE WITH YOUR TARGET COLUMN

    # REPLACE THESE WITH YOUR ACTUAL COLUMN NAMES
    numeric_cols = ["num_feature1", "num_feature2", "num_feature3"]
    nominal_cols = ["cat_feature1", "cat_feature2"]
    ordered_cols = ["ord_feature1"]
    ordered_levels = [["low", "medium", "high"]]  # Categories in order

    # Load the data and explore it
    data = perform_eda(load_data(csv_path))

    # Separate predictors from the target
    features = data.drop(columns=[target_column])
    target = data[target_column]

    # First cut: 70% train, 30% held out; second cut halves the held-out part
    # into validation and test sets
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42
    )

    # Feature preprocessing shared by the baseline and the tuned model
    preprocessor = create_preprocessing_pipeline(
        numeric_cols,
        nominal_cols,
        ordered_cols,
        ordered_levels,
    )

    # Baseline model, scored on the validation split (metrics are printed
    # by the helper; the fitted pipeline itself is not needed afterwards)
    build_and_evaluate_model(
        X_train, X_val, y_train, y_val, preprocessor, model_type='random_forest'
    )

    # Grid search on the training split
    tuned_search = tune_hyperparameters(
        X_train, y_train, preprocessor, model_type='random_forest'
    )

    # Final check of the tuned model on the untouched test split
    test_predictions = tuned_search.predict(X_test)
    test_mse = mean_squared_error(y_test, test_predictions)
    test_r2 = r2_score(y_test, test_predictions)
    test_rmse = np.sqrt(test_mse)

    print("\nFinal Model Evaluation (Test Set):")
    print(f"Mean Squared Error: {test_mse:.4f}")
    print(f"Root Mean Squared Error: {test_rmse:.4f}")
    print(f"R² Score: {test_r2:.4f}")

# Entry point: run the full workflow only when this code is executed as the
# main module, not when it is imported from elsewhere.
if __name__ == "__main__":
    main()