# Screening Test:code to parse the JSON file provided and kick off in sequence the following machine learning steps programmatically. 

# Name= Swati Lilesh Patle
# PRN= 1132232057
# Course= MSc Data Science And Big Data Analytis

# Model Pipeline

In [39]:
#importing important Libraries
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error, r2_score, classification_report)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


def extract_target_and_regression_type(design_state_data): #function for extracting target and regression type
    target_info= design_state_data["target"]
    target_column= target_info["target"]
    regression_type= target_info["type"]
    print(f"Target: {target_column}, Regression Type: {regression_type}")
    return target_column, regression_type


def handle_missing_values(df, feature_handling): #function for handling missing values in each column
    for feature, settings in feature_handling.items(): #Iterating over features for imputation
        if settings["is_selected"]:
            feature_details= settings.get("feature_details", {})
            feature_type= settings.get("feature_variable_type", None)
            if feature_type == "numerical":
                missing_values= feature_details.get("missing_values")
                impute_with= feature_details.get("impute_with")
                impute_value= feature_details.get("impute_value", None)
                if missing_values == "Impute":
                    if impute_with == "Average of values":
                        strategy= "mean"
                    elif impute_with == "Median of values" and feature_type == "numerical":
                        strategy= "median"
                    elif impute_with == "Most frequent value" and feature_type == "categorical":
                        strategy= "most_frequent"
                    elif impute_with == "custom":
                        df[feature].fillna(impute_value, inplace=True)
                        print(f"Imputed missing values in column '{feature}' with custom value: {impute_value}")
                        continue
                else:
                    raise ValueError(f"Unsupported imputation method: {impute_with}")
                    
                #Applying imputation
                imputer= SimpleImputer(strategy=strategy)
                df[feature]= imputer.fit_transform(df[[feature]])
                print(f"Imputed missing values in column '{feature}' using strategy: {strategy}")
                
            elif feature_type == "text": #handling Categorical data
                #Handle label encoding for categorical columns
                print(f"Encoding categorical column '{feature}'")
                label_encoder= LabelEncoder()
                df[feature]= label_encoder.fit_transform(df[feature])
        else:
            print(f"Feature '{feature}' is not selected for processing.")
    return df


def reduce_features(df, target, feature_reduction_settings): #function for function reduction
    method= feature_reduction_settings.get("feature_reduction_method", "No Reduction")
    parameters= feature_reduction_settings.get("parameters", {}) #extracting methods and parameters from the settings
    if method == "No Reduction":
        print("No feature reduction applied.")
        return df
    elif method == "Corr with Target":
        threshold= parameters.get("threshold", 0.1)
        correlations= df.corrwith(pd.Series(target))
        selected_features = correlations[abs(correlations) >= threshold].index
        print(f"Selected features based on correlation with target: {list(selected_features)}")
        return df[selected_features]
    elif method == "Tree-based":
        n_features= parameters.get("n_features", 5)
        model= RandomForestRegressor(random_state=42)
        model.fit(df, target)
        feature_importances= pd.Series(model.feature_importances_, index=df.columns)
        selected_features= feature_importances.nlargest(n_features).index
        print(f"Selected features based on tree-based importance: {list(selected_features)}")
        print(feature_reduction_settings)
        return df[selected_features]
    elif method == "PCA":
        n_components= parameters.get("n_components", 2)
        pca= PCA(n_components=n_components)
        transformed_data = pca.fit_transform(df)
        pca_columns= [f"PCA_{i+1}" for i in range(n_components)]
        print(f"Applied PCA. Reduced data to {n_components} components.")
        return pd.DataFrame(transformed_data, columns=pca_columns)
    else:
        raise ValueError(f"Unsupported feature reduction method: {method}")


def build_model(algo_details): #function for building a model
    model_name= algo_details["model_name"]
    param_grid={}
    
    if model_name == "Random Forest Regressor":
        model= RandomForestRegressor(random_state=42)
        param_grid= {
            "n_estimators": np.arange(algo_details["min_trees"], algo_details["max_trees"] + 1),
            "max_depth": np.arange(algo_details["min_depth"], algo_details["max_depth"] + 1),
            "min_samples_leaf": np.arange(
                algo_details["min_samples_per_leaf_min_value"],
                algo_details["min_samples_per_leaf_max_value"] + 1,
            ),
        }
    elif model_name == "Random Forest Classifier":
        model= RandomForestClassifier(random_state=42)
        param_grid= {
            "n_estimators": np.arange(algo_details["min_trees"], algo_details["max_trees"] + 1),
            "max_depth": np.arange(algo_details["min_depth"], algo_details["max_depth"] + 1),
            "min_samples_leaf": np.arange(
                algo_details["min_samples_per_leaf_min_value"],
                algo_details["min_samples_per_leaf_max_value"] + 1,
            ),
        }
    elif model_name == "Gradient Boosted Trees":
        if "Regressor" in model_name:
            model= GradientBoostingRegressor(random_state=42)
        else:
            model= GradientBoostingClassifier(random_state=42)
        param_grid= {
            "n_estimators": np.arange(algo_details["min_iter"], algo_details["max_iter"] + 1),
            "max_depth": np.arange(algo_details["min_depth"], algo_details["max_depth"] + 1),
            "learning_rate": np.linspace(algo_details["min_stepsize"], algo_details["max_stepsize"], 5),
        }
    elif model_name == "LinearRegression":
        model= LinearRegression()
    elif model_name == "LogisticRegression":
        model= LogisticRegression(random_state=42, max_iter=algo_details["max_iter"])
        param_grid= {
            "C": np.linspace(algo_details["min_regparam"], algo_details["max_regparam"], 5),
        }
    elif model_name == "Lasso Regression":
        model= Lasso(alpha=algo_details.get("min_regparam", 1.0))
    elif model_name == "Support Vector Machine":
        model= SVC(kernel="linear", C=algo_details.get("c_value", 1))
        param_grid= {
            "C": algo_details["c_value"],
        }
    elif model_name == "Decision Tree":
        if "Regressor" in model_name:
            model= DecisionTreeRegressor(random_state=42)
        else:
            model= DecisionTreeClassifier(random_state=42)
        param_grid= {
            "max_depth": np.arange(algo_details["min_depth"], algo_details["max_depth"] + 1),
            "min_samples_leaf": algo_details["min_samples_per_leaf"],
        }
    elif model_name == "KNN":
        model= KNeighborsClassifier()
        param_grid= {
            "n_neighbors": algo_details["k_value"],
        }
    else:
        raise ValueError(f"Unsupported model: {model_name}")
    return model, param_grid


def perform_grid_search(X, y, model, param_grid, scoring): #function for performing hyperparameter tuning
    grid_search= GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=scoring)
    grid_search.fit(X, y)
    return grid_search


def evaluate_model(model, X_test, y_test, regression_type): #function for evaluating the model
    
    y_pred = model.predict(X_test)
    evaluation_metrics= {}

    if regression_type == "classification":
        evaluation_metrics["accuracy"]= accuracy_score(y_test, y_pred)
        evaluation_metrics["precision"]= precision_score(y_test, y_pred, average='weighted', zero_division=0)
        evaluation_metrics["recall"]= recall_score(y_test, y_pred, average='weighted', zero_division=0)
        evaluation_metrics["f1_score"]= f1_score(y_test, y_pred, average='weighted', zero_division=0)
        evaluation_metrics["classification_report"]= classification_report(y_test, y_pred)
    elif regression_type == "regression":
        evaluation_metrics["mean_squared_error"]= mean_squared_error(y_test, y_pred)
        evaluation_metrics["mean_absolute_error"]= mean_absolute_error(y_test, y_pred)
        evaluation_metrics["r2_score"]= r2_score(y_test, y_pred)
    else:
        raise ValueError("Unsupported regression type. Choose either 'classification' or 'regression'.")

    return evaluation_metrics


def run_pipeline(design_state_data, df): #function for running whole ppeline
    target_column, regression_type= extract_target_and_regression_type(design_state_data) #calling extract_target_and_regression_type function
    missing_values= handle_missing_values(df, design_state_data["feature_handling"])  #calling handle_missing_values function
    target= LabelEncoder().fit_transform(df[target_column])  #Convert target into integers
    features= df.drop(columns=[target_column])
    reduced_features= reduce_features(features, target, design_state_data["feature_reduction"]) #calling reduce_features function

    #Spliting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42) #size(train data= 80%, test data= 20%)
    print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")
    
    results= []
    for algo_name, algo_details in design_state_data["algorithms"].items():
        if algo_details.get("is_selected", False):
            print("----------------------------------------------------------")
            print(f"Processing: {algo_name}...")
            model, param_grid= build_model(algo_details)  #Unpack model and param_grid(calling build_model function)
            scoring_metric= "accuracy" if regression_type == "classification" else "neg_mean_squared_error"
            
            #Performing Grid Search with cross-validation
            grid_search= GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=scoring_metric)
            grid_search.fit(features, target)
            best_model= grid_search.best_estimator_
            print(f"Best params for {algo_name}: {grid_search.best_params_}")
            
            #Saving the best model
            model_filename= f"{algo_name.replace(' ', '_')}_best_model.pkl"
            joblib.dump(best_model, model_filename)
            print(f"Model saved as {model_filename}")
            
            # Evaluating the best model
            evaluation_results= evaluate_model(best_model, X_test, y_test, regression_type)
            results.append({"model": algo_name, 
                            "best_params": grid_search.best_params_, 
                            "evaluation_results": evaluation_results
                           })
    return results


def main(): #defining main function
    df= pd.read_csv("iris.csv")  #csv file 
    with open("output.json", "r", encoding="utf-8") as file:
        data= json.load(file)  #JSON file

    nested_json= json.loads(data["content"]) #normalizing data for further prccessing
    design_state_data= nested_json.get("design_state_data", {}) #normalizing data for further prccessing

    results= run_pipeline(design_state_data, df) #calling run_pipeline function

    print("Pipeline Results:")
    for result in results:
        print(result)


if __name__ == "__main__":
    main()


Target: petal_width, Regression Type: regression
Imputed missing values in column 'sepal_length' using strategy: mean
Imputed missing values in column 'sepal_width' with custom value: -1
Imputed missing values in column 'petal_length' using strategy: mean
Imputed missing values in column 'petal_width' with custom value: -2
Encoding categorical column 'species'
Selected features based on tree-based importance: ['petal_length', 'species', 'sepal_width', 'sepal_length']
{'feature_reduction_method': 'Tree-based', 'num_of_features_to_keep': '4', 'num_of_trees': '5', 'depth_of_trees': '6'}
Training set size: 120, Test set size: 30
----------------------------------------------------------
Processing: RandomForestRegressor...
Best params for RandomForestRegressor: {'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 11}
Model saved as RandomForestRegressor_best_model.pkl
Pipeline Results:
{'model': 'RandomForestRegressor', 'best_params': {'max_depth': 20, 'min_samples_leaf': 5, 'n_estimat

In [40]:
#Loading the saved model(just for trial)
loaded_model = joblib.load("RandomForestRegressor_best_model.pkl")
loaded_model