In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import joblib
from datetime import datetime
import os
from sklearn.pipeline import Pipeline

from Configuration import config
from Services import service
from Tuning import model_tuning
from Preprocess import processdata


import mlflow 
from mlflow import MlflowClient

# Directory and file name used by run_rf when it persists a model; both come
# from the project configuration module.
model_path, model_name = config.MODEL_PATH, config.MODEL_NAME




def run_rf(X, y, grid_cv, model_save_path):
    """Train (or load) a random-forest pipeline, evaluate it and return it.

    The call is tracked as a single MLflow run which is always closed before
    the function returns, so it can be invoked repeatedly from one kernel.

    Parameters
    ----------
    X, y
        Features and target; split 75/25 with ``random_state=0``.
    grid_cv
        Truthy: tune through ``model_tuning.compute_gscv`` unless a file
        already exists at ``model_save_path``, in which case that file is
        loaded instead. Falsy: fit a fixed 200-tree forest.
    model_save_path
        Where the model is dumped / looked up. Falsy values fall back to
        ``'<model_path>/<model_name>'``.

    Returns
    -------
    The fitted (or loaded) pipeline, or ``None`` when training, persistence
    or evaluation raised; in that case the error is printed and the MLflow
    run is closed with status ``FAILED``.
    """
    # NOTE(review): the previous version built an unused
    # MlflowClient(tracking_uri="http://127.0.0.1:5001"). A client's URI does
    # not redirect the fluent mlflow.* calls used here; if runs are meant to
    # go to that server, mlflow.set_tracking_uri(...) is needed - confirm.
    mlflow.start_run(run_name="Random_forest_model_v2_Robert")
    run_status = "FAILED"
    try:
        Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=0)
        model_pipeline = processdata.data_encoding_pipeline()

        try:
            if not os.path.exists(model_path):
                os.makedirs(model_path, exist_ok=True)
                print('{} directory created'.format(model_path))

            if not model_save_path:
                model_save_path = '{}/{}'.format(model_path, model_name)  # add versioning here

            # A caller-supplied path may live outside model_path; make sure
            # its parent exists before joblib.dump is attempted.
            save_dir = os.path.dirname(model_save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)

            if grid_cv:
                if not os.path.exists(model_save_path):
                    print("Grid Search CV Started. Calculating Best Params-------------------------\n")
                    model_ = Pipeline(steps=[('preprocessor', model_pipeline),
                                             ('classifier', RandomForestClassifier())])
                    grid = model_tuning.compute_gscv(Xtrain, ytrain, model_, config.GRIDCV_PARAM)
                    print("Grid Search CV completed-----------------------------------\n")
                    print("Best Params are :", grid.best_estimator_)
                    # Persist the tuned pipeline so later calls can skip the search.
                    model = grid.best_estimator_
                    joblib.dump(model, model_save_path)
                    print(f"Best Random Forest model saved to {model_save_path}")
                else:
                    print('Found a model\n')
                    # joblib.load unpickles the file: only point this at
                    # model files produced by this project.
                    model = joblib.load(model_save_path)
                    print(f"Random Forest model loaded from {model_save_path}")
            else:
                # Fixed hyper-parameters; config.RF_BEST_PARAMS was the
                # alternative previously tried here.
                model = Pipeline(steps=[('preprocessor', model_pipeline),
                                        ('classifier',
                                         RandomForestClassifier(n_jobs=-1, n_estimators=200))])
                model.fit(Xtrain, ytrain)
                joblib.dump(model, model_save_path)
                print(f"Random Forest model saved to {model_save_path}")

            # eval_rf logs its metrics to the run opened above.
            eval_rf(model, Xtrain, Xtest, ytrain, ytest)
            run_status = "FINISHED"
            return model

        except Exception as e:
            # Best-effort contract kept from the original: report and return None.
            print('Exception :', e)
            return None
    finally:
        # Always close the run; otherwise the next start_run in this kernel
        # fails because a run is still active.
        mlflow.end_run(status=run_status)
   


def eval_rf(model, Xtrain, Xtest, ytrain, ytest):
    """Print and log the train/test accuracy of an already fitted model.

    Both accuracies are printed and recorded on the active MLflow run as
    ``train_accuracy`` / ``test_accuracy``. When ``config.DEBUG`` is truthy
    the test predictions are also passed to ``service.plot_confusion_matrix``.
    Returns ``None``.
    """
    test_predictions = model.predict(Xtest)
    train_predictions = model.predict(Xtrain)

    # Each score is computed once and reused for both the print and the log.
    train_accuracy = accuracy_score(ytrain, train_predictions)
    print("Random Forest Train Accuracy: ", train_accuracy)
    test_accuracy = accuracy_score(ytest, test_predictions)
    print("Random Forest Test Accuracy: ", test_accuracy)

    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("train_accuracy", train_accuracy)

    if config.DEBUG:
        service.plot_confusion_matrix(test_predictions, ytest)
    


In [23]:
from Preprocess import processdata
from Services import service
from Configuration import config
from Src import train
import mlflow
from mlflow import MlflowClient

# Input locations read from the project configuration module.
train_dir = config.TRAIN_DIR
image_dir = config.TRAIN_IMAGE_DIR


# NOTE(review): this client is bound to the local tracking server, but the
# fluent mlflow.* calls below use the globally configured tracking URI, not
# the client's. Call mlflow.set_tracking_uri(...) if runs should go to the
# same server - confirm the intent.
client = MlflowClient(tracking_uri="http://127.0.0.1:5001")

# mlflow.start_run raises when a run is already active, which made this cell
# fail on re-execution in the same kernel. Only open a run when none exists.
if mlflow.active_run() is None:
    mlflow.start_run(run_name="Random_forest_model_v2_Robert")
#mlflow.autolog(log_metrics=False)

def train_model():
    """Load the training data, process it and hand it to ``train.train_rf``.

    Reads from the module-level ``train_dir``, prints a progress line before
    the processing and training steps, and returns whatever
    ``train.train_rf`` returns for the processed frame.
    """
    raw_frame = service.load_data(train_dir)

    print('Feature Processing Started...............')
    processed_frame = processdata.process_train(raw_frame)

    print('Training Initiated...............')
    return train.train_rf(processed_frame)
    
def predict_data(df, img):
    """Run the feature-processing step for ``df``; prediction is not implemented.

    ``img`` is currently ignored: the module-level ``image_dir`` is what gets
    passed to ``processdata.process``. The processed result is discarded and
    the function returns ``None``.
    """
    # TODO: feed the processed frame to the trained model and return predictions.
    processdata.process(df, image_dir)


if __name__ == "__main__":
    # Keep the trained pipeline in a module-level name: the interpretability
    # cells further down index into `model`.
    model = train_model()

    

Feature Processing Started...............
Saved image features found .........




Training Initiated...............




Random Forest model saved to Models/random_forest_model_v2.joblib




Random Forest Train Accuracy:  0.965883886310109
Random Forest Test Accuracy:  0.9065029496501578


## SHAP Values and Interpretability

In [3]:
import pandas as pd

# Forward slashes are accepted on Windows as well as POSIX; the previous
# backslash-separated raw string only resolved on Windows.
train_frame = pd.read_csv("data/train/train.csv")
y = train_frame["AdoptionSpeed"]
X = train_frame.drop(columns="AdoptionSpeed")

# Same split parameters as run_rf (test_size=0.25, random_state=0).
# NOTE(review): the model above is trained on the output of
# processdata.process_train, not on this raw CSV - confirm that Xtest here has
# the columns the fitted preprocessor expects.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25, random_state=0)



In [None]:
# Feature names emitted by every pipeline step except the final classifier.
model[:-1].get_feature_names_out()

In [None]:
preprocessor = model["preprocessor"]

# Transformed view of the first 50 test rows (kept under its original name;
# nothing in the visible cells reads it).
data_transformation = preprocessor.transform(Xtest[:50])

# The preprocessor may return either a scipy sparse matrix or a dense array
# depending on its configuration; only the sparse case needs densifying.
encoded_test = preprocessor.transform(Xtest)
if hasattr(encoded_test, "toarray"):
    encoded_test = encoded_test.toarray()

# Dense frame with one column per encoded feature, used by the SHAP cells.
data_transformed = pd.DataFrame(encoded_test,
                                columns=model[:-1].get_feature_names_out())


In [None]:
# Preview the first rows of the encoded test features.
data_transformed.head()

In [None]:
import shap

In [None]:
# Kernel SHAP around the classifier's `predict` output, with 20 rows sampled
# from the encoded test features as the background set.
background_rows = shap.sample(data_transformed, 20)
explainer = shap.KernelExplainer(model["classifier"].predict, background_rows)


In [None]:
# WARNING: slow cell - the author measured almost two hours for this call.
# Explains every row of data_transformed with the explainer built above.

shap_values = explainer(data_transformed)

# Optional quick check on a single row:
#shap.plots.waterfall(shap_values[3])

In [None]:
import numpy as np

# Persist only the numeric SHAP matrix. Saving the Explanation object itself
# makes numpy pickle it, which then needs allow_pickle=True (arbitrary code
# execution on load) and does not give back a usable array.
shap_matrix = np.asarray(getattr(shap_values, "values", shap_values))
np.save("shap_values.npy", shap_matrix)

# Round-trip check: plain float array, no pickle required.
shap_values2 = np.load("shap_values.npy")

In [None]:
# Summary plot of the SHAP values against the encoded test features.
shap.summary_plot(shap_values, data_transformed)

In [None]:
# Heatmap view of the SHAP explanations computed above.
shap.plots.heatmap(shap_values)

In [None]:
# Beeswarm view of the SHAP explanations computed above.
shap.plots.beeswarm(shap_values)

In [None]:
# Bar chart of the SHAP explanations computed above.
shap.plots.bar(shap_values)

In [None]:
# Waterfall plot for the explanation at position 9.
shap.plots.waterfall(shap_values[9])

In [None]:
# Waterfall plot for the explanation at position 49.
shap.plots.waterfall(shap_values[49])

In [None]:
# Scatter plot of the SHAP values for the "num__Age" feature column.
shap.plots.scatter(shap_values[:, "num__Age"])

In [None]:

import numpy as np

# The force_plot cells below index `shap_values` as a plain array
# (`shap_values[0, :]`), so convert to the numeric matrix first. Saving the
# Explanation object itself makes numpy pickle it, which np.load rejects
# unless allow_pickle=True is passed.
# getattr keeps the cell re-runnable once `shap_values` is already an ndarray.
shap_values = np.asarray(getattr(shap_values, "values", shap_values))
np.save("shap_values.npy", shap_values)
shap_values = np.load("shap_values.npy")



In [None]:
# Force plot for the first encoded test row.
# - expected_value may be a scalar (single-output model function) or a vector;
#   np.ravel makes `[0]` valid in both cases instead of failing on a float.
# - `.iloc` must be indexed; passing the bare indexer is not row data.
# - accept either an Explanation or a raw array in `shap_values`.
base_value = float(np.ravel(explainer.expected_value)[0])
first_row_shap = np.asarray(getattr(shap_values, "values", shap_values))[0]
shap.force_plot(base_value, first_row_shap, data_transformed.iloc[0])

In [None]:
# Force plot for the first encoded test row, using the explainer's scalar
# expected value as the baseline.
base_value = float(explainer.expected_value)
shap.force_plot(base_value, shap_values[0, :], data_transformed.iloc[0, :])