In [None]:
# %pip install --pre mlflow

    Import Libraries

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

    import Dataset

In [8]:
training_data_01 =  pd.read_csv('./final_training_data.csv')


    Split the Final Dataset into Features and Target Variable

In [9]:
X = training_data_01.drop('PromotionStatus', axis=1) 
y = training_data_01['PromotionStatus'] 

    Split the Dataset into Training and Test Set

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
# %pip install imbalanced-learn
# 
X_train.shape
X_test.shape


(2000, 17)

    OverSampling

In [12]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    Experiment-1 
    RandomForest Classifier

In [64]:

params = {"random_state":42 ,"n_estimators": 100,"max_depth":10}

model = RandomForestClassifier(**params)  
# model.fit(X_train, y_train)
model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)


print("Confusion Matrix\n",confusion_matrix(y_test, y_pred))
print("\nClassification Report\n",classification_report(y_test, y_pred))

Confusion Matrix
 [[ 239   50    0]
 [  48 1084   57]
 [   0   80  442]]

Classification Report
               precision    recall  f1-score   support

           0       0.83      0.83      0.83       289
           1       0.89      0.91      0.90      1189
           2       0.89      0.85      0.87       522

    accuracy                           0.88      2000
   macro avg       0.87      0.86      0.87      2000
weighted avg       0.88      0.88      0.88      2000



In [65]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X, y, cv=5) 
print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

Cross-validation scores: [0.863  0.843  0.868  0.86   0.8665]
Mean accuracy: 0.8600999999999999


    Create dictionary of metrics

In [None]:
report_dict = classification_report(y_test, y_pred, output_dict=True)
report_dict

In [67]:
import mlflow

    Log the Model

In [None]:
mlflow.set_experiment("First Experiment")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")


with mlflow.start_run(run_name="Random Forest Classifier"):
    mlflow.set_tag("Training Info", "Trained with Project final Data")
    # mlflow.log_params({'random_state': 42})
    
    mlflow.log_params(params=params)

    mlflow.log_metrics({
        'accuracy': report_dict['accuracy'],
        'recall_class_0': report_dict['0']['recall'],
        'recall_class_1': report_dict['1']['recall'],
        'recall_class_2': report_dict['2']['recall'],
        'precision_class_0':report_dict['0']['precision'],
        'precision_class_1':report_dict['1']['precision'],
        'precision_class_2':report_dict['2']['precision'],
        'f1_score_macro': report_dict['macro avg']['f1-score']
    })
    # mlflow.log_metric('Cross-validation scores',scores)
    mlflow.log_metric('cross_val_mean_accuracy', scores.mean())

    model_info=  mlflow.sklearn.log_model(model, "Random Forest Classifier")

    Load our saved model as a Python Function


In [59]:
loaded_model_1 = mlflow.pyfunc.load_model("file:///c:/Assessments/final_project/Data_Science/mlruns/140846681390198027/6d3fc1e2dac64a0db9e272d9b3fd82cc/artifacts/Random Forest Classifier")
loaded_model_2 = mlflow.pyfunc.load_model(model_info.model_uri)
# print(model_info.model_uri)

    Use our model to predict 

In [61]:
y_pred = loaded_model_1.predict(X_test)

print("Confusion Matrix\n",confusion_matrix(y_test, y_pred))
print("\nClassification Report\n",classification_report(y_test, y_pred))


Confusion Matrix
 [[ 233   56    0]
 [  42 1104   43]
 [   0   80  442]]

Classification Report
               precision    recall  f1-score   support

           0       0.85      0.81      0.83       289
           1       0.89      0.93      0.91      1189
           2       0.91      0.85      0.88       522

    accuracy                           0.89      2000
   macro avg       0.88      0.86      0.87      2000
weighted avg       0.89      0.89      0.89      2000



    Experiment-2 
    Logistic Regression Model

In [73]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)  
model.fit(X_train_resampled, y_train_resampled)
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[160 127   2]
 [  0 925 264]
 [  0 142 380]]
              precision    recall  f1-score   support

           0       1.00      0.55      0.71       289
           1       0.77      0.78      0.78      1189
           2       0.59      0.73      0.65       522

    accuracy                           0.73      2000
   macro avg       0.79      0.69      0.71      2000
weighted avg       0.76      0.73      0.73      2000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    Log different Models
    Tract Experiments using MLFLOW

In [86]:
models = [
    (
        "Random Forest Classifier",
        {"random_state":42, "max_depth":10,"n_estimators" :100},
        RandomForestClassifier(random_state=42, max_depth=10,n_estimators=100),
        (X_train_resampled,y_train_resampled),
        (X_test,y_test)
     ) ,
     
     (
        "Logistic Regression Model",
        {"max_iter":1000},
        LogisticRegression(max_iter=1000),
        (X_train_resampled,y_train_resampled),
        (X_test,y_test)    
     )

]

In [87]:
reports = []

for model_name,params, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]

    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test,y_pred, output_dict=True)
    reports.append(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [88]:
reports

[{'0': {'precision': 0.8327526132404182,
   'recall': 0.8269896193771626,
   'f1-score': 0.8298611111111112,
   'support': 289.0},
  '1': {'precision': 0.8929159802306426,
   'recall': 0.911690496215307,
   'f1-score': 0.9022055763628797,
   'support': 1189.0},
  '2': {'precision': 0.8857715430861723,
   'recall': 0.8467432950191571,
   'f1-score': 0.8658178256611165,
   'support': 522.0},
  'accuracy': 0.8825,
  'macro avg': {'precision': 0.8704800455190776,
   'recall': 0.861807803537209,
   'f1-score': 0.8659615043783692,
   'support': 2000.0},
  'weighted avg': {'precision': 0.8823576756058485,
   'recall': 0.8825,
   'f1-score': 0.882254598200839,
   'support': 2000.0}},
 {'0': {'precision': 1.0,
   'recall': 0.5536332179930796,
   'f1-score': 0.7126948775055679,
   'support': 289.0},
  '1': {'precision': 0.7747068676716918,
   'recall': 0.7779646761984861,
   'f1-score': 0.7763323541754091,
   'support': 1189.0},
  '2': {'precision': 0.5882352941176471,
   'recall': 0.72796934865

In [90]:
import mlflow

mlflow.set_experiment("Log Differetn Models")
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    report = reports[i]

    with mlflow.start_run(run_name=model_name):
        # mlflow.set_tag("Training Info", "Trained with Project final Data")
        mlflow.log_params(params)

        mlflow.log_metrics({
        'accuracy': report['accuracy'],
        'recall_class_0': report['0']['recall'],
        'recall_class_1': report['1']['recall'],
        'recall_class_2': report['2']['recall'],
        'precision_class_0':report['0']['precision'],
        'precision_class_1':report['1']['precision'],
        'precision_class_2':report['2']['precision'],
        'f1_score_macro': report['macro avg']['f1-score']
        })


        # mlflow.log_metric('accuracy',report['accuracy'])
        # mlflow.log_metric('Cross-validation scores',scores)
        # mlflow.log_metric('cross_val_mean_accuracy', scores.mean())

        model_info  =  mlflow.sklearn.log_model(model, model_name)



2024/10/28 14:34:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest Classifier at: http://127.0.0.1:5000/#/experiments/887698436236583196/runs/53560df7e8fa4cabb6e95586a73e633c.
2024/10/28 14:34:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/887698436236583196.
2024/10/28 14:34:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression Model at: http://127.0.0.1:5000/#/experiments/887698436236583196/runs/5ff849e9827f4335b3ad0d67ee00fca1.
2024/10/28 14:34:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/887698436236583196.
