In [12]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

data = pd.read_excel('L5_fecal_RFI_class.xlsx')

#fet and tar
X = data.drop(columns=['RFI_Class', 'SampleID'])
y = data['RFI_Class']

#dummiesss
X = pd.get_dummies(X, columns=['DNA_Batch', 'Farm_Code'], drop_first=True)

# 80 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RSearch parm grid
param_distributions = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample']
}


# Initialize RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_distributions,
    n_iter=50,
    scoring='f1_weighted',  # Optimizing for a balanced metric
    cv=5,
    random_state=123,
    n_jobs=-1
)


#model
random_search.fit(X_train, y_train)

# best gun
best_clf = random_search.best_estimator_

#using best gun
y_pred = best_clf.predict(X_test)

#metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix, index=best_clf.classes_, columns=best_clf.classes_)  # CM with labels
best_params = random_search.best_params_

#saving
with open('model_evaluation_metrics.txt', 'w') as file:
    file.write("Best Parameters:\n")
    file.write(f"{best_params}\n\n")
    file.write("Accuracy: {:.4f}\n".format(accuracy))
    file.write("F1 Score: {:.4f}\n".format(f1))
    file.write("Recall: {:.4f}\n".format(recall))
    file.write("Precision: {:.4f}\n".format(precision))
    file.write("Confusion Matrix:\n")
    file.write(f"{conf_matrix_df}\n")  # Save labeled confusion matrix

print("Evaluation metrics saved to 'classification_fecal_L5_with_metadata_optimization_model_evaluation_metrics.txt'")


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [13]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from sklearn.utils import resample

data = pd.read_excel('L5_fecal_RFI_class.xlsx')

#fet and tar
X = data.drop(columns=['RFI_Class', 'SampleID'])
y = data['RFI_Class']

#dummiesss
X = pd.get_dummies(X, columns=['DNA_Batch', 'Farm_Code'], drop_first=True)

# 80 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


train_data = pd.concat([X_train, y_train], axis=1)

#separate majority and minority classes
majority_class = train_data[y_train == y_train.value_counts().idxmax()]
minority_class = train_data[y_train == y_train.value_counts().idxmin()]

#lets upsample minority
minority_upsampled = resample(minority_class,
                              replace=True,    # Sample with replacement
                              n_samples=len(majority_class),  # Match majority class
                              random_state=42)  # Reproducible

# again cobine upsampled with the majority
balanced_train_data = pd.concat([majority_class, minority_upsampled])

# Separate features and target
X_train = balanced_train_data.drop(columns=['RFI_Class'])
y_train = balanced_train_data['RFI_Class']

# RSearch parm grid
param_distributions = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample']
}

# Initialize RandomForestClassifier
clf = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_distributions,
    n_iter=50,
    scoring='f1_weighted',  # Optimizing for a balanced metric
    cv=5,
    random_state=123,
    n_jobs=1  # Disable parallel processing
)

#model
random_search.fit(X_train, y_train)

# best gun
best_clf = random_search.best_estimator_

#using best gun
y_pred = best_clf.predict(X_test)

#metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')

# Generate confusion matrix with labels from y_test to handle missing classes in predictions
labels = np.unique(y_test)  # Ensure labels align with actual test data
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
conf_matrix_df = pd.DataFrame(conf_matrix, index=labels, columns=labels)  # Labeled CM with test set classes
best_params = random_search.best_params_

#saving
with open('model_evaluation_metrics_2.txt', 'w') as file:
    file.write("Best Parameters:\n")
    file.write(f"{best_params}\n\n")
    file.write("Accuracy: {:.4f}\n".format(accuracy))
    file.write("F1 Score: {:.4f}\n".format(f1))
    file.write("Recall: {:.4f}\n".format(recall))
    file.write("Precision: {:.4f}\n".format(precision))
    file.write("Confusion Matrix:\n")
    file.write(f"{conf_matrix_df}\n")  # Save labeled confusion matrix

print("Evaluation metrics saved to 'classification_fecal_L5_with_metadata_optimization_model_evaluation_metrics_1.txt'")


Evaluation metrics saved to 'classification_fecal_L5_with_metadata_optimization_model_evaluation_metrics_1.txt'


  _warn_prf(average, modifier, msg_start, len(result))
