In [3]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

data = pd.read_csv('features.csv')
# Encode categorical variables
data = pd.get_dummies(data)

# Split data into features (X) and target variable (y)
X = data.drop('_MICHD', axis=1)
y = data['_MICHD']

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Display the classification report for more detailed evaluation (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.7652

Classification Report:
              precision    recall  f1-score   support

         1.0       0.74      0.83      0.78     72561
         2.0       0.80      0.70      0.75     72623

    accuracy                           0.77    145184
   macro avg       0.77      0.77      0.76    145184
weighted avg       0.77      0.77      0.76    145184



In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model (Fit it to the training data)
rf.fit(X_train, y_train)  # Ensure X_train and y_train are correctly defined

# Make predictions with the trained model
y_pred = rf.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')  
recall = recall_score(y_test, y_pred, average='binary')  
f1 = f1_score(y_test, y_pred, average='binary')  
roc_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")


Accuracy: 0.7652
Precision: 0.7355
Recall: 0.8280
F1 Score: 0.7790
ROC AUC Score: 0.8379


In [6]:
from sklearn.ensemble import RandomForestClassifier
import joblib

# Train model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)  # <-- MUST call fit() before saving

# Verify trained attributes
assert hasattr(rf, 'estimators_'), "Model not trained!"

# Save model
joblib.dump(rf, 'rf_model.pkl')

['rf_model.pkl']