In [29]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [31]:
# Read the raw dataset from disk.
# `file_path` is relative to the notebook's working directory; point it at your own CSV.
file_path = "data.csv"
data = pd.read_csv(filepath_or_buffer=file_path)


In [33]:
# Data preprocessing
# Built as a single chain from `data`, so re-running this cell always yields
# the same `data_cleaned` and the raw frame is never modified.
data_cleaned = (
    data
    # "Unnamed: 0" is dropped as an unnecessary column
    # (presumably the row index written out when the CSV was saved -- confirm).
    .drop(columns=["Unnamed: 0"])
    # Binary target: 1 where state == "successful", 0 for every other value
    # (including missing). Vectorized comparison instead of a per-row lambda;
    # `assign` on an existing column keeps its original position.
    .assign(state=lambda frame: frame["state"].eq("successful").astype("int64"))
)


In [35]:
# Expand the categorical columns into indicator (dummy) columns.
# drop_first=True omits the first level of each column, so k categories
# become k-1 indicator columns.
categorical_columns = ["parent_category", "sub_category", "country"]
data_encoded = pd.get_dummies(
    data_cleaned,
    columns=categorical_columns,
    drop_first=True,
)


In [37]:
# Separate the label from the predictors:
#   y -> the binary "state" column
#   X -> every remaining column of the encoded frame
y = data_encoded["state"]
X = data_encoded.drop(columns="state")


In [39]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [41]:
# Fit a heavily constrained Random Forest.
# All hyperparameters live in one dict so the configuration can be read at a glance.
rf_params = {
    "n_estimators": 5,         # number of trees in the forest
    "max_depth": 2,            # maximum depth of each tree
    "min_samples_split": 100,  # minimum samples a node needs before it may be split
    "min_samples_leaf": 50,    # minimum samples required in each leaf
    "max_features": 0.2,       # fraction of the features considered at each split
    "random_state": 42,        # fixed seed for reproducible training
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)

In [43]:
# Generate class predictions for both partitions (train first, then test)
# so the scores on each can be compared later.
y_pred_train, y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)

In [45]:
# Accuracy on each partition.
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Remaining metrics are computed on the held-out test set only.
precision, recall, f1 = (
    metric(y_test, y_pred_test)
    for metric in (precision_score, recall_score, f1_score)
)

In [47]:
# Report every metric as a percentage with two decimal places.
metric_report = (
    ("Training Accuracy", train_accuracy),
    ("Testing Accuracy", test_accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value * 100:.2f}%")


Training Accuracy: 88.48%
Testing Accuracy: 88.36%
Precision: 87.22%
Recall: 94.27%
F1 Score: 90.60%


In [49]:
# Save the trained model and metadata
# The fitted estimator is stored together with the ordered list of feature
# columns it was trained on (presumably so that code loading the file can
# line up its inputs with the training layout -- confirm against the consumer).
model_path = "Random_forest_b_file.pkl"
model_metadata = {
    "model": model,
    "feature_names": X.columns.tolist(),
}
joblib.dump(model_metadata, model_path)

# Build the message from the path actually written so the two cannot drift
# apart (the previous message had a stray quote and omitted the extension).
print(f"Model training complete. Saved as '{model_path}'.")


Model training complete. Saved as Random_forest_b_file'.
