In [4]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix
import pandas as pd  # Added to display dataset structure

# Ask the user for input
n_trees = int(input("Enter the number of trees for the Random Forest: "))
impurity_function = input("Choose the impurity function ('gini' for Gini Index, 'entropy' for Information Gain): ").strip()

# Ensure correct input for impurity function
if impurity_function not in ['gini', 'entropy']:
    print("Invalid impurity function selected. Using 'gini' by default.")
    impurity_function = 'gini'

# Load dataset (for this example, we'll use the Iris dataset)
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target

# Convert to a pandas DataFrame for easier viewing
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']], columns=iris['feature_names'] + ['target'])

# Print dataset structure and first 10 rows
print("\nDataset Structure:")
print(df.info())
print("\nFirst 10 rows of the dataset:")
print(df.head(10))

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=n_trees, criterion=impurity_function, random_state=42)

# Train the Random Forest
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='macro')  # macro average to handle multiple classes
recall = recall_score(y_test, y_pred, average='macro')  # same for recall
conf_matrix = confusion_matrix(y_test, y_pred)

# Display the results
print(f"\nRandom Forest accuracy: {accuracy * 100:.2f}%")
print(f"Random Forest F1-score (macro avg): {f1:.2f}")
print(f"Random Forest Recall (macro avg): {recall:.2f}")
print("Confusion Matrix:")
print(conf_matrix)

# Formulas for F1 Score, Recall, and Confusion Matrix:
# - Precision (P) = True Positives / (True Positives + False Positives)
# - Recall (R) = True Positives / (True Positives + False Negatives)
# - F1 Score = 2 * (Precision * Recall) / (Precision + Recall)
# - Accuracy = (True Positives + True Negatives) / Total Predictions


Enter the number of trees for the Random Forest:  5
Choose the impurity function ('gini' for Gini Index, 'entropy' for Information Gain):  gini



Dataset Structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    float64
dtypes: float64(5)
memory usage: 6.0 KB
None

First 10 rows of the dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4            