In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv("sample_data/train.csv")

# Show column names to identify target (optional)
print("Columns:\n", df.columns)

# Encode all object (categorical) columns to numeric
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column].astype(str))
        label_encoders[column] = le

# Assume last column is target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"\nAccuracy Score: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

Columns:
 Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Accuracy Score: 0.8659

Confusion Matrix:
[[ 26   0  17]
 [  0  12   5]
 [  2   0 117]]


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris

# Load the Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target variable (species)

# Split the dataset into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 1. Train with default n_estimators=10
rf_model_default = RandomForestClassifier(n_estimators=10, random_state=42)
rf_model_default.fit(X_train, y_train)

# Evaluate the accuracy score with default n_estimators
y_pred_default = rf_model_default.predict(X_test)
accuracy_default = accuracy_score(y_test, y_pred_default)

print(f"Accuracy with default n_estimators=10: {accuracy_default * 100:.2f}%")

# 2. Fine-tuning the model by changing n_estimators
best_accuracy = 0
best_n_estimators = 10

for n_estimators in [10, 50, 100, 200, 500]:
    rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    rf_model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_n_estimators = n_estimators

print(f"Best accuracy achieved with n_estimators={best_n_estimators}: {best_accuracy * 100:.2f}%")


Accuracy with default n_estimators=10: 100.00%
Best accuracy achieved with n_estimators=10: 100.00%
