Imports and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
import seaborn as sns

# Read the Iris CSV (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer="static/Iris (1).csv")

# Preview the top of the table; five rows is also head()'s default
data.head(n=5)


Exploratory Data Analysis (EDA)

In [None]:
columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

# One 2x2 figure with a dedicated axes object per measurement column
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

# Walk the panels row by row, pairing each with the column it displays
for ax, col in zip(axes.flat, columns):
    # Stacked per-species histogram with a KDE overlay, drawn on this panel
    sns.histplot(data=data, x=col, hue='Species', kde=True, multiple='stack', ax=ax)
    ax.set_title(f'Distribution of {col}')

# Tighten spacing so titles and tick labels do not overlap
fig.tight_layout()
plt.show()

Data Preprocessing

In [None]:
# Separate features and target variable
X = data.drop(['Id', 'Species'], axis=1)  # Features: all columns except 'Id' and 'Species'
y = data['Species']  # Target variable: 'Species'

# Split BEFORE fitting any preprocessing step. Fitting the scaler / PCA on the
# full dataset would let statistics of the test rows leak into the features the
# model is trained on, making the reported test score optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features using statistics learned from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Learn the 2-component PCA projection from the training rows only,
# then apply that same projection to the test rows
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Whole-dataset versions pushed through the already-fitted transformers.
# These are used for visualisation (decision-boundary plot), not for fitting.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)


Model Training and Evaluation

In [None]:
# Build the Random Forest (seeded for repeatable runs) and fit it on the
# PCA-reduced training split; fit() hands back the fitted estimator
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict the species for the PCA-reduced test split
y_pred = rf_model.predict(X_test)

# Overall fraction of correct predictions on the test split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 summary
print(classification_report(y_test, y_pred))


Plotting Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Derive the class names once from the training labels and reuse that exact
# list for both the matrix layout and the tick labels.
le = LabelEncoder()
le.fit(y_train)
class_labels = le.classes_

# Compute confusion matrix. Passing `labels` pins the row/column order (and
# count) to `class_labels`; without it the order is inferred from whatever
# labels happen to appear in y_test / y_pred, which can silently misalign the
# heatmap ticks when a class is absent from the test split.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot confusion matrix: rows are the true classes, columns the predicted ones
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap='Blues',
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

Plotting Decision Boundaries

In [None]:
# Fit a label encoder on the species column, then use it to obtain the
# integer-coded version of every label
le = LabelEncoder().fit(y)
y_numeric = le.transform(y)

# Plotting decision boundaries
def _encode_labels(labels, classes):
    """Return, for every entry of `labels`, its integer position in `classes`.

    Raises KeyError if a label is not present in `classes`.
    """
    index_of = {label: idx for idx, label in enumerate(classes)}
    # Look up each distinct label once, then expand back to the full length.
    uniques, inverse = np.unique(np.asarray(labels).ravel(), return_inverse=True)
    codes_for_uniques = np.array([index_of[label] for label in uniques], dtype=int)
    return codes_for_uniques[inverse]


def plot_decision_boundaries(X, y, classifier, title):
    """Plot a fitted classifier's decision regions over a 2-D feature space.

    Parameters
    ----------
    X : array-like of shape (n_samples, 2)
        Points to overlay; column 0 goes on the x-axis and column 1 on the y-axis.
    y : array-like of shape (n_samples,)
        Class label of each row of `X`. Every label must appear in
        `classifier.classes_`.
    classifier : fitted estimator
        Must provide `predict` for 2-column input and a `classes_` attribute.
    title : str
        Figure title.

    Notes
    -----
    The colour maps hold three colours, so the plot is intended for
    three-class problems.
    """
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    X = np.asarray(X)
    classes = list(classifier.classes_)

    # Colour the points from the `y` argument (not from notebook globals) so the
    # function also works for subsets such as the train or test split.
    y_codes = _encode_labels(y, classes)

    h = .02  # step size in the mesh
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Predict a class for every mesh node, convert the labels to integer codes,
    # and reshape them back onto the grid.
    Z = _encode_labels(classifier.predict(np.c_[xx.ravel(), yy.ravel()]), classes)
    Z = Z.reshape(xx.shape)

    # Pin the colour range to the full set of classes so the background and the
    # points share one class -> colour mapping even when some class is absent
    # from the mesh predictions or from `y`.
    last_code = len(classes) - 1

    # Create a figure and plot the decision regions
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light, vmin=0, vmax=last_code)

    # Overlay the supplied points, coloured by their true class
    plt.scatter(X[:, 0], X[:, 1], c=y_codes, cmap=cmap_bold,
                vmin=0, vmax=last_code, edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title(title)
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.show()

# Draw the Random Forest's decision regions over the full PCA-projected dataset
plot_decision_boundaries(
    X=X_pca,
    y=y,
    classifier=rf_model,
    title="Decision Boundaries for Random Forest Classifier (PCA-reduced)",
)


Saving the Model

In [None]:
from joblib import dump, load

# Persist the fitted classifier to disk with joblib
# NOTE(review): only the classifier is written here. It was trained on
# standardized, PCA-projected features, so anyone reusing this file on raw
# measurements also needs the fitted `scaler` and `pca` objects.
model_filename = 'random_forest_iris_model.joblib'
dump(value=rf_model, filename=model_filename)

print(f"Model saved to {model_filename}")


In [None]:
# Load the model back from the same path it was saved to. Reusing
# `model_filename` (set in the saving cell) instead of repeating the literal
# keeps the two cells in sync if the path is ever changed, so this check cannot
# silently read a stale file.
loaded_model = load(model_filename)

# Verify the loaded model by making predictions on the same test split
loaded_model_pred = loaded_model.predict(X_test)

# Check if the loaded model predictions match the original model predictions
print(f"Predictions match: {np.array_equal(y_pred, loaded_model_pred)}")
