# Wine Cultivar Origin Prediction System
## Model Development - Random Forest Classifier

**Student Name:** Ogah Victor  
**Matric Number:** 22CG031902  
**Date:** January 2026

This notebook trains a Random Forest classifier to predict the cultivar (origin) of a wine from six selected chemical features of the scikit-learn Wine dataset.

### Step 1: Import Required Libraries

In [None]:
# Data manipulation and numerical computing
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.datasets import load_wine

# Model persistence
import joblib

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

print("✓ All libraries imported successfully!")

### Step 2: Load the Wine Dataset

In [None]:
# Load the Wine dataset bundled with scikit-learn
wine_data = load_wine()

# Wrap the feature matrix in a DataFrame and attach the target as 'cultivar'
df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)
df['cultivar'] = wine_data.target

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())

### Step 3: Data Preprocessing and Feature Selection

In [None]:
# Select 6 features from the available features
# First, let's see all available features
print("All available features in dataset:")
for i, feature in enumerate(wine_data.feature_names):
    print(f"{i}: {feature}")

# Select the six features BY NAME.
# The previous positional selection [0, 6, 7, 9, 10, 12] was documented as
# "6=total phenols, 7=flavanoids", but in the scikit-learn Wine dataset
# index 5 is total_phenols, 6 is flavanoids and 7 is nonflavanoid_phenols,
# so the model was silently trained on nonflavanoid_phenols instead of
# total_phenols. Selecting by name removes that off-by-one risk.
selected_features = [
    'alcohol',
    'total_phenols',
    'flavanoids',
    'color_intensity',
    'hue',
    'proline',
]

# Fail loudly if the dataset does not expose one of the expected columns
unknown_features = [name for name in selected_features if name not in df.columns]
if unknown_features:
    raise KeyError(f"Features not found in dataset: {unknown_features}")

# Positional indices derived from the names (kept for reference / compatibility)
all_feature_names = list(wine_data.feature_names)
selected_indices = [all_feature_names.index(name) for name in selected_features]

print("\nSelected Features:")
for i, feature in enumerate(selected_features, 1):
    print(f"{i}. {feature}")

# Separate features (X) and target (y)
X = df[selected_features]
y = df['cultivar']

print(f"\n✓ Features selected: {len(selected_features)} features")
print(f"✓ Target classes: {np.unique(y)}")
print(f"✓ Class distribution:\n{y.value_counts().sort_index()}")

### Step 4: Handle Missing Values (if any) and Feature Scaling

In [None]:
# Count missing entries in each selected feature column
missing_values = X.isnull().sum()
print("Missing values per feature:")
print(missing_values)

has_missing = missing_values.sum() > 0
if not has_missing:
    print("\n✓ No missing values found")
else:
    print("\n⚠ Handling missing values...")
    # Mean imputation, column by column
    column_means = X.mean()
    X = X.fillna(column_means)
    print("✓ Missing values filled with mean")

# Standardise the features (zero mean, unit variance per column).
# NOTE(review): the scaler here is fitted on all rows, including rows that
# later end up in the test split.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)

# Back to a DataFrame so the column names are kept
X_scaled = pd.DataFrame(scaled_values, columns=selected_features)

print("\n✓ Feature scaling applied (StandardScaler)")
print("\nScaled data statistics:")
print(X_scaled.describe())

### Step 5: Train-Test Split

In [None]:
# Split the UNSCALED features first: 80% training, 20% testing.
# Splitting data that was scaled with statistics from the full dataset leaks
# information about the test rows into training (and into the scaler that is
# persisted later), so the split has to happen before the scaler is fitted.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # Keeps the class proportions the same in both splits
)

# Fit the scaler on the training rows only, then apply the same transform to
# the held-out rows. `scaler` is rebound here so that the object saved to disk
# later reflects training-set statistics only.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=X_train_raw.columns,
    index=X_train_raw.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=X_test_raw.columns,
    index=X_test_raw.index,
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print(f"\nTraining set class distribution:\n{pd.Series(y_train).value_counts().sort_index()}")
print(f"\nTesting set class distribution:\n{pd.Series(y_test).value_counts().sort_index()}")

### Step 6: Train Random Forest Classifier

In [None]:
# Hyperparameters for the forest, gathered in one place
rf_params = {
    'n_estimators': 100,      # Number of trees in the forest
    'max_depth': 10,          # Maximum depth of trees
    'min_samples_split': 5,   # Minimum samples to split a node
    'min_samples_leaf': 2,    # Minimum samples in leaf node
    'random_state': 42,       # For reproducibility
    'n_jobs': -1,             # Use all available processors
}
rf_model = RandomForestClassifier(**rf_params)

# Fit the forest on the training split
print("Training the Random Forest model...")
rf_model.fit(X_train, y_train)
print("✓ Model training completed!")

# Pair each feature name with its learned importance, most important first
importance_table = pd.DataFrame({
    'Feature': selected_features,
    'Importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

### Step 7: Make Predictions

In [None]:
# Predict labels for the training split first, then for the testing split
y_train_pred, y_test_pred = (
    rf_model.predict(features) for features in (X_train, X_test)
)

print("✓ Predictions made on both training and testing sets")

### Step 8: Model Evaluation - Multiclass Classification Metrics

In [None]:
def _weighted_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, F1) using weighted averaging."""
    averaging = {'average': 'weighted', 'zero_division': 0}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **averaging),
        recall_score(y_true, y_pred, **averaging),
        f1_score(y_true, y_pred, **averaging),
    )


def _show_scores(accuracy, precision, recall, f1):
    """Print the four scores, one per line, to four decimal places."""
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")


banner = "=" * 60
print(banner)
print("MODEL EVALUATION METRICS")
print(banner)

# Scores on the data the model was fitted on
print("\n--- TRAINING SET METRICS ---")
train_accuracy, train_precision, train_recall, train_f1 = _weighted_scores(
    y_train, y_train_pred
)
_show_scores(train_accuracy, train_precision, train_recall, train_f1)

# Scores on the held-out data
print("\n--- TESTING SET METRICS ---")
test_accuracy, test_precision, test_recall, test_f1 = _weighted_scores(
    y_test, y_test_pred
)
_show_scores(test_accuracy, test_precision, test_recall, test_f1)

print("\n" + banner)

### Step 9: Detailed Classification Report

In [None]:
# Display names for the three target classes, in label order
cultivar_names = ['Cultivar 1', 'Cultivar 2', 'Cultivar 3']

print("\n--- CLASSIFICATION REPORT (Testing Set) ---\n")
report_text = classification_report(
    y_test,
    y_test_pred,
    target_names=cultivar_names,
    digits=4,
)
print(report_text)

# Confusion matrix of the test predictions (rows: actual, columns: predicted)
cm = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:")
print(cm)

### Step 10: Visualization of Results

In [None]:
# Confusion matrix heatmap, drawn through the explicit figure/axes interface
cultivar_ticks = ['Cultivar 1', 'Cultivar 2', 'Cultivar 3']
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=cultivar_ticks,
    yticklabels=cultivar_ticks,
    ax=cm_ax,
)
cm_ax.set_title('Confusion Matrix - Wine Cultivar Classification')
cm_ax.set_ylabel('Actual')
cm_ax.set_xlabel('Predicted')
cm_fig.tight_layout()
plt.show()

# Horizontal bar chart of the per-feature importances
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(
    feature_importance['Feature'],
    feature_importance['Importance'],
)
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Feature Importance in Random Forest Model')
importance_fig.tight_layout()
plt.show()

# Grouped bar chart: training vs. testing score for each metric
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
train_metrics = [train_accuracy, train_precision, train_recall, train_f1]
test_metrics = [test_accuracy, test_precision, test_recall, test_f1]

x = np.arange(len(metrics))
width = 0.35

comparison_fig, comparison_ax = plt.subplots(figsize=(10, 6))
# Training bars sit half a bar-width left of each tick, testing bars to the right
comparison_ax.bar(x - width/2, train_metrics, width, label='Training', alpha=0.8)
comparison_ax.bar(x + width/2, test_metrics, width, label='Testing', alpha=0.8)
comparison_ax.set_xlabel('Metrics')
comparison_ax.set_ylabel('Score')
comparison_ax.set_title('Model Performance Metrics Comparison')
comparison_ax.set_xticks(x)
comparison_ax.set_xticklabels(metrics)
comparison_ax.legend()
comparison_ax.set_ylim(0, 1.1)
comparison_fig.tight_layout()
plt.show()

### Step 11: Save the Trained Model to Disk

In [None]:
# Output paths for the three artifacts the web app loads
model_filename = 'wine_cultivar_model.pkl'
scaler_filename = 'scaler.pkl'
features_filename = 'features.pkl'

# (label used in the status message, object to persist, destination path)
artifacts = (
    ("Model", rf_model, model_filename),
    ("Scaler", scaler, scaler_filename),
    ("Features list", selected_features, features_filename),
)

# Persist each artifact with Joblib and report it, in the order listed above
for artifact_label, artifact, artifact_path in artifacts:
    joblib.dump(artifact, artifact_path)
    print(f"✓ {artifact_label} saved successfully as '{artifact_path}'")

### Step 12: Model Summary and Key Information

In [None]:
# Assemble the whole summary first, then emit it with a single print
divider = "=" * 60
summary_lines = [
    "\n" + divider,
    "MODEL SUMMARY",
    divider,
    "Algorithm: Random Forest Classifier",
    f"Number of Trees: {rf_model.n_estimators}",
    f"Max Depth: {rf_model.max_depth}",
    f"Number of Features: {len(selected_features)}",
    f"Selected Features: {', '.join(selected_features)}",
    "Number of Classes: 3",
    f"Training Samples: {X_train.shape[0]}",
    f"Testing Samples: {X_test.shape[0]}",
    # Leading newline leaves a blank line before the test metrics
    f"\nTest Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)",
    f"Test Precision: {test_precision:.4f}",
    f"Test Recall: {test_recall:.4f}",
    f"Test F1-Score: {test_f1:.4f}",
    divider,
]
print("\n".join(summary_lines))