<a href="https://colab.research.google.com/github/vaibhavmishra03/-datasciencecoursera/blob/main/parameteroptimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import random
import time
import warnings
warnings.filterwarnings('ignore')


In [9]:
# Seed both random number generators this notebook draws from -- Python's
# `random` module and NumPy's legacy global generator -- with one shared value
# so that repeated runs produce the same random draws.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [10]:
print("Loading the Covertype dataset from UCI repository...")

# Download version 1 of the OpenML dataset named "covertype" as a
# (feature DataFrame, target Series) pair.
# NOTE(review): the run recorded with this notebook printed an original shape of
# (110393, 54) for this call -- confirm which Covertype variant OpenML serves here.
X_full, y_full = fetch_openml(name='covertype', version=1, as_frame=True, return_X_y=True)
print(f"Original dataset shape: {X_full.shape}")

# Keep 25,000 distinct rows (sampled without replacement) so the working set
# falls inside the required 5k-30k range. The same positions index X and y.
n_rows_available = X_full.shape[0]
subset_indices = np.random.choice(n_rows_available, size=25000, replace=False)
X, y = X_full.iloc[subset_indices], y_full.iloc[subset_indices]
print(f"Subset dataset shape: {X.shape}")

# Basic data analytics of the selected subset
print("\nDataset Analytics:")
n_features = X.shape[1]
print(f"Number of features: {n_features}")
class_labels = np.unique(y)
print(f"Number of classes: {len(class_labels)}")
class_counts = pd.Series(y).value_counts().to_dict()
print(f"Class distribution: {class_counts}")


Loading the Covertype dataset from UCI repository...
Original dataset shape: (110393, 54)
Subset dataset shape: (25000, 54)

Dataset Analytics:
Number of features: 54
Number of classes: 7
Class distribution: {'Lodgepole_Pine': 11710, 'Spruce_Fir': 8848, 'Ponderosa_Pine': 1617, 'Krummholz': 1028, 'Douglas_fir': 885, 'Aspen': 584, 'Cottonwood_Willow': 328}


In [11]:
# Containers filled here and by the later cells:
#   samples -- one (X_train_scaled, X_test_scaled, y_train, y_test) tuple per split
#   results -- the columns of the final report, one entry per split
samples = []
results = {
    'Sample #': [],
    'Best Accuracy': [],
    'Best SVM Parameters': [],
    'Convergence': [],
}

print("\nCreating 10 different train-test splits (70-30)...")
for split_no in range(1, 11):
    # Each split gets its own seed (0, 10, ..., 90) and is stratified on y.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=(split_no - 1) * 10
    )

    # Standardize the features: the scaler is fitted on the training part only
    # and the same fitted scaler is then applied to the test part.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    samples.append((X_train_scaled, X_test_scaled, y_train, y_test))
    results['Sample #'].append(f"S{split_no}")



Creating 10 different train-test splits (70-30)...


In [None]:
# Parameter ranges for optimization
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
C_values = np.logspace(-3, 3, 7)  # [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gamma_values = np.logspace(-4, 0, 5)  # [0.0001, 0.001, 0.01, 0.1, 1]
degree_values = [2, 3, 4]
coef0_values = [0.0, 0.1, 0.5, 1.0]

N_ITERATIONS = 100    # random configurations tried per sample
PROGRESS_EVERY = 10   # print the running best every this many iterations


def _draw_svm_params():
    """Draw one random SVC configuration from the ranges defined above.

    Values are drawn in a fixed order (kernel, C, gamma, degree, coef0) so the
    sequence of configurations is determined by the state of ``random``.
    Only the hyper-parameters relevant to the drawn kernel are included.

    Returns:
        dict: keyword arguments for ``SVC``.
    """
    kernel = random.choice(kernels)
    params = {'kernel': kernel, 'C': random.choice(C_values)}
    if kernel in ('poly', 'rbf', 'sigmoid'):
        params['gamma'] = random.choice(gamma_values)
    if kernel == 'poly':
        params['degree'] = random.choice(degree_values)
    if kernel in ('poly', 'sigmoid'):
        params['coef0'] = random.choice(coef0_values)
    return params


def _format_svm_params(params):
    """Render a parameter dict as the one-line string stored in the results table.

    An empty dict (no configuration was ever trained successfully) yields a
    placeholder instead of raising ``KeyError``.
    """
    if not params:
        return "N/A (no configuration trained successfully)"
    parts = [f"Kernel: {params['kernel']}", f"C: {params['C']:.4f}"]
    if 'gamma' in params:
        parts.append(f"Gamma: {params['gamma']:.4f}")
    if 'degree' in params:
        parts.append(f"Degree: {params['degree']}")
    if 'coef0' in params:
        parts.append(f"Coef0: {params['coef0']:.2f}")
    return ", ".join(parts)


def _optimize_sample(X_train, X_test, y_train, y_test):
    """Random-search SVC hyper-parameters on a single train/test split.

    Each iteration fits an SVC on the training part and scores it on the test
    part; the configuration with the highest test accuracy is kept.

    Returns:
        tuple: ``(best_accuracy, best_params, convergence_history)`` where
        ``convergence_history[k]`` is the best accuracy seen after iteration
        ``k + 1`` and ``best_params`` is ``{}`` if every fit failed.
    """
    best_accuracy = 0
    best_params = {}
    convergence_history = []

    for iteration in range(N_ITERATIONS):
        param_dict = _draw_svm_params()
        svm = SVC(**param_dict, random_state=42)
        try:
            svm.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, svm.predict(X_test))
        except Exception as e:
            # Best effort: report the failing configuration and keep searching.
            print(f"  Error in iteration {iteration+1}: {e}")
        else:
            # `not best_params` records the first successful fit even if its
            # accuracy is exactly 0, so a trained configuration is never lost.
            if not best_params or accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = param_dict

        # One history entry per iteration, whether or not the fit succeeded.
        convergence_history.append(best_accuracy)

        # Print progress at a fixed cadence, including after failed iterations.
        if (iteration + 1) % PROGRESS_EVERY == 0:
            print(f"  Iteration {iteration+1}: Current best accuracy = {best_accuracy:.4f}")

    return best_accuracy, best_params, convergence_history


# Start from empty result columns so that re-running this cell replaces the
# previous rows instead of appending a second set next to 'Sample #'.
for column in ('Best Accuracy', 'Best SVM Parameters', 'Convergence'):
    results[column] = []

max_overall_accuracy = 0
best_overall_sample = 0

for idx, (X_train, X_test, y_train, y_test) in enumerate(samples):
    print(f"\nOptimizing Sample {idx+1}...")
    best_accuracy, best_params, convergence_history = _optimize_sample(
        X_train, X_test, y_train, y_test
    )

    # Store results
    results['Best Accuracy'].append(f"{best_accuracy:.4f}")
    results['Best SVM Parameters'].append(_format_svm_params(best_params))
    results['Convergence'].append(convergence_history)

    # Track the overall best sample
    if best_accuracy > max_overall_accuracy:
        max_overall_accuracy = best_accuracy
        best_overall_sample = idx


Optimizing Sample 1...


In [6]:
# Assemble the results table used in the report. `results_df` was referenced
# below without being defined anywhere in the notebook, which raises NameError
# on a fresh kernel. The per-iteration 'Convergence' lists are left out of it.
table_columns = ['Sample #', 'Best Accuracy', 'Best SVM Parameters']
column_lengths = {col: len(results[col]) for col in table_columns}
if len(set(column_lengths.values())) != 1:
    # Happens when the optimization cell did not finish for every sample.
    raise ValueError(f"Incomplete optimization results; column lengths differ: {column_lengths}")
results_df = pd.DataFrame({col: results[col] for col in table_columns})

# Step 5: Plot convergence graph for the sample with maximum accuracy
best_sample_idx = best_overall_sample
best_convergence = results['Convergence'][best_sample_idx]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(1, len(best_convergence) + 1), best_convergence)
ax.set_xlabel('Iteration')
ax.set_ylabel('Accuracy')
ax.set_title(f'Fitness (bestAccuracy) - Sample {best_sample_idx + 1}')
ax.grid(True, linestyle='--', alpha=0.7)
fig.savefig('convergence_graph.png', dpi=300, bbox_inches='tight')
plt.close(fig)  # the figure is only written to disk, not displayed inline

print(f"\nConvergence graph saved for Sample {best_sample_idx+1} (best overall performance)")
print(f"Best accuracy achieved: {max_overall_accuracy:.4f}")

# Step 6: Generate markdown report for GitHub
# NOTE: DataFrame.to_markdown depends on the optional `tabulate` package.
github_report = f"""
# SVM Parameter Optimization

## Dataset Information
- **Dataset**: Covertype dataset from UCI repository
- **Subset size**: 25,000 samples (from original {X_full.shape[0]} samples)
- **Features**: {X.shape[1]}
- **Classes**: {len(np.unique(y))}
- **Class distribution**: {pd.Series(y).value_counts().to_dict()}

## Methodology
- Created 10 different train-test splits (70-30)
- Optimized SVM parameters for each sample through 100 iterations
- Parameters optimized: kernel type, C, gamma, degree, coef0

## Results

### Table 1: Comparative performance of Optimized-SVM with different samples

{results_df.to_markdown(index=False)}

### Figure 1: Convergence graph of best SVM

![Convergence Graph](convergence_graph.png)

## Analysis
- Best performing sample: Sample {best_sample_idx+1}
- Best accuracy achieved: {max_overall_accuracy:.4f}
- The convergence graph shows how the accuracy improved over iterations for the best sample.

## Conclusion
The SVM parameter optimization process identified optimal parameters for classifying the Covertype dataset. The best configuration achieved {max_overall_accuracy:.4f} accuracy, demonstrating the importance of parameter tuning for SVM models.
"""

# Explicit encoding keeps the file identical across platforms with different
# default locales.
with open('README.md', 'w', encoding='utf-8') as f:
    f.write(github_report)

print("\nGitHub README.md report generated!")
print("\nAssignment complete! All required components have been implemented:")
print("✓ Multi-class dataset from UCI library with size between 5k-30k rows")
print("✓ 10 different 70-30 train-test splits")
print("✓ SVM optimization with 100 iterations per sample")
print("✓ Results table with best parameters for each sample")
print("✓ Convergence graph for the best performing sample")
print("✓ GitHub report with basic data analytics")

Unnamed: 0_level_0,Unnamed: 1_level_0,Silhouette,Calinski-Harabasz,Davies-Bouldin
Preprocessing,Clusters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No Processing,3,0.551,562.0,0.67
No Processing,4,0.498,530.0,0.75
No Processing,5,0.493,495.0,0.82
Normalization,3,0.483,351.0,0.79
Normalization,4,0.444,314.0,0.91
Normalization,5,0.423,263.0,0.99
Transform,3,0.49,162.0,0.82
Transform,4,0.386,209.0,0.88
Transform,5,0.369,170.0,0.89
PCA,3,0.598,694.0,0.56


Unnamed: 0_level_0,Unnamed: 1_level_0,Silhouette,Calinski-Harabasz,Davies-Bouldin
Preprocessing,Clusters,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No Processing,3,0.554,558.0,0.66
No Processing,4,0.489,515.0,0.8
No Processing,5,0.484,488.0,0.82
Normalization,3,0.505,349.0,0.75
Normalization,4,0.433,301.0,0.85
Normalization,5,0.349,272.0,0.91
Transform,3,0.478,225.0,0.74
Transform,4,0.427,214.0,0.9
Transform,5,0.357,202.0,0.92
PCA,3,0.598,689.0,0.56


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin
No Processing,0.686,510.0,0.39
Normalization,0.477,290.0,0.76
Transform,0.342,140.0,0.78
PCA,0.562,615.0,0.56
T+N,0.399,222.0,0.86
T+N+PCA,0.404,227.0,0.72
