In [None]:
import numpy as np
import pandas as pd

In [None]:
data = mushroom_df

In [None]:
mushroom_df = pd.read_csv(r"C:\Users\Acer\Desktop\Data Sci Assignments\SVM\mushroom.csv")

In [None]:
# Rich preview of the first five rows of the raw data
mushroom_df.head(n=5)

In [None]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(mushroom_df.head())

# Get basic information about the dataset.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare: wrapping it in print() appends a stray "None" line.
print("\nDataset Information:")
mushroom_df.info()

# Summarize the dataset's statistical properties (include='all' also covers
# the non-numeric columns: count / unique / top / freq)
print("\nDataset Summary Statistics:")
print(mushroom_df.describe(include='all'))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histograms for each feature (pandas only draws the numeric columns)
mushroom_df.hist(bins=30, figsize=(20, 15))
plt.suptitle('Histograms of Mushroom Dataset Features')
plt.show()

# Plot density plots for each feature.
# layout=(-1, 3) lets pandas derive the number of rows from the number of
# plotted columns; a fixed (8, 3) grid raises once there are more than 24
# columns and otherwise leaves most of the 24 panels empty.
mushroom_df.plot(kind='density', subplots=True, layout=(-1, 3), sharex=False, figsize=(20, 30))
plt.suptitle('Density Plots of Mushroom Dataset Features')
plt.show()

# Investigate feature correlations.
# Categorical columns are one-hot encoded first so that every column is
# numeric and can take part in the correlation matrix.
mushroom_encoded = pd.get_dummies(mushroom_df)
correlation_matrix = mushroom_encoded.corr()

# Plot the correlation matrix
plt.figure(figsize=(15, 12))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Mushroom Dataset')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar charts of the value counts of every categorical (object-dtype) feature
categorical_columns = mushroom_df.select_dtypes(include=['object']).columns

if len(categorical_columns) > 0:
    n_grid_cols = 3
    # Ceiling division: exactly as many rows as needed, no spare empty row
    n_grid_rows = -(-len(categorical_columns) // n_grid_cols)

    # squeeze=False always returns a 2-D array of axes, even for a single row,
    # so the grid can be flattened and walked uniformly.
    fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, figsize=(15, 20), squeeze=False)
    fig.suptitle('Histograms of Mushroom Dataset Categorical Features')

    flat_axes = axes.ravel()
    for ax, col in zip(flat_axes, categorical_columns):
        mushroom_df[col].value_counts().plot(kind='bar', ax=ax, title=col)

    # Hide the panels of the last row that have no feature to show
    for ax in flat_axes[len(categorical_columns):]:
        ax.set_visible(False)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# Plot density plots for numeric features
numeric_columns = mushroom_df.select_dtypes(include=['float64', 'int64']).columns

# plt.subplots rejects nrows=0, so skip the figure when nothing is numeric
if len(numeric_columns) > 0:
    # squeeze=False keeps `axes` an array even when there is only one column
    fig, axes = plt.subplots(nrows=len(numeric_columns), ncols=1, figsize=(10, 15), squeeze=False)
    fig.suptitle('Density Plots of Mushroom Dataset Numeric Features')

    for ax, col in zip(axes.ravel(), numeric_columns):
        sns.kdeplot(mushroom_df[col], ax=ax)
        ax.set_title(col)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

# Investigate feature correlations.
# Categorical columns are one-hot encoded first so that every column is
# numeric and can take part in the correlation matrix.
mushroom_encoded = pd.get_dummies(mushroom_df)
correlation_matrix = mushroom_encoded.corr()

# Plot the correlation matrix
plt.figure(figsize=(15, 12))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Mushroom Dataset')
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column of `mushroom_df` in place.
# One LabelEncoder per column is kept in `label_encoders` so the integer
# codes can be mapped back to the original labels later on.
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, le in label_encoders.items():
    mushroom_df[column] = le.fit_transform(mushroom_df[column])

# Feature matrix = every column except the target; target = 'class'
X = mushroom_df.drop(columns=['class'])
y = mushroom_df['class']

# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the shapes of the four resulting pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


In [None]:
# Visualize class distribution
plt.figure(figsize=(8, 6))
sns.countplot(data=mushroom_df, x='class')
plt.title('Class Distribution')
plt.show()

# Pair plot to visualize relationships (sampled for performance).
# The sample size is capped at the number of rows: DataFrame.sample raises
# when asked for more rows than exist (without replacement).
sampled_df = mushroom_df.sample(n=min(500, len(mushroom_df)), random_state=42)
# Two markers are supplied, one per level of the 'class' hue
sns.pairplot(sampled_df, hue='class', markers=["o", "s"])
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize class distribution
plt.figure(figsize=(8, 6))
sns.countplot(data=mushroom_df, x='class')
plt.title('Class Distribution')
plt.show()

# Pair plot to visualize relationships (sampled for performance).
# The sample size is capped at the number of rows: DataFrame.sample raises
# when asked for more rows than exist (without replacement).
sampled_df = mushroom_df.sample(n=min(500, len(mushroom_df)), random_state=42)
# Two markers are supplied, one per level of the 'class' hue
sns.pairplot(sampled_df, hue='class', markers=["o", "s"])
plt.show()


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Implement a basic SVM classifier (scikit-learn defaults)
svm_classifier = SVC()

# Train the SVM model on the training data
svm_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = svm_classifier.predict(X_test)

# Evaluate model performance; precision/recall/F1 are averaged over the
# classes weighted by their support.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the metrics instead of returning them as one tuple: inside a tuple the
# multi-line classification report is shown as a repr with escaped "\n".
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# plot_confusion_matrix was removed from scikit-learn in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Plot confusion matrix of the baseline classifier on the test set.
# The axes are created explicitly and handed to the display so the requested
# figure size is actually used (otherwise a second, empty figure is created).
fig, ax = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay.from_estimator(
    svm_classifier, X_test, y_test, cmap='Blues', values_format='d', ax=ax
)
ax.set_title('Confusion Matrix')
plt.show()

# Plot decision boundary for the two numeric features (stalk_height and cap_diameter)
def plot_decision_boundary(X, y, model, title, resolution=300):
    """Draw the decision regions of a two-feature classifier over its data.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; the first column is drawn on the x-axis and the second
        on the y-axis (accessed positionally through ``.iloc``).
    y : array-like
        One value per row of ``X``; used to colour the scatter points.
    model : fitted estimator
        Must expose ``predict`` and accept two-feature input.
    title : str
        Figure title.
    resolution : int, default 300
        Number of grid points per axis. A fixed count keeps the number of
        predictions constant (``resolution ** 2``) regardless of the feature
        ranges, whereas a fixed step makes the grid grow with the ranges.
    """
    # Pad the plotted area by one unit on every side of the data
    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    grid_points = np.c_[xx.ravel(), yy.ravel()]
    # A model fitted on a DataFrame remembers its column names; predicting with
    # the same names avoids scikit-learn's feature-name mismatch warning.
    if hasattr(model, "feature_names_in_"):
        grid_points = pd.DataFrame(grid_points, columns=model.feature_names_in_)

    # Predict a label for every grid point and fold back into the grid shape
    Z = model.predict(grid_points)
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=(10, 6))
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, edgecolors='k', marker='o')
    plt.title(title)
    plt.xlabel(X.columns[0])
    plt.ylabel(X.columns[1])
    plt.show()

# Keep just the two numeric columns of the training data so that the fitted
# model lives in a plane and its decision regions can be drawn
numeric_features = X_train.loc[:, ['stalk_height', 'cap_diameter']]
svm_numeric = SVC()
svm_numeric.fit(numeric_features, y_train)

# Draw the decision regions of the two-feature model over the training points
plot_decision_boundary(
    numeric_features,
    y_train,
    svm_numeric,
    'SVM Decision Boundary for Numeric Features',
)


In [None]:
# Imports
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Example Data Preparation
# X and y should be defined as your feature matrix and target vector
# X_train, X_test, y_train, y_test should be created using train_test_split

# Define the parameter grid.
# The linear kernel ignores `gamma`, so it gets its own sub-grid; sweeping
# gamma for it would refit the same model four times per value of C.
C_VALUES = [0.1, 1, 10, 100]
param_grid = [
    {'kernel': ['linear'], 'C': C_VALUES},
    {'kernel': ['poly', 'rbf', 'sigmoid'], 'C': C_VALUES, 'gamma': [1, 0.1, 0.01, 0.001]},
]

# Create and fit the GridSearchCV object (5-fold CV; refit=True retrains the
# best candidate on the whole training set; n_jobs=-1 runs the independent
# fits in parallel on all available cores)
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

# Get the best model and predictions
best_svm = grid.best_estimator_
y_pred_best = best_svm.predict(X_test)

# Calculate and print metrics (precision/recall/F1 weighted by class support)
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best, average='weighted')
recall_best = recall_score(y_test, y_pred_best, average='weighted')
f1_best = f1_score(y_test, y_pred_best, average='weighted')
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
class_report_best = classification_report(y_test, y_pred_best)

print("Best Parameters:", grid.best_params_)
print("Accuracy:", accuracy_best)
print("Precision:", precision_best)
print("Recall:", recall_best)
print("F1 Score:", f1_best)
print("Confusion Matrix:\n", conf_matrix_best)
print("Classification Report:\n", class_report_best)


In [None]:
# Display the results of the tuned model.
# The best hyper-parameters are read from the fitted GridSearchCV object;
# no separate `best_params` variable is ever created in this notebook.
print("Best Parameters:", grid.best_params_)
print("Accuracy:", accuracy_best)
print("Precision:", precision_best)
print("Recall:", recall_best)
print("F1 Score:", f1_best)
print("Confusion Matrix:\n", conf_matrix_best)
print("Classification Report:\n", class_report_best)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_decision_boundaries(X, y, model, resolution=300):
    """Draw the decision regions of a two-feature classifier over its data.

    Parameters
    ----------
    X : array-like of shape (n_samples, 2)
        Two-column feature data (NumPy array, DataFrame, nested list, ...).
        It is converted with ``np.asarray`` so positional slicing works for
        all of these, not only for NumPy arrays.
    y : array-like
        One value per row of ``X``; used to colour the scatter points.
    model : fitted estimator
        Must expose ``predict`` and accept two-feature input.
    resolution : int, default 300
        Number of grid points per axis. A fixed count keeps the number of
        predictions constant (``resolution ** 2``) regardless of the feature
        ranges, whereas a fixed step makes the grid grow with the ranges.
    """
    X = np.asarray(X)

    # Pad the plotted area by one unit on every side of the data
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # Predict a label for every grid point and fold back into the grid shape
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k', marker='o')
    plt.title("SVM Decision Boundaries")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()

# Decision regions can only be drawn in two dimensions, and `best_svm` was
# fitted on every feature, so it cannot predict on a two-column mesh.
# Build the 2-D training view from the two numeric measurements and fit a
# fresh SVC with the tuned hyper-parameters on that view for plotting.
X_train_reduced = X_train[['stalk_height', 'cap_diameter']].to_numpy()
svm_reduced = SVC(**grid.best_params_).fit(X_train_reduced, y_train)
plot_decision_boundaries(X_train_reduced, y_train, svm_reduced)


In [None]:
# Overlay the true and the predicted test labels, plotted against the
# position of each sample in the test set
sample_positions = np.arange(len(y_test))

plt.figure(figsize=(10, 6))
for label_values, point_colour, legend_text in (
    (y_test, 'blue', 'True labels'),
    (y_pred_best, 'red', 'Predicted labels'),
):
    plt.scatter(sample_positions, label_values, color=point_colour, label=legend_text)
plt.title('True vs Predicted Labels')
plt.xlabel('Sample index')
plt.ylabel('Label')
plt.legend()
plt.show()


In [None]:
# Compare the four SVC kernels on the held-out test set.
# One model per kernel is fitted (all other hyper-parameters at their
# defaults) and its metrics are stored in `results_dict`, so the values
# printed below actually exist when the notebook is run top to bottom.
results_dict = {}
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    kernel_model = SVC(kernel=kernel).fit(X_train, y_train)
    y_pred_kernel = kernel_model.predict(X_test)
    results_dict[kernel] = {
        'accuracy': accuracy_score(y_test, y_pred_kernel),
        'precision': precision_score(y_test, y_pred_kernel, average='weighted'),
        'recall': recall_score(y_test, y_pred_kernel, average='weighted'),
        'f1': f1_score(y_test, y_pred_kernel, average='weighted'),
    }

print("Kernel Comparison Results:")
for kernel, kernel_metrics in results_dict.items():
    print(f"Kernel: {kernel}")
    print(f"Accuracy: {kernel_metrics['accuracy']}")
    print(f"Precision: {kernel_metrics['precision']}")
    print(f"Recall: {kernel_metrics['recall']}")
    print(f"F1 Score: {kernel_metrics['f1']}")
    print()


In [None]:
# SVM strengths: effective in high-dimensional spaces, fairly robust to overfitting (margin maximization), flexible through the choice of kernel, and a clear margin of separation between classes.
# SVM weaknesses: computationally expensive to train, sensitive to feature scaling, hyperparameters (C, gamma, kernel) are challenging to tune, and the fitted model is less interpretable.

# Practical implications: well suited to high-dimensional, complex classification tasks, but may struggle with large datasets and requires careful tuning.