In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from scipy import stats

In [None]:
# Load the wine dataset from scikit-learn and wrap it in pandas objects:
# X holds the feature matrix with named columns, y the integer class labels.
wine = load_wine()
X = pd.DataFrame(data=wine.data, columns=wine.feature_names)
y = pd.Series(data=wine.target, name='target')

In [None]:
# Exploratory Data Analysis: annotated heatmap of pairwise feature
# correlations, written to disk and then closed (not shown inline).
corr_fig, corr_ax = plt.subplots(figsize=(15, 12))
sns.heatmap(X.corr(), annot=True, cmap='coolwarm', linewidths=0.5, ax=corr_ax)
corr_ax.set_title('Correlation Heatmap of Wine Features')
corr_fig.tight_layout()
corr_fig.savefig('correlation_heatmap.png')
plt.close(corr_fig)

In [None]:
# Distribution of features: one histogram with a KDE overlay per column of X.
# The grid is sized from the number of columns (4 panels per row) rather than
# hard-coded, so it still works if X gains columns, and panels that have no
# feature to show are hidden instead of being saved as empty axes.
panels_per_row = 4
n_features = len(X.columns)
n_rows = max(1, (n_features + panels_per_row - 1) // panels_per_row)
fig, axes = plt.subplots(n_rows, panels_per_row,
                         figsize=(5 * panels_per_row, 5 * n_rows),
                         squeeze=False)
axes = axes.ravel()
for i, column in enumerate(X.columns):
    sns.histplot(data=X, x=column, ax=axes[i], kde=True)
    axes[i].set_title(f'Distribution of {column}')
# Blank out any trailing panels left over after the last feature.
for unused_ax in axes[n_features:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.savefig('feature_distributions.png')
plt.close(fig)

In [None]:
# Feature Engineering: append ratio features to X in place.
# Each entry maps the new column name to its (numerator, denominator) columns.
ratio_features = {
    'alcohol_to_malic_acid_ratio': ('alcohol', 'malic_acid'),
    'color_intensity_to_hue_ratio': ('color_intensity', 'hue'),
}
for ratio_name, (numerator, denominator) in ratio_features.items():
    X[ratio_name] = X[numerator].div(X[denominator])

In [None]:
# Split the data: hold out 20% of the rows for testing, with a fixed seed.
# stratify=y keeps the class proportions the same in both splits, so the
# small test set cannot end up with a class badly under-represented.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Feature Scaling: fit the scaler on the training split only, then apply the
# same fitted transformation to both splits so no test statistics leak in.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Polynomial Features: degree-2 expansion of the scaled features, without the
# constant bias column. Fitted on the training split and reused for the test split.
poly = PolynomialFeatures(include_bias=False, degree=2)
poly.fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

In [None]:
# Feature Selection: keep the 10 expanded features that score highest under
# f_classif, scored against the training labels only.
selector = SelectKBest(score_func=f_classif, k=10).fit(X_train_poly, y_train)
X_train_selected = selector.transform(X_train_poly)
X_test_selected = selector.transform(X_test_poly)

In [None]:
# Model Definition: the four base classifiers, all seeded identically.
RANDOM_STATE = 42
logistic = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
# probability=True lets the SVM expose predict_proba, which soft voting needs.
svm = SVC(probability=True, random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)

In [None]:
# Hyperparameter Tuning: candidate values to search, keyed by the same short
# model names used when the grid search is run.
c_values = [0.1, 1, 10]
n_estimators_values = [50, 100, 200]
param_grid = dict(
    logistic={'C': list(c_values)},
    svm={'C': list(c_values), 'kernel': ['rbf', 'linear']},
    rf={'n_estimators': list(n_estimators_values), 'max_depth': [None, 10, 20]},
    gb={'n_estimators': list(n_estimators_values), 'learning_rate': [0.01, 0.1, 0.5]},
)

In [None]:
# Run a 5-fold, accuracy-scored grid search for every base model on the
# selected training features and keep each search's best estimator.
candidate_models = {'logistic': logistic, 'svm': svm, 'rf': rf, 'gb': gb}
best_models = {}
for name, model in candidate_models.items():
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid[name],
        scoring='accuracy',
        cv=5,
    )
    best_models[name] = grid_search.fit(X_train_selected, y_train).best_estimator_

In [None]:
# Ensemble Method: soft-voting classifier built from the four tuned models,
# fitted on the selected training features.
ensemble_members = [
    (label, best_models[label]) for label in ('logistic', 'svm', 'rf', 'gb')
]
ensemble = VotingClassifier(estimators=ensemble_members, voting='soft')
ensemble.fit(X_train_selected, y_train)

In [None]:
# Model Evaluation: test-set accuracy for each tuned model plus the ensemble.
models = dict(best_models)
models['ensemble'] = ensemble
for name, model in models.items():
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name.capitalize()} Accuracy: {accuracy:.4f}")

In [None]:
# Cross-validation: 5-fold accuracy on the training split for every model.
# The statements need their own loop here: as a stand-alone cell an indented
# body with no enclosing statement is an IndentationError, and it would
# otherwise depend on `name`/`model` left over from another cell.
# NOTE: X_train_selected comes from a selector fitted on the whole training
# split, so these fold scores are likely to be somewhat optimistic.
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train_selected, y_train, cv=5)
    print(f"{name.capitalize()} Cross-validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

In [None]:
# Detailed per-class report for the ensemble model on the test split.
y_pred_ensemble = ensemble.predict(X_test_selected)
print("\nClassification Report for Ensemble Model:")
ensemble_report = classification_report(
    y_test, y_pred_ensemble, target_names=wine.target_names
)
print(ensemble_report)

In [None]:
# Confusion Matrix: counts of actual vs. predicted classes for the ensemble's
# test predictions, drawn as an annotated heatmap and saved to disk.
cm = confusion_matrix(y_test, y_pred_ensemble)
cm_fig, cm_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix for Ensemble Classifier')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_fig.savefig('confusion_matrix.png')
plt.close(cm_fig)

In [None]:
# Feature Importance
# The random forest was fitted on X_train_selected, i.e. the columns that
# SelectKBest kept out of the polynomial expansion. Its importances therefore
# line up with those selected polynomial terms, not with the raw columns of X
# (using X.columns here fails with a length mismatch). Recover the matching
# names by expanding the raw column names through `poly` and masking them with
# the selector's support.
poly_feature_names = poly.get_feature_names_out(X.columns)
selected_feature_names = poly_feature_names[selector.get_support()]

feature_importance = pd.DataFrame({
    'feature': selected_feature_names,
    'importance': best_models['rf'].feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance in Random Forest Classifier')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

In [None]:
# PCA Visualization: project the selected training features onto two
# principal components and colour each point by its class label.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_selected)

pca_fig, pca_ax = plt.subplots(figsize=(12, 8))
scatter = pca_ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y_train, cmap='viridis')
pca_ax.set_title('PCA of Wine Dataset')
pca_ax.set_xlabel('First Principal Component')
pca_ax.set_ylabel('Second Principal Component')
pca_fig.colorbar(scatter, ax=pca_ax)
pca_fig.savefig('pca_visualization.png')
plt.close(pca_fig)

In [None]:
# Learning Curves
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Opens a new 10x6 figure, runs ``learning_curve`` for ``estimator`` on
    ``(X, y)`` and draws, for both the training and the cross-validation
    scores, the mean across folds as a line with a +/- one standard deviation
    band around it.

    Parameters
    ----------
    estimator : estimator passed through to ``learning_curve``.
    title : str used as the figure title.
    X, y : data and targets passed through to ``learning_curve``.
    ylim : optional pair unpacked into ``plt.ylim``; skipped when None.
    cv, n_jobs, train_sizes : passed through to ``learning_curve`` unchanged.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure current, so the
    caller can save or show it.
    """
    plt.figure(figsize=(10, 6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fit_scores, validation_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    plt.grid()

    # (mean, std, colour, legend label) per curve; statistics are taken across
    # the fold axis so there is one value per training-set size.
    curves = [
        (np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
        for scores, colour, label in (
            (fit_scores, "r", "Training score"),
            (validation_scores, "g", "Cross-validation score"),
        )
    ]

    # Draw both shaded bands first, then both lines, in the original order.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

# Draw the ensemble's learning curve (5-fold CV on the selected training
# features), save it, and close the figure like every other plotting cell so
# it does not stay open and accumulate in memory.
plot_learning_curve(ensemble, "Learning Curve for Ensemble Classifier",
                    X_train_selected, y_train, cv=5)
plt.savefig('learning_curve.png')
plt.close()

In [None]:
# ROC Curve
from sklearn.metrics import roc_curve, auc
from itertools import cycle

plt.figure(figsize=(10, 8))
# Score the test set once up front; the probabilities do not change between
# classes, so there is no need to re-run the whole ensemble per iteration.
test_proba = ensemble.predict_proba(X_test_selected)
# One-vs-rest ROC curve per class: column i of the probabilities is scored
# against "label == i" via pos_label.
for i, color in zip(range(3), ['blue', 'red', 'green']):
    fpr, tpr, _ = roc_curve(y_test, test_proba[:, i], pos_label=i)
    plt.plot(fpr, tpr, color=color, lw=2,
             label=f'ROC curve of class {i} (AUC = {auc(fpr, tpr):.2f})')

# Diagonal reference line (TPR == FPR).
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.savefig('roc_curve.png')
plt.close()

print("All visualizations have been saved as PNG files.")

In [None]:
# Statistical Tests
# ANOVA test to check if there are significant differences between classes for each feature.
# f_classif scores every column independently, so a single call on the whole
# frame replaces the per-column loop of separate calls.
f_statistics, p_values = f_classif(X, y)
anova_results = {
    feature: {'F-statistic': f_stat, 'p-value': p_val}
    for feature, f_stat, p_val in zip(X.columns, f_statistics, p_values)
}

# One row per feature, most significant (smallest p-value) first.
anova_df = pd.DataFrame(anova_results).T
anova_df = anova_df.sort_values('p-value')
print("\nANOVA Test Results:")
print(anova_df)

In [None]:
# Shapiro-Wilk test for normality, run separately on every column of X.
# Only the p-value (second element of the result) is kept per feature.
normality_results = {
    feature: stats.shapiro(X[feature])[1] for feature in X.columns
}

# One row per feature, smallest p-value first.
normality_df = (
    pd.DataFrame.from_dict(normality_results, orient='index', columns=['p-value'])
    .sort_values('p-value')
)
print("\nShapiro-Wilk Test for Normality:")
print(normality_df)