In [None]:
import subprocess
# Probe for an NVIDIA GPU by invoking `nvidia-smi`.
# subprocess.run() does not raise when the command exits with a non-zero
# status, so the return code is checked explicitly; only a missing or
# non-executable binary raises (FileNotFoundError is a subclass of OSError).
try:
    result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
except OSError:
    result = None

# True only when nvidia-smi exists AND reported success.
GPU_AVAILABLE = result is not None and result.returncode == 0

if GPU_AVAILABLE:
    print(result.stdout)
else:
    print("Kh√¥ng t√¨m th·∫•y GPU")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    ConfusionMatrixDisplay
)
from sklearn.calibration import CalibratedClassifierCV

# Optional GPU stack: try to import the cuML counterparts of the scikit-learn
# estimators used below. CUML_AVAILABLE records whether the import succeeded
# and is consulted later when building pipelines.
try:
    from cuml.svm import SVC as cuSVC
    from cuml.preprocessing import StandardScaler as cuStandardScaler
    from cuml.decomposition import PCA as cuPCA
    CUML_AVAILABLE = True
    print("‚úÖ cuML ƒë√£ ƒë∆∞·ª£c import th√†nh c√¥ng - S·ª≠ d·ª•ng GPU acceleration!")
except ImportError:
    # Only ImportError is handled here; any other failure while importing
    # cuML propagates.
    CUML_AVAILABLE = False
    print("‚ö†Ô∏è cuML kh√¥ng kh·∫£ d·ª•ng - S·ª≠ d·ª•ng sklearn (CPU)")

import joblib

# Report the two capability flags (GPU_AVAILABLE is set in the first cell).
print(f"\nüìä C·∫•u h√¨nh:")
print(f"   - GPU Available: {GPU_AVAILABLE}")
print(f"   - cuML Available: {CUML_AVAILABLE}")

In [None]:
print("üì• ƒêang t·∫£i d·ªØ li·ªáu MNIST...")
start_time = time()

X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False, parser='auto')

print(f"‚úÖ T·∫£i xong trong {time() - start_time:.2f} gi√¢y")
print(f"\nüìä Th√¥ng tin d·ªØ li·ªáu:")
print(f"   - Shape c·ªßa X: {X.shape}")
print(f"   - Shape c·ªßa y: {y.shape}")
print(f"   - S·ªë l∆∞·ª£ng l·ªõp: {len(np.unique(y))}")
print(f"   - C√°c l·ªõp: {np.unique(y)}")
print(f"   - Dtype c·ªßa X: {X.dtype}")
print(f"   - Range c·ªßa pixel: [{X.min()}, {X.max()}]")

In [None]:
# Display the first 20 samples (rows 0..19 of X) in a 2 x 10 grid, each
# titled with its label.
fig, axes = plt.subplots(2, 10, figsize=(15, 4))
fig.suptitle('M·ªôt s·ªë ·∫£nh m·∫´u t·ª´ MNIST', fontsize=14)

for i, ax in enumerate(axes.flat):
    # Each row of X is reshaped to a 28x28 image for display.
    ax.imshow(X[i].reshape(28, 28), cmap='gray')
    ax.set_title(f'Label: {y[i]}')
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Cast labels to integers (re-running this on integer labels is a no-op).
y = y.astype(int)

print("üîÑ Chu·∫©n h√≥a d·ªØ li·ªáu...")
X = X.astype(np.float32)
# Scale pixel values into [0, 1]. The guard keeps the cell idempotent:
# without it, re-running the cell would divide already-scaled data by 255
# a second time and silently shrink every feature.
if X.max() > 1.0:
    X = X / 255.0

print(f"   - Dtype sau chu·∫©n h√≥a: {X.dtype}")
print(f"   - Range sau chu·∫©n h√≥a: [{X.min():.2f}, {X.max():.2f}]")

In [None]:
# Hold out 10,000 samples as the test set. The split is stratified on the
# labels and seeded (random_state=42) so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=10000, 
    random_state=42,
    stratify=y
)

print(f"üìä Chia d·ªØ li·ªáu:")
print(f"   - Train: {X_train.shape[0]} m·∫´u")
print(f"   - Test: {X_test.shape[0]} m·∫´u")

# Per-class counts and percentages in the training split.
unique, counts = np.unique(y_train, return_counts=True)
print(f"\nüìà Ph√¢n b·ªë l·ªõp trong t·∫≠p train:")
for label, count in zip(unique, counts):
    print(f"   Ch·ªØ s·ªë {label}: {count} m·∫´u ({count/len(y_train)*100:.1f}%)")

In [None]:
# Toggle and size of the reduced training set used for quick experiments.
USE_SUBSET = True
SUBSET_SIZE = 10000

if not USE_SUBSET:
    # Train on everything that train_test_split left in the training split.
    X_train_subset, y_train_subset = X_train, y_train
    print(f"üìä S·ª≠ d·ª•ng to√†n b·ªô {X_train.shape[0]} m·∫´u train")
else:
    print(f"‚ö° S·ª≠ d·ª•ng t·∫≠p con {SUBSET_SIZE} m·∫´u ƒë·ªÉ th·ª≠ nghi·ªám nhanh...")
    from sklearn.model_selection import StratifiedShuffleSplit

    # A single stratified, seeded shuffle split; only its "train" side
    # (SUBSET_SIZE indices) is kept.
    splitter = StratifiedShuffleSplit(n_splits=1, train_size=SUBSET_SIZE, random_state=42)
    subset_idx, _ = next(splitter.split(X_train, y_train))
    X_train_subset = X_train[subset_idx]
    y_train_subset = y_train[subset_idx]

    print(f"   - T·∫≠p train subset: {X_train_subset.shape[0]} m·∫´u")

In [None]:
def train_svm(X_train, y_train, kernel='rbf', C=1.0, gamma='scale', 
              use_pca=False, n_components=100, use_gpu=False):
    """Build and fit a scaler -> (optional PCA) -> SVC pipeline.

    Args:
        X_train: training features passed to ``Pipeline.fit``.
        y_train: training labels passed to ``Pipeline.fit``.
        kernel: SVC kernel name.
        C: SVC regularization parameter.
        gamma: SVC kernel coefficient.
        use_pca: when true, a PCA step is inserted between scaler and SVC.
        n_components: number of PCA components (only used when ``use_pca``).
        use_gpu: request the cuML estimators; honoured only when the
            module-level ``CUML_AVAILABLE`` flag is true.

    Returns:
        ``(pipeline, train_time)`` - the fitted Pipeline and the elapsed
        time of the ``fit`` call as measured with ``time()``.
    """
    # Decide once whether the cuML implementations are used.
    on_gpu = use_gpu and CUML_AVAILABLE

    stages = [('scaler', cuStandardScaler() if on_gpu else StandardScaler())]

    if use_pca:
        reducer = (cuPCA(n_components=n_components) if on_gpu
                   else PCA(n_components=n_components))
        stages.append(('pca', reducer))

    # probability=True so the fitted model exposes predict_proba.
    if on_gpu:
        classifier = cuSVC(kernel=kernel, C=C, gamma=gamma, probability=True)
    else:
        classifier = SVC(kernel=kernel, C=C, gamma=gamma, probability=True, cache_size=1000)
    stages.append(('svc', classifier))

    pipeline = Pipeline(stages)

    # Echo the configuration before the (potentially long) fit.
    print(f"üèãÔ∏è B·∫Øt ƒë·∫ßu hu·∫•n luy·ªán SVM...")
    print(f"   - Kernel: {kernel}")
    print(f"   - C: {C}")
    print(f"   - Gamma: {gamma}")
    print(f"   - PCA: {use_pca} ({n_components} components)" if use_pca else f"   - PCA: {use_pca}")
    print(f"   - GPU: {on_gpu}")

    fit_started = time()
    pipeline.fit(X_train, y_train)
    train_time = time() - fit_started

    print(f"\n‚úÖ Hu·∫•n luy·ªán ho√†n t·∫•t trong {train_time:.2f} gi√¢y")

    return pipeline, train_time

In [None]:
# Experiment 1: RBF-kernel SVM on the training subset, no PCA.
print("="*60)
print("üéØ Hu·∫•n luy·ªán SVM v·ªõi RBF Kernel")
print("="*60)

# use_gpu follows CUML_AVAILABLE, so the cuML path is taken whenever it imported.
model_rbf, time_rbf = train_svm(
    X_train_subset, y_train_subset,
    kernel='rbf',
    C=1.0,
    gamma='scale',
    use_pca=False,
    use_gpu=CUML_AVAILABLE
)

In [None]:
# Experiment 2: linear-kernel SVM on the same subset (gamma left at the
# train_svm default).
print("\n" + "="*60)
print("üéØ Hu·∫•n luy·ªán SVM v·ªõi Linear Kernel")
print("="*60)

model_linear, time_linear = train_svm(
    X_train_subset, y_train_subset,
    kernel='linear',
    C=1.0,
    use_pca=False,
    use_gpu=CUML_AVAILABLE
)

In [None]:
# Experiment 3: RBF-kernel SVM preceded by a 100-component PCA step.
print("\n" + "="*60)
print("üéØ Hu·∫•n luy·ªán SVM v·ªõi RBF Kernel + PCA")
print("="*60)

model_pca, time_pca = train_svm(
    X_train_subset, y_train_subset,
    kernel='rbf',
    C=1.0,
    gamma='scale',
    use_pca=True,
    n_components=100,
    use_gpu=CUML_AVAILABLE
)

In [None]:
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Predict on a test set, print a report, and return the key metrics.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: test features.
        y_test: true labels for ``X_test``.
        model_name: label used in the printed header.

    Returns:
        dict with keys ``'accuracy'``, ``'predict_time'`` (elapsed time of
        the ``predict`` call), ``'y_pred'`` and ``'confusion_matrix'``.
    """
    print(f"\n{'='*60}")
    print(f"üìä ƒê√°nh gi√°: {model_name}")
    print(f"{'='*60}")

    # Only the predict() call itself is timed.
    predict_started = time()
    y_pred = model.predict(X_test)
    predict_time = time() - predict_started

    accuracy = accuracy_score(y_test, y_pred)

    print(f"\nüéØ Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"‚è±Ô∏è Th·ªùi gian d·ª± ƒëo√°n: {predict_time:.4f} gi√¢y")

    print(f"\nüìã Classification Report:")
    print(classification_report(y_test, y_pred, digits=4))

    metrics = dict(
        accuracy=accuracy,
        predict_time=predict_time,
        y_pred=y_pred,
        confusion_matrix=confusion_matrix(y_test, y_pred),
    )
    return metrics

In [None]:
# Evaluate the three trained pipelines on the same held-out test set.
results_rbf = evaluate_model(model_rbf, X_test, y_test, "SVM RBF Kernel")
results_linear = evaluate_model(model_linear, X_test, y_test, "SVM Linear Kernel")
results_pca = evaluate_model(model_pca, X_test, y_test, "SVM RBF + PCA")

In [None]:
def plot_confusion_matrix(cm, title="Confusion Matrix", labels=None):
    """Draw an annotated heatmap of a confusion matrix.

    Args:
        cm: square confusion matrix of integer counts (rows = true class,
            columns = predicted class, as labelled on the axes).
        title: figure title.
        labels: optional tick labels, one per class. When omitted they
            default to ``0 .. n_classes-1`` derived from the matrix size,
            so the function is not tied to exactly ten classes.
    """
    if labels is None:
        labels = range(len(cm))

    plt.figure(figsize=(10, 8))
    # fmt='d' prints the raw integer counts inside each cell.
    sns.heatmap(cm, annot=True, fmt='d', cmap='YlOrRd',
                xticklabels=labels, yticklabels=labels)
    plt.title(title, fontsize=14)
    plt.xlabel('D·ª± ƒëo√°n', fontsize=12)
    plt.ylabel('Th·ª±c t·∫ø', fontsize=12)
    plt.tight_layout()
    plt.show()

plot_confusion_matrix(results_rbf['confusion_matrix'], "Ma tr·∫≠n Nh·∫ßm l·∫´n - SVM RBF Kernel")

In [None]:
# Side-by-side summary of the three experiments: accuracy plus the timings
# returned by train_svm / evaluate_model.
comparison_df = pd.DataFrame({
    'Model': ['SVM RBF', 'SVM Linear', 'SVM RBF + PCA'],
    'Accuracy': [results_rbf['accuracy'], results_linear['accuracy'], results_pca['accuracy']],
    'Train Time (s)': [time_rbf, time_linear, time_pca],
    'Predict Time (s)': [results_rbf['predict_time'], results_linear['predict_time'], results_pca['predict_time']]
})

print("\nüìä So s√°nh c√°c m√¥ h√¨nh:")
print(comparison_df.to_string(index=False))

# One colour per model, shared by all three bar charts.
bar_colors = ['#3498db', '#e74c3c', '#2ecc71']

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

axes[0].bar(comparison_df['Model'], comparison_df['Accuracy'], color=bar_colors)
axes[0].set_ylabel('Accuracy')
axes[0].set_title('So s√°nh Accuracy')
# Zoom the accuracy axis to [0.9, 1.0] when every model clears 0.9, but
# lower the floor when one does not, so that its bar stays visible
# instead of being clipped away entirely.
accuracy_floor = max(0.0, min(0.9, comparison_df['Accuracy'].min() - 0.02))
axes[0].set_ylim([accuracy_floor, 1.0])
for i, v in enumerate(comparison_df['Accuracy']):
    # Numeric label just above each bar.
    axes[0].text(i, v + 0.002, f'{v:.4f}', ha='center')

axes[1].bar(comparison_df['Model'], comparison_df['Train Time (s)'], color=bar_colors)
axes[1].set_ylabel('Th·ªùi gian (s)')
axes[1].set_title('So s√°nh Th·ªùi gian Hu·∫•n luy·ªán')

axes[2].bar(comparison_df['Model'], comparison_df['Predict Time (s)'], color=bar_colors)
axes[2].set_ylabel('Th·ªùi gian (s)')
axes[2].set_title('So s√°nh Th·ªùi gian D·ª± ƒëo√°n')

plt.tight_layout()
plt.show()

In [None]:
def show_misclassified(X_test, y_test, y_pred, n_samples=10):
    """Plot up to ``n_samples`` test images whose prediction is wrong.

    Args:
        X_test: test features; each selected row is reshaped to 28x28.
        y_test: true labels.
        y_pred: predicted labels, aligned with ``y_test``.
        n_samples: maximum number of misclassified images to display.
            The grid is sized from this value (five images per row), so
            values above 10 are honoured rather than silently truncated.

    Returns:
        None. Prints a message instead of plotting when nothing is
        misclassified; does nothing when ``n_samples`` is not positive.
    """
    misclassified_idx = np.where(y_test != y_pred)[0]

    if len(misclassified_idx) == 0:
        print("Kh√¥ng c√≥ m·∫´u n√†o b·ªã ph√¢n lo·∫°i sai!")
        return

    n_show = min(n_samples, len(misclassified_idx))
    if n_show <= 0:
        # Nothing requested; avoid asking matplotlib for a zero-row grid.
        return

    n_cols = 5
    n_rows = -(-n_show // n_cols)  # ceiling division
    # squeeze=False keeps `axes` two-dimensional even for a single row;
    # 3 inches per row reproduces the 15x6 figure for the default 2x5 grid.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 3 * n_rows), squeeze=False)
    fig.suptitle(f'C√°c m·∫´u b·ªã ph√¢n lo·∫°i sai ({len(misclassified_idx)} m·∫´u)', fontsize=14)

    for i, ax in enumerate(axes.flat):
        if i < n_show:
            idx = misclassified_idx[i]
            ax.imshow(X_test[idx].reshape(28, 28), cmap='gray')
            ax.set_title(f'Th·ª±c: {y_test[idx]}\nD·ª± ƒëo√°n: {y_pred[idx]}', 
                        color='red', fontsize=10)
        # Unused cells in the last row are left blank.
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Inspect the errors of the RBF model using the predictions stored by evaluate_model.
show_misclassified(X_test, y_test, results_rbf['y_pred'])

In [None]:
print("üîç B·∫Øt ƒë·∫ßu Grid Search...")
print("‚ö†Ô∏è Qu√° tr√¨nh n√†y c√≥ th·ªÉ m·∫•t v√†i ph√∫t...\n")

pipeline_grid = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True, cache_size=1000))
])

param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 0.01, 0.1],
    'svc__kernel': ['rbf', 'linear']
}

n_grid_samples = min(5000, len(X_train_subset))
X_grid = X_train_subset[:n_grid_samples]
y_grid = y_train_subset[:n_grid_samples]

print(f"üìä S·ª≠ d·ª•ng {n_grid_samples} m·∫´u cho GridSearch")
print(f"üìä S·ªë l∆∞·ª£ng k·∫øt h·ª£p tham s·ªë: {len(param_grid['svc__C']) * len(param_grid['svc__gamma']) * len(param_grid['svc__kernel'])}")

grid_search = GridSearchCV(
    pipeline_grid,
    param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
    return_train_score=True
)

start_time = time()
grid_search.fit(X_grid, y_grid)
grid_time = time() - start_time

print(f"\n‚úÖ GridSearch ho√†n t·∫•t trong {grid_time:.2f} gi√¢y")

In [None]:
print("\nüìä K·∫øt qu·∫£ GridSearch:")
print(f"   - Best Score (CV): {grid_search.best_score_:.4f}")
print(f"   - Best Parameters: {grid_search.best_params_}")

results_df = pd.DataFrame(grid_search.cv_results_)
results_df = results_df.sort_values('rank_test_score')[[
    'params', 'mean_test_score', 'std_test_score', 'mean_train_score', 'rank_test_score'
]].head(10)

print("\nüèÜ Top 10 k·∫øt h·ª£p tham s·ªë:")
print(results_df.to_string(index=False))

In [None]:
print("\nüèãÔ∏è Hu·∫•n luy·ªán m√¥ h√¨nh t·ªëi ∆∞u v·ªõi tham s·ªë t·ªët nh·∫•t...")

best_params = grid_search.best_params_

model_best, time_best = train_svm(
    X_train_subset, y_train_subset,
    kernel=best_params['svc__kernel'],
    C=best_params['svc__C'],
    gamma=best_params['svc__gamma'] if 'svc__gamma' in best_params else 'scale',
    use_pca=False,
    use_gpu=CUML_AVAILABLE
)

results_best = evaluate_model(model_best, X_test, y_test, "SVM T·ªëi ∆∞u (Best Params)")

In [None]:
print("\nüìä Th·ª≠ nghi·ªám v·ªõi PCA:")

pca_components = [50, 100, 150, 200]
pca_results = []

for n_comp in pca_components:
    print(f"\nüîÑ Hu·∫•n luy·ªán v·ªõi PCA n_components={n_comp}")
    
    model_pca_test, train_time = train_svm(
        X_train_subset, y_train_subset,
        kernel='rbf',
        C=best_params.get('svc__C', 1.0),
        gamma='scale',
        use_pca=True,
        n_components=n_comp,
        use_gpu=CUML_AVAILABLE
    )
    
    y_pred = model_pca_test.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    pca_results.append({
        'n_components': n_comp,
        'accuracy': accuracy,
        'train_time': train_time
    })
    print(f"   Accuracy: {accuracy:.4f}")

pca_df = pd.DataFrame(pca_results)
print("\nüìä So s√°nh c√°c c·∫•u h√¨nh PCA:")
print(pca_df.to_string(index=False))

In [None]:
# Two panels from pca_df: accuracy (left) and training time (right), both
# against the number of PCA components.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].plot(pca_df['n_components'], pca_df['accuracy'], 'bo-', markersize=8)
axes[0].set_xlabel('S·ªë th√†nh ph·∫ßn PCA')
axes[0].set_ylabel('Accuracy')
axes[0].set_title('Accuracy vs S·ªë th√†nh ph·∫ßn PCA')
axes[0].grid(True, alpha=0.3)

axes[1].plot(pca_df['n_components'], pca_df['train_time'], 'ro-', markersize=8)
axes[1].set_xlabel('S·ªë th√†nh ph·∫ßn PCA')
axes[1].set_ylabel('Th·ªùi gian hu·∫•n luy·ªán (s)')
axes[1].set_title('Th·ªùi gian hu·∫•n luy·ªán vs S·ªë th√†nh ph·∫ßn PCA')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
print("="*60)
print("üèÜ Hu·∫•n luy·ªán M√¥ h√¨nh Cu·ªëi c√πng")
print("="*60)

# Final CPU (scikit-learn) pipeline built from the grid-search winner.
# random_state is fixed because probability=True fits the probability
# model with an internal shuffled cross-validation; without a seed the
# probabilities exported further down would differ from run to run.
final_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(
        kernel=best_params['svc__kernel'],
        C=best_params['svc__C'],
        gamma=best_params.get('svc__gamma', 'scale'),
        probability=True,
        cache_size=1000,
        random_state=42
    ))
])

print(f"\nüìä C·∫•u h√¨nh m√¥ h√¨nh cu·ªëi c√πng:")
print(f"   - Kernel: {best_params['svc__kernel']}")
print(f"   - C: {best_params['svc__C']}")
print(f"   - Gamma: {best_params.get('svc__gamma', 'scale')}")

# Fit on the training subset and time the fit.
start_time = time()
final_model.fit(X_train_subset, y_train_subset)
final_train_time = time() - start_time

print(f"\n‚úÖ Hu·∫•n luy·ªán ho√†n t·∫•t trong {final_train_time:.2f} gi√¢y")

final_results = evaluate_model(final_model, X_test, y_test, "M√¥ h√¨nh Cu·ªëi c√πng")

In [None]:
print("\nüì§ Xu·∫•t ƒë·∫ßu ra cho Ensemble:")

proba = final_model.predict_proba(X_test)
pred = final_model.predict(X_test)

print(f"\nüìä Shape c·ªßa x√°c su·∫•t: {proba.shape}")
print(f"   - M·ªói h√†ng l√† m·ªôt m·∫´u")
print(f"   - M·ªói c·ªôt l√† x√°c su·∫•t cho ch·ªØ s·ªë 0-9")

print(f"\nüìã V√≠ d·ª• 5 m·∫´u ƒë·∫ßu ti√™n:")
sample_output = pd.DataFrame(
    proba[:5],
    columns=[f'P(digit={i})' for i in range(10)]
)
sample_output['Predicted'] = pred[:5]
sample_output['Actual'] = y_test[:5]
print(sample_output.to_string(index=False))

In [None]:
print("\nüíæ L∆∞u ƒë·∫ßu ra...")

ensemble_output = pd.DataFrame(proba, columns=[f'prob_digit_{i}' for i in range(10)])
ensemble_output['predicted_label'] = pred
ensemble_output['true_label'] = y_test

ensemble_output.to_csv('svm_predictions_for_ensemble.csv', index=False)
print("‚úÖ ƒê√£ l∆∞u: svm_predictions_for_ensemble.csv")

np.save('svm_probabilities.npy', proba)
print("‚úÖ ƒê√£ l∆∞u: svm_probabilities.npy")

np.save('svm_predictions.npy', pred)
print("‚úÖ ƒê√£ l∆∞u: svm_predictions.npy")

In [None]:
print("\nüíæ L∆∞u m√¥ h√¨nh...")

joblib.dump(final_model, 'svm_digit_classifier.joblib')
print("‚úÖ ƒê√£ l∆∞u m√¥ h√¨nh: svm_digit_classifier.joblib")

In [None]:
# Closing summary: final accuracy, chosen hyper-parameters, and the files
# written by the preceding cells.
print("="*60)
print("üìä T·ªîNG K·∫æT K·∫æT QU·∫¢")
print("="*60)

print(f"\nüéØ M√¥ h√¨nh cu·ªëi c√πng:")
print(f"   - Accuracy: {final_results['accuracy']:.4f} ({final_results['accuracy']*100:.2f}%)")
print(f"   - Kernel: {best_params['svc__kernel']}")
print(f"   - C: {best_params['svc__C']}")
print(f"   - Gamma: {best_params.get('svc__gamma', 'scale')}")

print(f"\nüìÅ C√°c file ƒë√£ l∆∞u:")
print("   - svm_digit_classifier.joblib (m√¥ h√¨nh)")
print("   - svm_predictions_for_ensemble.csv (ƒë·∫ßu ra cho ensemble)")
print("   - svm_probabilities.npy (x√°c su·∫•t d·ª± ƒëo√°n)")
print("   - svm_predictions.npy (nh√£n d·ª± ƒëo√°n)")

print(f"\n‚úÖ Ho√†n t·∫•t!")

In [None]:
# Row-normalize the final model's confusion matrix so that each row (true
# class) sums to 1, then draw and save it.
cm = final_results['confusion_matrix']
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype('float') / row_totals

plt.figure(figsize=(12, 10))

# fmt='.2%' renders each fraction as a percentage with two decimals.
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt='.2%',
    cmap='Blues',
    xticklabels=range(10),
    yticklabels=range(10),
    cbar_kws={'label': 'T·ª∑ l·ªá'},
)

plt.title('Ma tr·∫≠n Nh·∫ßm l·∫´n (Normalized) - M√¥ h√¨nh SVM Cu·ªëi c√πng', fontsize=14)
plt.xlabel('D·ª± ƒëo√°n', fontsize=12)
plt.ylabel('Th·ª±c t·∫ø', fontsize=12)
plt.tight_layout()
# Save before show(): the figure is written to disk first.
plt.savefig('confusion_matrix.png', dpi=150, bbox_inches='tight')
print("‚úÖ ƒê√£ l∆∞u: confusion_matrix.png")
plt.show()

In [None]:
def predict_digit(model, image):
    """Classify a single digit image and report the model's confidence.

    Args:
        model: fitted classifier exposing ``predict`` and ``predict_proba``.
            If it also exposes ``classes_``, that attribute is used to find
            the probability column belonging to the predicted label.
        image: array-like holding ONE image in any shape (e.g. 28x28 or
            already flat); it is flattened into a single row.

    Returns:
        dict with
            ``'prediction'``    - the label returned by ``model.predict``,
            ``'confidence'``    - the probability assigned to that label,
            ``'probabilities'`` - the full probability vector for the image.
    """
    # Any input shape collapses to one sample row.
    image = np.asarray(image).reshape(1, -1)

    # Heuristic: values above 1 are treated as raw 0-255 intensities and
    # scaled into [0, 1].
    if image.max() > 1:
        image = image.astype(np.float32) / 255.0

    pred = model.predict(image)[0]
    proba = model.predict_proba(image)[0]

    # Probability columns follow model.classes_, not the label values, so
    # the label has to be mapped to its column. Indexing with the label
    # itself is only correct when the classes are exactly 0..n-1 in order.
    column = pred
    classes = getattr(model, 'classes_', None)
    if classes is not None:
        matches = np.flatnonzero(np.asarray(classes) == pred)
        if matches.size:
            column = matches[0]

    return {
        'prediction': pred,
        'confidence': proba[column],
        'probabilities': proba
    }

# Demo: run predict_digit on the first test sample and compare with its label.
test_image = X_test[0]
result = predict_digit(final_model, test_image)

print(f"üîÆ D·ª± ƒëo√°n: {result['prediction']}")
print(f"üìä ƒê·ªô tin c·∫≠y: {result['confidence']:.4f}")
print(f"üéØ Nh√£n th·ª±c t·∫ø: {y_test[0]}")

plt.figure(figsize=(8, 4))

# Left panel: the image itself, titled with predicted vs. true label.
plt.subplot(1, 2, 1)
plt.imshow(test_image.reshape(28, 28), cmap='gray')
plt.title(f'D·ª± ƒëo√°n: {result["prediction"]} (Th·ª±c t·∫ø: {y_test[0]})')
plt.axis('off')

# Right panel: bar chart of the ten returned probabilities.
plt.subplot(1, 2, 2)
plt.bar(range(10), result['probabilities'])
plt.xlabel('Ch·ªØ s·ªë')
plt.ylabel('X√°c su·∫•t')
plt.title('Ph√¢n b·ªë x√°c su·∫•t')
plt.xticks(range(10))

plt.tight_layout()
plt.show()