## a) Generate the synthetic dataset and split it into train/validation sets

In [15]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split


# Settings for the synthetic binary-classification problem.
dataset_spec = {
    "n_samples": 1000,    # number of observations (rows)
    "n_features": 5,      # total number of feature columns
    "n_informative": 3,   # how many of those columns are informative
    "n_classes": 2,       # binary target
    "random_state": 999,  # fixed seed so the generated data is repeatable
}

# X holds the feature matrix, y the class labels.
X, y = make_classification(**dataset_spec)

In [16]:
import pandas as pd

# Wrap the feature matrix in a labelled DataFrame (X1..X5) and append the
# target as column 'y' so features and labels can be inspected together.
feature_names = [f"X{i}" for i in range(1, 6)]
dataset = pd.DataFrame(X, columns=feature_names).assign(y=y)

# Summarise column dtypes, non-null counts and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X1      1000 non-null   float64
 1   X2      1000 non-null   float64
 2   X3      1000 non-null   float64
 3   X4      1000 non-null   float64
 4   X5      1000 non-null   float64
 5   y       1000 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 47.0 KB


In [17]:
# Preview the first five rows of the assembled frame.
dataset.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,y
0,2.501284,-0.159155,0.672438,3.469991,0.949268,0
1,2.203247,-0.331271,0.794319,3.259963,0.832451,0
2,-1.524573,-0.870737,1.004304,-1.028624,-0.717383,1
3,1.801498,3.106336,1.490633,-0.297404,-0.607484,0
4,-0.125146,0.987915,0.880293,-0.937299,-0.626822,0


In [18]:
# Hold out a fixed fraction of the rows for validation; the seed keeps the
# partition identical from run to run.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
)

In [19]:
# Sanity-check the split: report the feature/label shapes of each partition.
for split_name, features, labels in (("Train", X_train, y_train), ("Val", X_val, y_val)):
    print(f"{split_name} X:{features.shape} Y:{labels.shape}")

Train X:(800, 5) Y:(800,)
Val X:(200, 5) Y:(200,)


## b) Grid search over SVM hyperparameters (C, gamma, kernel)

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Hyperparameter grid for the SVM.
#
# C applies to every kernel, but gamma only parameterises the 'poly' and
# 'rbf' kernels; SVC ignores it when kernel='linear'. Crossing gamma with the
# linear kernel would therefore cross-validate the very same model once per
# gamma value, so the grid is split into per-kernel sub-grids
# (3 linear + 18 poly/rbf = 21 candidates instead of 27).
c_values = [0.1, 1, 10]
gamma_values = [0.1, 0.01, 0.001]

param_grid = [
    {'kernel': ['linear'], 'C': c_values},
    {'kernel': ['poly', 'rbf'], 'C': c_values, 'gamma': gamma_values},
]

# Base estimator; GridSearchCV sets the candidate parameters on it.
svm_model = SVC()

# Exhaustive search with 5-fold cross-validation on the training split only;
# the validation split is not touched here.
grid_search = GridSearchCV(svm_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Best Score: 0.8675


## d) Compare SVM kernels and a soft-voting ensemble on the validation set

In [21]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline
from sklearn.multiclass import OneVsRestClassifier


# Kernels to compare on the validation split.
kernels = ['linear', 'poly', 'rbf']

# Fit one SVC per kernel on the training split. Only `kernel` is set, so all
# other hyperparameters stay at the SVC defaults (the grid-search result is
# not reused here). `fit` returns the fitted estimator, which lets the list
# be built directly without a throwaway SVC() instance and without rebinding
# the notebook-level `svm_model` name.
svm_models = [SVC(kernel=kernel).fit(X_train, y_train) for kernel in kernels]

# Validation-set predictions, in the same order as `kernels`.
predictions = [model.predict(X_val) for model in svm_models]

# Per-kernel precision / recall / F1 report.
for kernel, prediction in zip(kernels, predictions):
    print(f"Kernel: {kernel}")
    print(classification_report(y_val, prediction))
    print()

Kernel: linear
              precision    recall  f1-score   support

           0       0.90      0.19      0.31       102
           1       0.54      0.98      0.69        98

    accuracy                           0.57       200
   macro avg       0.72      0.58      0.50       200
weighted avg       0.72      0.57      0.50       200


Kernel: poly
              precision    recall  f1-score   support

           0       0.94      0.44      0.60       102
           1       0.62      0.97      0.76        98

    accuracy                           0.70       200
   macro avg       0.78      0.71      0.68       200
weighted avg       0.78      0.70      0.68       200


Kernel: rbf
              precision    recall  f1-score   support

           0       0.89      0.75      0.81       102
           1       0.77      0.91      0.84        98

    accuracy                           0.82       200
   macro avg       0.83      0.83      0.82       200
weighted avg       0.84      0.8

In [22]:
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier


# Seed for the SVCs below. With probability=True, SVC fits its probability
# estimates through an internal cross-validation that shuffles the data, so
# without a fixed random_state the soft-voting results change between runs.
PROBA_SEED = 42

# One probabilistic SVM per kernel; soft voting needs predict_proba, hence
# probability=True on every member.
svm_linear = SVC(kernel='linear', probability=True, random_state=PROBA_SEED)
svm_rbf = SVC(kernel='rbf', probability=True, random_state=PROBA_SEED)
svm_poly = SVC(kernel='poly', degree=3, probability=True, random_state=PROBA_SEED)

# Standardise the features, then combine the three kernels by soft voting.
# The scaler sits inside the pipeline, so it is fitted on the training split
# only and merely applied to the validation split.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mkl', VotingClassifier(
        estimators=[
            ('linear', svm_linear),
            ('rbf', svm_rbf),
            ('poly', svm_poly),
        ],
        voting='soft',
    )),
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_val)

# Precision / recall / F1 of the ensemble on the held-out validation split.
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.75      0.81       102
           1       0.78      0.89      0.83        98

    accuracy                           0.82       200
   macro avg       0.83      0.82      0.82       200
weighted avg       0.83      0.82      0.82       200

