In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, recall_score

### 1. SVM with Submanifold Minimization

In [2]:
class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # Map labels to +1 and -1
        y_ = np.where(y <= 0, -1, 1)

        # Initialize weights and bias
        self.w = np.zeros(n_features)
        self.b = 0

        # Submanifold Minimization Algorithm
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y_[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    # Update weights for correctly classified points
                    self.w -= self.lr * (2 * self.lambda_param * self.w)
                else:
                    # Update weights and bias for misclassified points
                    self.w -= self.lr * (
                        2 * self.lambda_param * self.w - np.dot(x_i, y_[idx])
                    )
                    self.b -= self.lr * y_[idx]

    def predict(self, X):
        approx = np.dot(X, self.w) - self.b
        return np.sign(approx)


In [3]:
X_pos = np.array([[2.0, 2.2], [2.7, 2.5], [2.3, 2.0], [3.1, 2.3], [2.5, 2.4], [2.8, 2.7]])
y_pos = np.ones(len(X_pos))

X_neg = np.array([[1.6, 1.5], [2.0, 1.9], [2.1, 1.8], [1.7, 1.6], [1.8, 1.7], [2.0, 1.6]])
y_neg = -np.ones(len(X_neg))

# Combining the  positive and negative classes
X = np.vstack((X_pos, X_neg))
y = np.hstack((y_pos, y_neg))

clf = SVM(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
clf.fit(X, y)

# Model parameters
print("Weights:", clf.w)
print("Bias:", clf.b)

# Predictions
predictions = clf.predict(X)
print("Predictions:", predictions)

Weights: [0.35074838 0.51297922]
Bias: 1.313999999999966
Predictions: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [4]:
accuracy = accuracy_score(y, predictions)
f1 = f1_score(y, predictions)
recall = recall_score(y, predictions)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Recall:", recall)

Accuracy: 0.5
F1 Score: 0.6666666666666666
Recall: 1.0


In [None]:
#### code basics

In [None]:
from sklearn.svm import SVC

model = SVC(kernel="rbf")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

model.n_iter_

from sklearn.svm import SVC

model = SVC(kernel="linear")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

model.n_iter_



# Import necessary libraries
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

# Step 1: Import the dataset and display the first few rows
df = pd.read_csv("breast_cancer_data.csv")

print("Number of rows and columns: ", df.shape)

print("First few rows of the dataset:")
df.head()

df = pd.get_dummies(df, columns=['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'breast', 'breast-quad', 'irradiat'], drop_first=True)
print("First few rows of the encoded dataset:")
df.head()

### Task 2: Logistic Regression Model Without Handling Class Imbalance

1. Split the dataset into training and test sets.
2. Train a Logistic Regression model using the training data.
3. Evaluate the model using a classification report, including precision, recall, and F1-score.

# Step 1: Split the dataset into training and test sets
X = df.drop('class', axis=1)
y = df['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Step 2: Train a Logistic Regression model using the training data
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

# Step 3: Evaluate the model using a classification report, including precision, recall, and F1-score
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report Without Handling Class Imbalance:")
print(report)



### Task 3: Handling Class Imbalance Using Undersampling

1. Apply undersampling to balance the classes in the training data.
2. Train a Logistic Regression model using the undersampled training data.
3. Evaluate the model using a classification report, including precision, recall, and F1-score.

In [None]:
# Step 1: Apply undersampling to balance the classes in the training data
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# Step 2: Train a Logistic Regression model using the undersampled training data
model_rus = LogisticRegression(max_iter=2000)
model_rus.fit(X_train_rus, y_train_rus)

# Step 3: Evaluate the model using a classification report, including precision, recall, and F1-score
y_pred_rus = model_rus.predict(X_test)
report_rus = classification_report(y_test, y_pred_rus)
print("Classification Report with Undersampling:")
print(report_rus)


### Task 4: Handling Class Imbalance Using SMOTE (Oversampling)

1. Apply SMOTE (Synthetic Minority Over-sampling Technique) to balance the classes in the training data.
2. Train a Logistic Regression model using the SMOTE-balanced training data.
3. Evaluate the model using a classification report, including precision, recall, and F1-score.

In [None]:
# Step 1: Apply SMOTE to balance the classes in the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


# Step 2: Train a Logistic Regression model using the SMOTE-balanced training data
model_smote = LogisticRegression(max_iter=200)
model_smote.fit(X_train_smote, y_train_smote)


# Step 3: Evaluate the model using a classification report, including precision, recall, and F1-score
y_pred_smote = model_smote.predict(X_test)
report_smote = classification_report(y_test, y_pred_smote)
print("Classification Report with SMOTE:")
print(report_smote)

### Task 5: Handling Class Imbalance Using SMOTE Tomek Links

1. Apply SMOTE Tomek Links to balance the classes in the training data.
2. Train a Logistic Regression model using the SMOTE Tomek Links-balanced training data.
3. Evaluate the model using a classification report, including precision, recall, and F1-score.

In [None]:
# Step 1: Apply SMOTE Tomek Links to balance the classes in the training data
smt = SMOTETomek(random_state=42)
X_tomek, y_tomek = smt.fit_resample(X_train, y_train)

# Step 2: Train a Logistic Regression model using the SMOTE Tomek Links-balanced training data
model_tomek = LogisticRegression(max_iter=2000)
model_tomek.fit(X_tomek, y_tomek)

# Step 3: Evaluate the model using a classification report, including precision, recall, and F1-score
y_pred_tomek = model_tomek.predict(X_test)
report_tomek = classification_report(y_test, y_pred_tomek)
print("Classification Report with SMOTE Tomek Links:")
print(report_tomek)

In [None]:

# Implementing SVM from Scratch using Submanifold Minimization Algorithm

class SVM:
    def __init__(self, learning_rate=0.001, lambda_param=0.01, n_iters=1000):
        self.learning_rate = learning_rate  # Step size for gradient descent
        self.lambda_param = lambda_param  # Regularization parameter
        self.n_iters = n_iters  # Number of iterations
        self.w = None  # Weights
        self.b = None  # Bias

    def fit(self, X, y):
        """
        Train the SVM model using gradient descent with hinge loss.
        """
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.b = 0

        # Gradient descent optimization
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                condition = y[idx] * (np.dot(x_i, self.w) - self.b) >= 1
                if condition:
                    # No margin violation: update weights with regularization only
                    self.w -= self.learning_rate * (2 * self.lambda_param * self.w)
                else:
                    # Margin violation: apply hinge loss gradient
                    self.w -= self.learning_rate * (2 * self.lambda_param * self.w - np.dot(x_i, y[idx]))
                    self.b -= self.learning_rate * y[idx]

    def predict(self, X):
        """
        Predict class labels.
        """
        linear_output = np.dot(X, self.w) - self.b
        return np.sign(linear_output)

# Define dataset from the image
X_pos = np.array([[2.0, 2.2], [2.7, 2.5], [2.3, 2.0], [3.1, 2.3], [2.5, 2.4], [2.8, 2.7]])
y_pos = np.ones((X_pos.shape[0],))  # Labels for class +1

X_neg = np.array([[1.6, 1.5], [2.0, 1.9], [2.1, 1.8], [1.7, 1.6], [1.8, 1.7], [2.0, 1.6]])
y_neg = -np.ones((X_neg.shape[0],))  # Labels for class -1

# Combine dataset
X = np.vstack((X_pos, X_neg))
y = np.hstack((y_pos, y_neg))

# Train SVM model
svm = SVM(learning_rate=0.001, lambda_param=0.01, n_iters=1000)
svm.fit(X, y)

# Make predictions
y_pred = svm.predict(X)

# Compute evaluation metrics manually
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred) * 100

def precision(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == -1) & (y_pred == 1))
    return tp / (tp + fp) if (tp + fp) > 0 else 0

def recall(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == -1))
    return tp / (tp + fn) if (tp + fn) > 0 else 0

def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * (p * r) / (p + r) if (p + r) > 0 else 0

# Compute and store results
accuracy_val = accuracy(y, y_pred)
recall_val = recall(y, y_pred)
f1_val = f1_score(y, y_pred)

metrics_df = pd.DataFrame({
    "Metric": ["Accuracy", "Recall", "F1 Score"],
    "Value": [accuracy_val, recall_val, f1_val]
})

import ace_tools as tools
tools.display_dataframe_to_user(name="SVM Evaluation Metrics (No Sklearn)", dataframe=metrics_df)

# Plot the dataset and decision boundary
plt.figure(figsize=(8, 6))
plt.scatter(X_pos[:, 0], X_pos[:, 1], color='blue', label="Class +1")
plt.scatter(X_neg[:, 0], X_neg[:, 1], color='red', label="Class -1")

# Compute decision boundary
x_vals = np.linspace(1.5, 3.5, 100)
y_vals = -(svm.w[0] * x_vals + svm.b) / svm.w[1]
plt.plot(x_vals, y_vals, color='black', linestyle='--', label="Decision Boundary")

plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("SVM Decision Boundary (No Sklearn)")
plt.legend()
plt.grid(True)
plt.show()





# Re-import necessary libraries after execution state reset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, recall_score

# Define the dataset from the provided image
X_pos = np.array([[2.0, 2.2], [2.7, 2.5], [2.3, 2.0], [3.1, 2.3], [2.5, 2.4], [2.8, 2.7]])
y_pos = np.ones((X_pos.shape[0],))  # Labels for class +1

X_neg = np.array([[1.6, 1.5], [2.0, 1.9], [2.1, 1.8], [1.7, 1.6], [1.8, 1.7], [2.0, 1.6]])
y_neg = -np.ones((X_neg.shape[0],))  # Labels for class -1

# Combine the dataset
X = np.vstack((X_pos, X_neg))
y = np.hstack((y_pos, y_neg))

# Train an SVM classifier using a linear kernel
svm_model = SVC(kernel='linear')  
svm_model.fit(X, y)

# Make predictions
y_pred = svm_model.predict(X)

# Compute evaluation metrics
accuracy = accuracy_score(y, y_pred)
f1 = f1_score(y, y_pred)
recall = recall_score(y, y_pred)

# Display results
metrics_df = pd.DataFrame({
    "Metric": ["Accuracy", "F1 Score", "Recall"],
    "Value": [accuracy, f1, recall]
})

import ace_tools as tools
tools.display_dataframe_to_user(name="SVM Evaluation Metrics", dataframe=metrics_df)

# Plot the dataset and decision boundary
plt.figure(figsize=(8, 6))
plt.scatter(X_pos[:, 0], X_pos[:, 1], color='blue', label="Class +1")
plt.scatter(X_neg[:, 0], X_neg[:, 1], color='red', label="Class -1")

# Plot decision boundary
w = svm_model.coef_[0]
b = svm_model.intercept_[0]
x_vals = np.linspace(1.5, 3.5, 100)
y_vals = -(w[0] * x_vals + b) / w[1]
plt.plot(x_vals, y_vals, color='black', linestyle='--', label="Decision Boundary")

plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("SVM Decision Boundary")
plt.legend()
plt.grid(True)
plt.show()








######### Using Algorithm in Note

import numpy as np

class SubmanifoldSVM:
    def __init__(self, C=1.0, tol=1e-5, max_iter=1000):
        self.C = C  # Regularization parameter
        self.tol = tol  # Tolerance for stopping criteria
        self.max_iter = max_iter  # Maximum iterations
        self.w = None  # Weight vector
        self.b = None  # Bias term

    def fit(self, X, y):
        """
        Train SVM using the Submanifold Minimization Algorithm.
        """
        n_samples, n_features = X.shape

        # Define matrices for quadratic programming (QP)
        C_mat = np.eye(n_features + 1)  # Identity matrix for regularization
        C_mat[-1, -1] = 0  # No regularization for bias term
        c_vec = np.zeros(n_features + 1)  # Zero initialization

        # Define inequality constraints: y * (Xw + b) >= 1
        A = np.hstack((X * y[:, np.newaxis], y[:, np.newaxis]))  # Constraint matrix
        a = np.ones(n_samples)  # Constraint vector

        # Define bound constraints: 0 <= alpha <= C
        B = np.vstack((np.eye(n_samples), -np.eye(n_samples)))  # Bounds for Lagrange multipliers
        b = np.hstack((self.C * np.ones(n_samples), np.zeros(n_samples)))  # Limits for alphas

        # Initialize w and b
        x = np.zeros(n_features + 1)  # Initial guess for [w, b]

        # Step 1: Initialize active constraints set
        K0 = np.where((B @ x - b) == 0)[0]

        # Iterative Optimization Process
        for _ in range(self.max_iter):
            while True:
                # Step 2: Solve quadratic problem with active constraints
                A_tilde = np.vstack((A, B[K0]))  # Active constraints
                a_tilde = np.hstack((a, b[K0]))

                # Solve for x_star using normal equation
                QP_mat = np.block([[C_mat, A_tilde.T], [A_tilde, np.zeros((A_tilde.shape[0], A_tilde.shape[0]))]])
                rhs = np.hstack((c_vec, a_tilde))

                try:
                    solution = np.linalg.solve(QP_mat, rhs)
                    x_star, nu_star = solution[: n_features + 1], solution[n_features + 1 :]
                except np.linalg.LinAlgError:
                    break  # If singular, break

                # Step 3: Check convergence
                if np.dot(x_star, x_star) >= np.dot(x, x):
                    break

                # Step 4: Compute step size
                step_size = np.max([mu for mu in np.linspace(0, 1, 100) if np.all(B @ (x + mu * (x_star - x)) <= b)])
                x = x + step_size * (x_star - x)

                # Step 5: Update active constraint set
                K0 = np.where((B @ x - b) == 0)[0]

            # Step 6: Remove negative constraints
            if np.all(nu_star >= 0):
                break

            K0 = np.delete(K0, np.where(nu_star < 0)[0])

        # Extract weights and bias
        self.w, self.b = x[:-1], x[-1]

    def predict(self, X):
        """
        Predict using trained SVM model.
        """
        return np.sign(np.dot(X, self.w) + self.b)

# Define dataset from the image
X_pos = np.array([[2.0, 2.2], [2.7, 2.5], [2.3, 2.0], [3.1, 2.3], [2.5, 2.4], [2.8, 2.7]])
y_pos = np.ones((X_pos.shape[0],))  # Labels for class +1

X_neg = np.array([[1.6, 1.5], [2.0, 1.9], [2.1, 1.8], [1.7, 1.6], [1.8, 1.7], [2.0, 1.6]])
y_neg = -np.ones((X_neg.shape[0],))  # Labels for class -1

# Combine dataset
X = np.vstack((X_pos, X_neg))
y = np.hstack((y_pos, y_neg))

# Train SVM model using Submanifold Minimization Algorithm
svm = SubmanifoldSVM(C=1.0, max_iter=1000)
svm.fit(X, y)

# Make predictions
y_pred = svm.predict(X)

# Compute evaluation metrics manually
def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred) * 100

def precision(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == -1) & (y_pred == 1))
    return tp / (tp + fp) if (tp + fp) > 0 else 0

def recall(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == -1))
    return tp / (tp + fn) if (tp + fn) > 0 else 0

def f1_score(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * (p * r) / (p + r) if (p + r) > 0 else 0

# Compute and store results
accuracy_val = accuracy(y, y_pred)
recall_val = recall(y, y_pred)
f1_val = f1_score(y, y_pred)

metrics_df = pd.DataFrame({
    "Metric": ["Accuracy", "Recall", "F1 Score"],
    "Value": [accuracy_val, recall_val, f1_val]
})

import ace_tools as tools
tools.display_dataframe_to_user(name="SVM Evaluation Metrics (Submanifold Minimization)", dataframe=metrics_df)

# Plot the dataset and decision boundary
plt.figure(figsize=(8, 6))
plt.scatter(X_pos[:, 0], X_pos[:, 1], color='blue', label="Class +1")
plt.scatter(X_neg[:, 0], X_neg[:, 1], color='red', label="Class -1")

# Compute decision boundary
x_vals = np.linspace(1.5, 3.5, 100)
y_vals = -(svm.w[0] * x_vals + svm.b) / svm.w[1]
plt.plot(x_vals, y_vals, color='black', linestyle='--', label="Decision Boundary")

plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.title("SVM Decision Boundary (Submanifold Minimization Algorithm)")
plt.legend()
plt.grid(True)
plt.show()



In this part, you will be using the credit card fraud detection dataset from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud to train and test a Support Vector Machine (SVM) classifier. Your task
is to:

1. Download the data and split the dataset into training and testing sets (80-20 split) in a stratified manner to take care of the class imbalance. You need to code the stratified splitting function from scratch. *sklearn is not allowed for this part*
1. Implement the basic Pegasos Algorithm from the paper https://home.ttic.edu/~nati/Publications/PegasosMPB.pdf. This is in page 5, Fig 1.
1. Implement the mini-batch Pegasos algorithm from the paper https://home.ttic.edu/~nati/Publications/PegasosMPB.pdf. Do not forget the projection step. This is in page 6, Fig 2.
1. Implement the dual coordinate descent method for SVM’s from the paper https://icml.cc/Conferences/2008/papers/166.pdf. This is Algorithm 1 in the paper.
1. Report a final accuracy on the test set for all 3 approches.

In [None]:
### Write your code here
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score,roc_auc_score

def StratifiedSplit(X, y, test_size=0.3):
    unique_classes = np.unique(y)
    
    train_indices, test_indices = [], []
    
    for class_label in unique_classes:
        # Find indices of samples with the current class label
        class_indices = np.where(y == class_label)[0]
        np.random.shuffle(class_indices)
        
        # Calculate the number of samples for the test set
        num_test_samples = int(len(class_indices) * test_size)
        
        # Split indices into train and test sets
        train_indices.extend(class_indices[num_test_samples:])
        test_indices.extend(class_indices[:num_test_samples])
    
    # Shuffle the indices to randomize the order
    np.random.shuffle(train_indices)
    np.random.shuffle(test_indices)
    
    # Create the train and test sets based on the indices
    X_train, X_test = X[train_indices,:], X[test_indices,:]
    y_train, y_test = y[train_indices], y[test_indices]

    return X_train, X_test, y_train, y_test

# reading the file
df = pd.read_csv('creditcard.csv')
# separating into X and Y
X = df.iloc[:,:-1]
y=df.iloc[:,-1]
y = df.iloc[:,-1].values
# make the y labels as -1,1 instead of 0,1
y = np.where(y>0,y,-1)
X_train, X_test, y_train, y_test = StratifiedSplit(X.values, y, test_size=0.2)

class Pegasos:
    def __init__ (self, lamda, k, projection):
        self.lamda = lamda #lambda value
        self.k = k #number of observations to be used
        self.projection = projection #for projections
        
    def gradient(self, p): #to calculate the gradient
        return np.where(p<1,1,0)
    
    def fit(self, x, y, n_iters=1500):
        m, n = x.shape
        self.W = np.zeros(n)
        for t in range(1,n_iters+1): #iterate till max_iters
            #pick a random instance
            idx = np.random.choice(range(m), self.k, replace=False)
            lr = 1/(self.lamda*t) #get the learning rate
            x_i = x[idx] #get x_i
            y_i = y[idx] #get y_i
            prod = y_i * (x_i@self.W) #obtain the product
            #update the weights
            self.W = (1-lr*self.lamda)*self.W + \
            (lr/self.k)*(np.sum(np.multiply(y_i.reshape(-1,1),x_i)*self.gradient(prod).reshape(-1,1),axis=0))
    
    def predict(self, x):
        #transform the inputs using the weight vector
        p = x@self.W.reshape(-1,1)
        return np.sign(p) #the sign function outputs the class


class SVMDC:
    def __init__ (self, C, mode = "L1", tol=1e-3):
        self.C = C #C value
        self.mode = mode
        self.tol = tol #tolerance value to break out of the loop
    
    def partial_gradient(self,G,a,U): #to calculate the partial gradient
        if a == 0:
            return min(G,0)
        elif a == U:
            return max(G,0)
        elif (a>0) and (a<U):
            return G
    
    def fit(self, X, y,iters=100):
        m, n = X.shape
        self.w = 0 #weight matrix
        
        #SVMDC can be done in L1 and L2 modes
        if self.mode == "L1":
            Dii = 0
            U=self.C
        else:
            Dii = 1/(2*self.C)
            U=np.inf
        
        #to get the langrangian multipliers
        alpha = np.zeros(m)
        self.w = np.zeros(shape=(n)) #initialize the weight matrix
        Qii = np.sum(X**2, 1) + Dii #calculate Qii
        for t in range(iters): #iterate till max_iters
            err = 0 #calculate error to break the loop
            for i in range(m): #iterate over each instance
                Qhat = Qii[i] #get Q_bar
                G = np.multiply(np.dot(self.w,X[i,:]),y[i]) - 1 + Dii * alpha[i] #gradient of the objective function
                PG = self.partial_gradient(G,alpha[i],U) #partial gradient of the objective function
                if np.abs(G) > err: #to keep updating the error term
                    err = np.abs(G)
                
                #to find optimal solution
                if np.abs(G) > 0: 
                    alpha_new = min(max(alpha[i]-G/Qhat,0),U)
                    self.w = self.w+(np.multiply((alpha_new - alpha[i])* y[i] ,X[i,:]))
                    alpha[i] = alpha_new
            
            #stop iterating once the error fall below tolerance        
            if err<self.tol:
                break
        
    def predict(self, x):
        #project the points using the weight matrix
        p = x@self.w.reshape(-1,1)
        return np.sign(p) #the sign function tells the class which the object belong to



# Pegasos Basic
peg_basic = Pegasos(0.01,1,projection=False)
peg_basic.fit(X_train,y_train)

# Test the model
print(f'Accuracy of Basic Pegasos is {accuracy_score(y_test,peg_basic.predict(X_test))} and ROC AUC Score is {roc_auc_score(y_test,peg_basic.predict(X_test))}')

# Pegasos Batch
peg_batch = Pegasos(0.01,10,projection=False)
peg_batch.fit(X_train,y_train)
# Test the model
print(f'Accuracy of Batch Pegasos is {accuracy_score(y_test,peg_batch.predict(X_test))} and ROC AUC Score is {roc_auc_score(y_test,peg_batch.predict(X_test))}')


# Dual Coordinate Descent
svm = SVMDC(0.1)
svm.fit(X_train,y_train)
# Test the model
print(f'Accuracy of Dual Coordinate Descent SVM is {accuracy_score(y_test,svm.predict(X_test))} and ROC AUC Score is {roc_auc_score(y_test,svm.predict(X_test))}')



**NOTE:** Here, since we have a huge class imbalance in the dataset, therefore, a better approach would be to use some techniques like Oversampling, Undersampling, and SMOTE as a preprocessing step and then apply the model over the preprocessed dataset. Additionally, Pegasos and CD uses stochastic optimization approach therefore the results obtained after these techniques may differ largely for each run and for the choice of each random state. 

For the below data, fit a SVM model to dataset. Later, transform the features using PCA to 2-D and then perform SVM. Compare the performance between the two approaches. Do not forget to scale the features before applying PCA. (Use Sklearn)

In [None]:
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
data = load_wine()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.svm import SVC
svm = SVC()

svm.fit(X_train,y_train)
preds = svm.predict(X_test)
accuracy_score(y_test,preds)


#Now after doing PCA

#To scaling of the data
scalar = StandardScaler()
X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

#apply PCA
pca = PCA(n_components=2)
X_train_scaled = pca.fit_transform(X_train_scaled)
X_test_scaled = pca.transform(X_test_scaled)


svm = SVC()
svm.fit(X_train_scaled,y_train)
preds = svm.predict(X_test_scaled)
accuracy_score(y_test,preds)



For the below dataset. Try to build an ensemble model using logistic regression. You have to select subset of features and instances and fit a logistic regression model. You can take number of logistic regression models to be 10. To get final output, perform soft voting (take the average of the probabilities and then apply the thereshold)

Note: Sklearn is allowed for this part

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

column_names = [
    "pregnancies", "glucose", "blood_pressure", "skin_thickness", "insulin",
    "bmi", "diabetes_pedigree_function", "age", "outcome"
]

# Load the dataset into a pandas DataFrame
diabetes_data = pd.read_csv('diabetes.csv')

X = diabetes_data.iloc[:,:-1].values
y = diabetes_data.iloc[:,-1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Since we have 614 instances let's choose each learner model is trained using 200 instances and 4 features

models = [] 
selected_features = [] #to store features used for each learner model

for i in range(10):
    #Create a random subset of features
    subset_features = np.random.choice(range(X_train.shape[1]), size=4, replace=False) #to retrieve features for testing
    X_subset_features = X_train[:, subset_features]
    selected_features.append(subset_features)

    #Create a random subset of instances
    subset_instances = np.random.choice(range(X_train.shape[0]), size=200, replace=True)
    X_subset = X_subset_features[subset_instances, :]
    y_subset = y_train[subset_instances]

    #Create and fit a logistic regression model
    model = LogisticRegression()
    model.fit(X_subset, y_subset)

    #Append the trained model to the list
    models.append(model)


#now to create soft voting module

#make prediction on test set
probabilities = np.array([model.predict_proba(X_test[:, features]) for model, features in zip(models, selected_features)])

#calculate the mean probabilities
average_probabilities = np.mean(probabilities, axis=0)

#make final decision based on the final averaged probabilities
final_prediction = np.argmax(average_probabilities, axis=1)

#To get accuracy of the ensemble model
accuracy_score(y_test, final_prediction)

#What if  we had only used a single Log REg model instead of ensemble
single_log = LogisticRegression(max_iter=1000)
single_log.fit(X_train, y_train)
single_pred = single_log.predict(X_test)
accuracy_score(y_test, single_pred)



In [None]:

# X_pos = np.array([[2.0, 2.2],
#                   [2.7, 2.5],
#                   [2.3, 2.0],
#                   [3.1, 2.3],
#                   [2.5, 2.4],
#                   [2.8, 2.7]])

# y_pos = np.ones(X_pos.shape[0])

# X_neg = np.array([[1.6, 1.5],
#                   [2.0, 1.9],
#                   [2.1, 1.8],
#                   [1.7, 1.6],
#                   [1.8, 1.7],
#                   [2.0, 1.6]])

# y_neg = -1 * np.ones(X_neg.shape[0])

# # Combine positive and negative samples
# X = np.vstack((X_pos, X_neg))
# y = np.hstack((y_pos, y_neg))

# # Helper function for inequality-constrained submanifold minimization
# def minimize_ineq_cstr_submanifold(C, c, A, a, B, b, x_init, max_iterations=1000, tol=1e-6):
#     """
#     Implements the inequality-constrained submanifold minimization algorithm.

#     Parameters:
#         C: Quadratic term in the objective function (N x N matrix)
#         c: Linear term in the objective function (N-dimensional vector)
#         A: Equality constraint matrix (M x N matrix)
#         a: Equality constraint vector (M-dimensional vector)
#         B: Inequality constraint matrix (K x N matrix)
#         b: Inequality constraint vector (K-dimensional vector)
#         x_init: Initial guess for x (N-dimensional vector)
#         max_iterations: Maximum number of iterations
#         tol: Convergence tolerance

#     Returns:
#         x: Optimized variable vector
#     """
#     x = x_init
#     K0 = {k for k in range(B.shape[0]) if np.isclose((B @ x - b)[k], 0)}

#     while True:
#         while True:
#             A_tilde = np.vstack([A, B[list(K0), :]])
#             a_tilde = np.hstack([a, b[list(K0)]])

#             # Solve the linear system
#             system_matrix = np.block([[C, A_tilde.T], [A_tilde, np.zeros((A_tilde.shape[0], A_tilde.shape[0]))]])
#             rhs = np.hstack([c, a_tilde])
#             solution = np.linalg.solve(system_matrix, rhs)
#             x_star, nu_star = solution[:x.shape[0]], solution[x.shape[0]:]

#             # Check for improvement in the objective function
#             if (c @ x_star + 0.5 * x_star @ C @ x_star) >= (c @ x + 0.5 * x @ C @ x):
#                 break

#             # Line search
#             direction = x_star - x
#             mu = max([mu for mu in np.linspace(0, 1, 100) if np.all(B @ (x + mu * direction) - b <= 0)])
#             x = x + mu * direction
#             K0 = {k for k in range(B.shape[0]) if np.isclose((B @ x - b)[k], 0)}

#         # Check optimality of dual variables
#         if np.all(nu_star >= 0):
#             break

#         # Remove violating constraint from active set
#         violating_constraints = [k for k in K0 if nu_star[k] < 0]
#         if violating_constraints:
#             K0.remove(violating_constraints[0])

#     return x

# # SVM training using the submanifold minimization algorithm
# def learn_svm(D_train, gamma, K_func):
#     """
#     Trains an SVM using the dataset and the submanifold minimization algorithm.

#     Parameters:
#         D_train: Training data as a list of (x_i, y_i) tuples
#         gamma: Regularization parameter
#         K_func: Kernel function

#     Returns:
#         beta_0: Bias term
#         alpha: Dual variables
#     """
#     X, y = zip(*D_train)
#     X = np.array(X)
#     y = np.array(y)
#     N = len(y)

#     # Construct components for the optimization problem
#     K = np.array([[K_func(x_i, x_j) for x_j in X] for x_i in X])
#     C = np.outer(y, y) * K
#     c = np.ones(N)
#     A = y.reshape(1, -1)
#     a = np.array([0])
#     B = np.vstack([-np.eye(N), np.eye(N)])
#     b = np.hstack([np.zeros(N), gamma * np.ones(N)])

#     # Initial guess for alpha
#     N_pos = sum(y == 1)
#     N_neg = N - N_pos
#     alpha_init = np.where(y == 1, gamma / N_pos, gamma / N_neg)

#     # Minimize the constrained problem
#     alpha = minimize_ineq_cstr_submanifold(C, c, A, a, B, b, alpha_init)

#     # Compute the bias term
#     support_indices = np.where(alpha > 1e-6)[0]
#     beta_0 = np.mean([y[n] - sum(alpha[m] * y[m] * K[m, n] for m in support_indices) for n in support_indices])

#     return beta_0, alpha

# # Example usage
# D_train = [(X[i], y[i]) for i in range(len(y))]

# def linear_kernel(x1, x2):
#     return np.dot(x1, x2)

# gamma = 1.0
# beta_0, alpha = learn_svm(D_train, gamma, linear_kernel)

# print("Bias (beta_0):", beta_0)
# print("Alpha coefficients:", alpha)


### 2. Imbalanced Classification with Sampling Techniques and MLP

In [5]:
data = pd.read_csv('creditcard.csv')
data.head()
from sklearn.model_selection import train_test_split
X = data.drop('Class', axis=1)
y = data['Class']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape

(227845, 30)

#### a.) Applying Smote Oversampling and Rnadom Undersampling

In [7]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# SMOTE Oversampling
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

#Random Undersampling
undersampler = RandomUnderSampler(random_state=42)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

X_train_smote.shape, X_train_under.shape


((454902, 30), (788, 30))

### b.)

In [8]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from tqdm import trange
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Custom MLP Class
class CustomMLP(nn.Module):
    def __init__(self, input_size, hidden_layers):
        super(CustomMLP, self).__init__()
        self.layers = nn.ModuleList()
        self.activation_fn = nn.ReLU()

        # Hidden layers
        prev_size = input_size
        for hidden_size in hidden_layers:
            self.layers.append(nn.Linear(prev_size, hidden_size))
            prev_size = hidden_size

        # Output layer
        self.output = nn.Linear(prev_size, 1)

    def forward(self, x):
        for layer in self.layers:
            x = self.activation_fn(layer(x))
        return torch.sigmoid(self.output(x))

In [9]:
# Dataset Preparation
def prepare_dataloader(X, y, batch_size=64):
    dataset = TensorDataset(torch.tensor(X.values, dtype=torch.float32),
                            torch.tensor(y.values, dtype=torch.float32).unsqueeze(1))
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)



# Model Initialization
input_size = X_train.shape[1]
hidden_layers = [64, 32, 16]
model = CustomMLP(input_size, hidden_layers)
device = "cpu"
model.to(device)

# Optimizer and Loss Function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [12]:
# Training and test Functions
def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    train_loss = 0
    for X_batch, y_batch in dataloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Forward pass
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * X_batch.size(0)

    return train_loss / len(dataloader.dataset)

def test_epoch(model, dataloader, criterion, device):
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            test_loss += loss.item() * X_batch.size(0)

    return test_loss / len(dataloader.dataset)


n_epochs = 50
batch_size = 64

datasets = {
    "Original": (X_train, y_train),
    "SMOTE": (X_train_smote, y_train_smote),
    "Undersampled": (X_train_under, y_train_under)
}

results = {}

for name, (X_data, y_data) in datasets.items():
    print(f"\nTraining on {name} Dataset...")
    train_loader = prepare_dataloader(X_data, y_data, batch_size)
    test_loader = prepare_dataloader(X_test, y_test, batch_size)
    
    model = CustomMLP(input_size, hidden_layers).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()
    
    train_losses = []
    test_losses = []

    for epoch in trange(n_epochs, desc="Epochs"):
        train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
        train_losses.append(train_loss)

        test_loss = test_epoch(model, test_loader, criterion, device)
        test_losses.append(test_loss)

        if epoch % 10 == 0:
            print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {train_loss:.4f} - Test Loss: {test_loss:.4f}")

    results[name] = {"train_losses": train_losses, "test_losses": test_losses}



Training on Original Dataset...


Epochs:   2%|▏         | 1/50 [00:30<24:55, 30.52s/it]

Epoch 1/50 - Train Loss: 0.1985 - Test Loss: 0.1720


Epochs:  22%|██▏       | 11/50 [04:18<14:36, 22.48s/it]

Epoch 11/50 - Train Loss: 0.1729 - Test Loss: 0.1720


Epochs:  42%|████▏     | 21/50 [2:30:00<59:04, 122.23s/it]    

Epoch 21/50 - Train Loss: 0.1729 - Test Loss: 0.1720


Epochs:  62%|██████▏   | 31/50 [2:33:02<06:37, 20.92s/it] 

Epoch 31/50 - Train Loss: 0.1729 - Test Loss: 0.1720


Epochs:  82%|████████▏ | 41/50 [2:36:10<03:02, 20.28s/it]

Epoch 41/50 - Train Loss: 0.1729 - Test Loss: 0.1720


Epochs: 100%|██████████| 50/50 [2:39:32<00:00, 191.44s/it]



Training on SMOTE Dataset...


Epochs:   2%|▏         | 1/50 [00:31<25:38, 31.39s/it]

Epoch 1/50 - Train Loss: 49.6609 - Test Loss: 99.7009


Epochs:  22%|██▏       | 11/50 [09:49<50:57, 78.41s/it]

Epoch 11/50 - Train Loss: 49.8580 - Test Loss: 99.6393


Epochs:  42%|████▏     | 21/50 [20:27<28:35, 59.14s/it]

Epoch 21/50 - Train Loss: 49.8167 - Test Loss: 0.1756


Epochs:  62%|██████▏   | 31/50 [36:28<45:31, 143.76s/it]

Epoch 31/50 - Train Loss: 49.7608 - Test Loss: 0.1756


Epochs:  82%|████████▏ | 41/50 [45:40<08:54, 59.43s/it] 

Epoch 41/50 - Train Loss: 49.2281 - Test Loss: 98.6588


Epochs: 100%|██████████| 50/50 [1:00:09<00:00, 72.18s/it]



Training on Undersampled Dataset...


Epochs:   2%|▏         | 1/50 [00:02<02:00,  2.47s/it]

Epoch 1/50 - Train Loss: 47.1822 - Test Loss: 99.1081


Epochs:  22%|██▏       | 11/50 [00:32<01:53,  2.91s/it]

Epoch 11/50 - Train Loss: 49.7696 - Test Loss: 99.1953


Epochs:  42%|████▏     | 21/50 [01:14<02:16,  4.70s/it]

Epoch 21/50 - Train Loss: 49.5472 - Test Loss: 98.9971


Epochs:  62%|██████▏   | 31/50 [01:45<00:56,  2.95s/it]

Epoch 31/50 - Train Loss: 49.8874 - Test Loss: 99.5621


Epochs:  82%|████████▏ | 41/50 [04:24<00:52,  5.86s/it]

Epoch 41/50 - Train Loss: 49.8861 - Test Loss: 99.5444


Epochs: 100%|██████████| 50/50 [04:48<00:00,  5.77s/it]


In [15]:
# Function to evaluate a model
def evaluate_model(model, X_test, y_test, device):
    model.eval()
    with torch.no_grad():
        X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32).to(device)
        y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1).to(device)
        
        predictions = model(X_test_tensor).cpu().numpy()
        predictions = (predictions >= 0.5).astype(int)  

        accuracy = accuracy_score(y_test, predictions)
        recall = recall_score(y_test, predictions)
        f1 = f1_score(y_test, predictions)
        
        return accuracy, recall, f1

# Evaluating the models for each dataset
metrics = {}
for name, (X_data, y_data) in datasets.items():
    print(f"Evaluating model trained on {name} dataset...")
    accuracy, recall, f1 = evaluate_model(model, X_test, y_test, device)
    metrics[name] = {"Accuracy": accuracy, "Recall": recall, "F1-Score": f1}

# Display metrics
for dataset, values in metrics.items():
    print(f"\nMetrics for {dataset} Dataset:")
    for metric, value in values.items():
        print(f"{metric}: {value:.4f}")


Evaluating model trained on Original dataset...
Evaluating model trained on SMOTE dataset...
Evaluating model trained on Undersampled dataset...

Metrics for Original Dataset:
Accuracy: 0.0018
Recall: 1.0000
F1-Score: 0.0034

Metrics for SMOTE Dataset:
Accuracy: 0.0018
Recall: 1.0000
F1-Score: 0.0034

Metrics for Undersampled Dataset:
Accuracy: 0.0018
Recall: 1.0000
F1-Score: 0.0034


### Comparing Results


 - Original Dataset: Has less loss than the sampled datasets
 - SMOTE Oversamplinga: low accuracy
 - Random Undersampling: low accuracy


TRADEOFFS
 - Accuracy decreasees with SMOTE and undersampling due to changes in class distributions.
