# Lab 2

In [None]:
%pip install plotly

In [7]:
import numpy as np
import plotly.graph_objects as go
from sklearn.datasets import make_classification, make_circles, make_moons, make_blobs

## Utility Functions

In [17]:
def draw_plot(X, y, title=""):
    """Show a 600x600 scatter plot of the first two columns of ``X``.

    Args:
        X: 2-D array-like; column 0 is drawn on the horizontal axis ("X1")
            and column 1 on the vertical axis ("X2").
        y: Per-point values passed to the marker colour.
        title: Text shown above the figure.
    """
    points = go.Scatter(
        x=X[:, 0],
        y=X[:, 1],
        mode='markers',
        marker={"color": y},
    )
    layout_options = {
        "title": title,
        "xaxis_title": "X1",
        "yaxis_title": "X2",
        "showlegend": False,
        "width": 600,
        "height": 600,
    }

    fig = go.Figure(data=points)
    fig.update_layout(**layout_options)
    fig.show()

## Task 1

In [26]:
# Classifier names grouped two by two.
# NOTE(review): presumably the algorithm pairs to be contrasted in the later
# tasks -- confirm against the lab brief. The list names "Random Forest",
# while Task 2 below builds a dataset for "Decision Rules".
pairs = [
    ("Naive Bayes", "Decision Tree"),
    ("Logistic Regression", "Random Forest"),
    ("SVM", "Nearest Neighbors")
]

## Task 2

### Synthetic Data for Naive Bayes

In [64]:
def synthetic_dataset_nb(n_samples: int = 500, noise_scale: float = 0.5, random_state: int = 1505) -> tuple[np.ndarray, np.ndarray]:
    """
    Generate a synthetic 2-D, four-class dataset for Naive Bayes.

    Description:
    - Four Gaussian blobs centred on the diagonal at (2, 2), (4, 4), (6, 6), (8, 8)
    - Within each class the two features are drawn independently, which is the
      class-conditional independence Naive Bayes assumes
    - Because the centres lie on a diagonal, the correlation measured over the
      pooled data is strongly positive even though the per-class draws are
      independent
    - No redundant features

    Args:
        n_samples: Total number of points. When it is not a multiple of the
            number of classes, the first classes receive one extra point each
            so that exactly ``n_samples`` rows are returned.
        noise_scale: Standard deviation of each feature around its class centre.
        random_state: Seed for the private random generator.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_samples, 2)`` and ``y`` holds the
        integer class label (0-3) of each row, ordered class by class.
    """
    # A private legacy generator produces the same stream as
    # np.random.seed(random_state) followed by np.random.* calls, but does not
    # reseed the global NumPy RNG behind the caller's back.
    rng = np.random.RandomState(random_state)

    # Create diagonal centers
    centers = np.array([
        [2, 2],   # Class 0
        [4, 4],   # Class 1
        [6, 6],   # Class 2
        [8, 8]    # Class 3
    ])

    # Split n_samples over the classes; the first `remainder` classes get one more.
    base, remainder = divmod(n_samples, len(centers))

    X_list = []
    y_list = []

    for i, (cx, cy) in enumerate(centers):
        n_class = base + (1 if i < remainder else 0)
        # Two separate draws keep the features independent within the class.
        x1 = rng.normal(cx, noise_scale, n_class)
        x2 = rng.normal(cy, noise_scale, n_class)
        X_list.append(np.column_stack([x1, x2]))
        y_list.append(np.full(n_class, i))

    X = np.vstack(X_list)
    y = np.concatenate(y_list)

    return X, y
    

### Synthetic Data for Logistic Regression

In [68]:
def synthetic_dataset_lr(n_samples: int = 500, correlation: float = 0.3, noise_scale: float = 0.5, random_state: int = 1505) -> tuple[np.ndarray, np.ndarray]:
    """Generate a synthetic 2-D, four-class dataset for Logistic Regression.

    Description:
    - No feature independence assumption: the features are linearly mixed so
      that they become correlated
    - Classes sit around the corners of a square, so linear boundaries can
      separate them (up to the overlap introduced by the noise)

    Args:
        n_samples: Total number of points. When it is not a multiple of the
            number of classes, the first classes receive one extra point each
            so that exactly ``n_samples`` rows are returned.
        correlation: Off-diagonal entry (rho) of the 2x2 unit-variance
            covariance matrix whose Cholesky factor is applied to the data.
        noise_scale: Standard deviation of the extra Gaussian noise added on
            top of the fixed 0.5 spread of each class.
        random_state: Seed for the private random generator.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_samples, 2)`` and ``y`` holds the
        integer class label (0-3) of each row, ordered class by class.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix is not positive
            definite (e.g. ``abs(correlation) >= 1``).
    """
    # A private legacy generator produces the same stream as
    # np.random.seed(random_state) followed by np.random.* calls, but does not
    # reseed the global NumPy RNG behind the caller's back.
    rng = np.random.RandomState(random_state)

    # Create class means (arranged in a square)
    class_means = np.array([
        [2, 2],     # Class 0: top-right
        [-2, 2],    # Class 1: top-left
        [-2, -2],   # Class 2: bottom-left
        [2, -2]     # Class 3: bottom-right
    ])

    # Split n_samples over the classes; the first `remainder` classes get one more.
    base, remainder = divmod(n_samples, len(class_means))

    # Create data for each class
    X_list = []
    y_list = []

    for class_idx, mean in enumerate(class_means):
        n_class = base + (1 if class_idx < remainder else 0)
        # Generate points around each class mean with small variance
        X_class = rng.normal(loc=mean, scale=0.5, size=(n_class, 2))
        # Add some noise to make the classes less separable; the effective
        # per-feature spread is sqrt(0.5**2 + noise_scale**2).
        X_class += rng.normal(scale=noise_scale, size=X_class.shape)
        X_list.append(X_class)
        y_list.append(np.full(n_class, class_idx))

    # Combine all classes
    X = np.vstack(X_list)
    y = np.concatenate(y_list)

    # Create covariance matrix
    cov_matrix = np.array([
        [1.0, correlation],  # Variance of X1 = 1, Cov(X1,X2) = rho
        [correlation, 1.0]   # Cov(X1,X2) = rho, Variance of X2 = 1
    ])

    # Apply Cholesky decomposition to get transformation matrix
    L = np.linalg.cholesky(cov_matrix)

    # Apply the transformation to add correlation. It keeps x1 and maps x2 to
    # rho * x1 + sqrt(1 - rho**2) * x2; since it acts on the whole dataset,
    # the class means are moved as well.
    X_correlated = X @ L.T

    return X_correlated, y

### Synthetic Data for Support Vector Machine (Linear Kernel)

In [69]:
def synthetic_dataset_svm(n_samples: int = 500, correlation: float = 0.5, noise_scale: float = 0.4, random_state: int = 1505) -> tuple[np.ndarray, np.ndarray]:
    """Generate a synthetic 2-D, four-class dataset for a linear-kernel SVM.

    Description:
    - No feature independence assumption: the features are linearly mixed so
      that they become correlated
    - Classes sit around the corners of a square, so linear boundaries can
      separate them
    - A linear SVM does not handle overlapping classes well, so the default
      ``noise_scale`` is kept small

    Args:
        n_samples: Total number of points. When it is not a multiple of the
            number of classes, the first classes receive one extra point each
            so that exactly ``n_samples`` rows are returned.
        correlation: Off-diagonal entry (rho) of the 2x2 unit-variance
            covariance matrix whose Cholesky factor is applied to the data.
        noise_scale: Standard deviation of the extra Gaussian noise added on
            top of the fixed 0.5 spread of each class.
        random_state: Seed for the private random generator.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_samples, 2)`` and ``y`` holds the
        integer class label (0-3) of each row, ordered class by class.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix is not positive
            definite (e.g. ``abs(correlation) >= 1``).
    """
    # A private legacy generator produces the same stream as
    # np.random.seed(random_state) followed by np.random.* calls, but does not
    # reseed the global NumPy RNG behind the caller's back.
    rng = np.random.RandomState(random_state)

    # Create class means (arranged in a square)
    class_means = np.array([
        [2, 2],     # Class 0: top-right
        [-2, 2],    # Class 1: top-left
        [-2, -2],   # Class 2: bottom-left
        [2, -2]     # Class 3: bottom-right
    ])

    # Split n_samples over the classes; the first `remainder` classes get one more.
    base, remainder = divmod(n_samples, len(class_means))

    # Create data for each class
    X_list = []
    y_list = []

    for class_idx, mean in enumerate(class_means):
        n_class = base + (1 if class_idx < remainder else 0)
        # Generate points around each class mean with small variance
        X_class = rng.normal(loc=mean, scale=0.5, size=(n_class, 2))
        # Add some noise to make the classes less separable; the effective
        # per-feature spread is sqrt(0.5**2 + noise_scale**2).
        X_class += rng.normal(scale=noise_scale, size=X_class.shape)
        X_list.append(X_class)
        y_list.append(np.full(n_class, class_idx))

    # Combine all classes
    X = np.vstack(X_list)
    y = np.concatenate(y_list)

    # Create covariance matrix
    cov_matrix = np.array([
        [1.0, correlation],  # Variance of X1 = 1, Cov(X1,X2) = rho
        [correlation, 1.0]   # Cov(X1,X2) = rho, Variance of X2 = 1
    ])

    # Apply Cholesky decomposition to get transformation matrix
    L = np.linalg.cholesky(cov_matrix)

    # Apply the transformation to add correlation. It keeps x1 and maps x2 to
    # rho * x1 + sqrt(1 - rho**2) * x2; since it acts on the whole dataset,
    # the class means are moved as well.
    X_correlated = X @ L.T

    return X_correlated, y

### Synthetic Data for Decision Trees

In [54]:
def synthetic_dataset_dt(n_samples: int = 600, noise_rate: float = 0.2, correlation: float = 0.5, random_state: int = 1505) -> tuple[np.ndarray, np.ndarray]:
    """
    Generate a synthetic 2-D, four-class dataset for Decision Trees.

    Description:
    - Four Gaussian blobs, one per quadrant before the mixing step, each with
      its own label: 0 = (-1.5, -1.5), 1 = (1.5, 1.5), 2 = (-1.5, 1.5),
      3 = (1.5, -1.5)
    - The features are then linearly mixed so that they become correlated,
      which breaks the independence assumption of Naive Bayes
    - NOTE(review): the centres are listed in XOR order (diagonal pair first),
      but every blob gets a distinct label, so this is a four-class problem
      rather than a two-class XOR -- confirm this is intended

    Args:
        n_samples: Total number of points. When it is not a multiple of four,
            the first classes receive one extra point each so that exactly
            ``n_samples`` rows are returned.
        noise_rate: Standard deviation of each blob (despite the name, it is a
            spread and not a label-noise fraction).
        correlation: Off-diagonal entry (rho) of the 2x2 unit-variance
            covariance matrix whose Cholesky factor is applied to the data.
        random_state: Seed for the private random generator.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_samples, 2)`` and ``y`` holds the
        integer class label (0-3) of each row, ordered class by class.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix is not positive
            definite (e.g. ``abs(correlation) >= 1``).
    """
    # A private legacy generator produces the same stream as
    # np.random.seed(random_state) followed by np.random.* calls, but does not
    # reseed the global NumPy RNG behind the caller's back.
    rng = np.random.RandomState(random_state)

    class_means = [(-1.5, -1.5), (1.5, 1.5), (-1.5, 1.5), (1.5, -1.5)]

    # Split n_samples over the classes; the first `remainder` classes get one more.
    base, remainder = divmod(n_samples, len(class_means))

    data = []
    labels = []
    for i, (cx, cy) in enumerate(class_means):
        n_class = base + (1 if i < remainder else 0)
        pts = rng.normal(loc=[cx, cy], scale=noise_rate, size=(n_class, 2))
        data.append(pts)
        labels.append(np.full(n_class, i))
    X = np.vstack(data)
    y = np.hstack(labels)

    # Create covariance matrix
    cov_matrix = np.array([
        [1.0, correlation],  # Variance of X1 = 1, Cov(X1,X2) = rho
        [correlation, 1.0]   # Cov(X1,X2) = rho, Variance of X2 = 1
    ])

    # Apply Cholesky decomposition to get transformation matrix
    L = np.linalg.cholesky(cov_matrix)

    # Apply the transformation to add correlation. It keeps x1 and maps x2 to
    # rho * x1 + sqrt(1 - rho**2) * x2; since it acts on the whole dataset,
    # the blob centres are moved as well.
    X_correlated = X @ L.T

    return X_correlated, y

### Synthetic Data for Decision Rules

In [80]:
def synthetic_dataset_dr(n_samples: int = 600, noise_rate: float = 0.05, random_state: int = 1505) -> tuple[np.ndarray, np.ndarray]:
    """
    Generate a synthetic 2-D, four-class dataset for Decision Rules.

    Points are uniform on the square [-2, 2) x [-2, 2) and labelled by
    axis-aligned rectangular rules:
      - Class 0: x1 <= 0 and x2 <= 0
      - Class 1: x1 > 0 and x2 <= 0
      - Class 2: x1 <= 0 and x2 > 0
      - Class 3: x1 > 0 and x2 > 0
    Shallow trees (decision rules) capture rectangles cleanly.

    Args:
        n_samples: Number of points to generate.
        noise_rate: Fraction of samples whose label is redrawn uniformly from
            {0, 1, 2, 3}. A redrawn label may equal the original one, so on
            average about 3/4 of the selected samples actually change class.
        random_state: Seed for the private random generator.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_samples, 2)`` and ``y`` holds the
        integer class label (0-3) of each row.
    """
    # A private legacy generator produces the same stream as
    # np.random.seed(random_state) followed by np.random.* calls, but does not
    # reseed the global NumPy RNG behind the caller's back.
    rng = np.random.RandomState(random_state)

    # Uniform points in 2D space
    X = rng.uniform(low=-2.0, high=2.0, size=(n_samples, 2))

    # Assign class based on axis-aligned rules (quadrants):
    # the x1 > 0 test contributes 1 and the x2 > 0 test contributes 2.
    right = (X[:, 0] > 0).astype(int)
    upper = (X[:, 1] > 0).astype(int)
    y = right + 2 * upper

    # Add label noise: redraw the label of a random subset of samples.
    idx = rng.choice(n_samples, size=int(noise_rate * n_samples), replace=False)
    y[idx] = rng.randint(0, 4, size=len(idx))

    return X, y

### Synthetic Data for Nearest Neighbor

In [87]:
def synthetic_dataset_knn(n_samples: int = 600, std: float = 0.6, noise: float = 0.6, random_state: int = 42) -> tuple[np.ndarray, np.ndarray]:
    """
    Synthetic 2-D, four-class dataset designed to favor kNN over SVM.

    Class ``i`` lies on a circle of radius ``i`` around the origin (class 0 is
    a blob at the origin), with Gaussian jitter added to every point.
    - kNN advantages:
    The same class is contiguous in feature space.
    There's no dependency between features.
    - SVM disadvantages:
    The boundaries are not linearly separable.

    Args:
        n_samples: Total number of points. When it is not a multiple of four,
            the first classes receive one extra point each so that exactly
            ``n_samples`` rows are returned.
        std: Currently unused; kept so existing callers that pass it keep
            working.
        noise: Standard deviation of the Gaussian jitter added to each
            coordinate.
        random_state: Seed for the private random generator.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_samples, 2)`` and ``y`` holds the
        integer class label (0-3) of each row, ordered class by class.
    """
    # A private legacy generator produces the same stream as
    # np.random.seed(random_state) followed by np.random.* calls, but does not
    # reseed the global NumPy RNG behind the caller's back.
    rng = np.random.RandomState(random_state)

    n_classes = 4

    # Class i sits on the circle of radius i.
    radii = range(n_classes)

    # Split n_samples over the classes; the first `remainder` classes get one more.
    base, remainder = divmod(n_samples, n_classes)

    X_parts = []
    y_parts = []

    # Generate data
    for i, r in enumerate(radii):
        n_class = base + (1 if i < remainder else 0)
        # Evenly spaced angles; endpoint=False avoids duplicating angle 0 at 2*pi.
        theta = np.linspace(0, 2 * np.pi, n_class, endpoint=False)
        circle = np.c_[r * np.cos(theta), r * np.sin(theta)]
        circle += rng.normal(0, noise, circle.shape)
        X_parts.append(circle)
        y_parts.append(np.full(n_class, i))

    X = np.vstack(X_parts)
    y = np.concatenate(y_parts)
    return X, y

## Drawing data

### Naive Bayes

In [66]:
# Build the Naive Bayes dataset and report the correlation over the pooled data.
# The pooled value is high because the class centres lie on a diagonal, even
# though the two features are sampled independently within each class.
X, y = synthetic_dataset_nb(n_samples=500, noise_scale=0.7, random_state=1505)
print("Correlation:", np.corrcoef(X[:, 0], X[:, 1])[0, 1])

# Scatter plot coloured by class label.
draw_plot(X, y, title="Synthetic dataset for Naive Bayes")


Correlation: 0.9104225518895338


### Logistic Regression

In [71]:
# Build the Logistic Regression dataset with negatively correlated features.
X, y = synthetic_dataset_lr(n_samples=500, correlation=-0.4, noise_scale=0.7)

# Pearson correlation between the two feature columns over all samples.
correlation = np.corrcoef(X[:, 0], X[:, 1])[0, 1]
print("Correlation:", correlation)

# Scatter plot coloured by class label.
draw_plot(X, y, title="Synthetic dataset for Logistic Regression")


Correlation: -0.4194113310085513


### Support Vector Machine

In [73]:
# Build the linear-SVM dataset with strongly correlated features.
X, y = synthetic_dataset_svm(n_samples=500, correlation=0.8)

# Pearson correlation between the two feature columns over all samples.
correlation = np.corrcoef(X[:, 0], X[:, 1])[0, 1]
print("Correlation:", correlation)

# Scatter plot coloured by class label.
draw_plot(X, y, title="Synthetic dataset for SVM (Linear Kernel)")

Correlation: 0.8016553008473553


### Decision Tree

In [76]:
# Build the Decision Tree dataset; noise_rate is the spread of each blob.
X, y = synthetic_dataset_dt(n_samples=500, noise_rate=0.7, random_state=1505)

# Pearson correlation between the two feature columns over all samples.
correlation = np.corrcoef(X[:, 0], X[:, 1])[0, 1]
print("Correlation:", correlation)

# Scatter plot coloured by class label.
draw_plot(X, y, title="Synthetic dataset for Decision Tree")

Correlation: 0.5050440824524273


### Decision Rules

In [82]:
# Build the Decision Rules dataset with 1% of the labels redrawn at random.
X, y = synthetic_dataset_dr(n_samples=500, noise_rate=0.01, random_state=1505)

# Pearson correlation between the two feature columns over all samples.
correlation = np.corrcoef(X[:, 0], X[:, 1])[0, 1]
print("Correlation:", correlation)

# Scatter plot coloured by class label.
draw_plot(X, y, title="Synthetic dataset for Decision Rules")

Correlation: 0.0028942143292361093


### Nearest Neighbor

In [88]:
# Build the Nearest Neighbor dataset (concentric rings with small jitter).
# Note: `std` is accepted by the generator but does not affect its output.
X, y = synthetic_dataset_knn(n_samples=500, std=0.8, noise=0.2, random_state=42)

# Pearson correlation between the two feature columns over all samples.
correlation = np.corrcoef(X[:, 0], X[:, 1])[0, 1]
print("Correlation:", correlation)

# Scatter plot coloured by class label.
draw_plot(X, y, title="Synthetic dataset for Nearest Neighbor")

Correlation: -0.004685765220882318
