
# Assignment: Linear Regression, Logistic Regression, and K-Means (From Scratch)

**Instructions**
- You are NOT allowed to use `scikit-learn` for model implementation or feature scaling.
- You may use it for the clustering task (Question 3).
- You may use: `numpy`, `matplotlib`, and standard Python libraries only.
- Every step (scaling, loss, gradients, optimization) must be implemented manually.
- Clearly comment your code and explain your reasoning in Markdown cells.


## Question 1: Linear Regression from Scratch (with Standardization and Regularization)

You are given a dataset `(X, y)`.

### Tasks
1. Implement **StandardScaler manually**:
   - Compute mean and standard deviation for each feature.
   - Standardize the features.
2. Implement **Linear Regression using Gradient Descent**.
3. Add **L2 Regularization (Ridge Regression)**.
4. Plot:
   - Loss vs iterations
   - True vs predicted values

Do NOT use `sklearn`.


In [None]:

import numpy as np
import matplotlib.pyplot as plt



In [None]:

# 1. DATA GENERATION 


def generate_regression_data(n_samples=100, n_features=1, noise=10):
    """Build a reproducible synthetic dataset for linear regression.

    The global NumPy RNG is re-seeded with 42 on every call, so identical
    arguments always produce identical data.

    Args:
        n_samples: number of rows to generate.
        n_features: number of feature columns.
        noise: multiplier applied to the standard-normal noise added to y.

    Returns:
        (X, y) where X has shape (n_samples, n_features) and y has shape
        (n_samples,), with y = X @ w + 5 + noise term.
    """
    np.random.seed(42)

    # The draw order (features -> weights -> noise) determines the values
    # produced for a given seed, so it must not be rearranged.
    features = np.random.rand(n_samples, n_features) * 2
    coefficients = 10 * np.random.randn(n_features)
    intercept = 5

    noiseless_targets = features.dot(coefficients) + intercept
    jitter = noise * np.random.randn(n_samples)
    return features, noiseless_targets + jitter

def generate_classification_data(n_samples=200):
    """Generate two shuffled 2-D Gaussian blobs for binary classification.

    Class 0 is centred at (2, 2) and class 1 at (6, 6), both with unit
    variance. The global NumPy RNG is re-seeded with 42 on every call, so
    the output is deterministic.

    Args:
        n_samples: total number of points. When odd, class 1 receives the
            extra point so that exactly `n_samples` rows are returned.

    Returns:
        (X, y) where X has shape (n_samples, 2) and y has shape (n_samples,)
        holding the labels 0.0 / 1.0.
    """
    np.random.seed(42)

    n_class0 = n_samples // 2
    # Giving the remainder to class 1 keeps the row count equal to n_samples;
    # using n_samples // 2 for both classes made odd sizes crash on indexing.
    n_class1 = n_samples - n_class0

    # Class 0: centred at (2, 2)
    X0 = np.random.randn(n_class0, 2) + 2
    # Class 1: centred at (6, 6)
    X1 = np.random.randn(n_class1, 2) + 6

    X = np.vstack((X0, X1))
    y = np.hstack((np.zeros(n_class0), np.ones(n_class1)))

    # Shuffle so the two classes are interleaved
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    return X[indices], y[indices]

In [None]:
# 2.  ALGORITHMS

# Implement StandardScaler manually: first read about how it works, then implement it.
class StandardScalerManual:
    """Standardize features to zero mean and unit variance, column by column.

    After `fit`, `mean_` holds the per-feature mean and `std_` the
    per-feature population standard deviation (with zeros replaced by 1).
    """

    def fit(self, X):
        """Compute the per-feature mean and standard deviation of X.

        Args:
            X: array-like of shape (n_samples, n_features) or (n_samples,).

        Returns:
            self, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        self.mean_ = np.mean(X, axis=0)
        std = np.std(X, axis=0)
        # A constant feature has std == 0; dividing by 1 instead maps it to
        # all zeros rather than NaN. np.where also works when std is a scalar
        # (1-D input), where in-place boolean assignment would fail.
        self.std_ = np.where(std == 0, 1.0, std)
        return self

    def transform(self, X):
        """Standardize X using the statistics learned by `fit`."""
        return (np.asarray(X, dtype=float) - self.mean_) / self.std_

    def fit_transform(self, X):
        """Fit on X, then return the standardized X."""
        return self.fit(X).transform(X)

In [15]:

# Implement Linear Regression from scratch. You must also construct the regularization term, whose coefficient
# is denoted by l2_lambda.
# Try to implement L1 regularization as well, or at least read about it and where it is used.
class LinearRegressionManual:
    """Linear regression with optional L2 (ridge) penalty, fit by batch gradient descent.

    Attributes populated by `fit`:
        weights: learned coefficient vector of shape (n_features,).
        bias: learned intercept.
        loss_history: loss value recorded at the start of every epoch.
    """

    # NOTE: this must be the double-underscore `__init__`; with `_init_` the
    # constructor is never invoked and passing keyword arguments raises TypeError.
    def __init__(self, lr=0.01, epochs=1000, l2_lambda=0.0):
        """Store hyper-parameters.

        Args:
            lr: gradient-descent step size.
            epochs: number of full-batch update steps.
            l2_lambda: ridge penalty strength (0 disables regularization).
        """
        self.lr = lr
        self.epochs = epochs
        self.l2_lambda = l2_lambda
        self.loss_history = []

    def fit(self, X, y):
        """Fit weights and bias to (X, y).

        Args:
            X: array-like of shape (n_samples, n_features).
            y: array-like with n_samples targets; flattened to 1-D.

        Returns:
            self.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a column-vector y cannot broadcast into an (n, n) residual.
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape

        self.weights = np.zeros(n_features)
        self.bias = 0.0
        # Training restarts from zero on every call, so the history must too.
        self.loss_history = []

        for _ in range(self.epochs):
            y_pred = np.dot(X, self.weights) + self.bias
            residual = y_pred - y

            # Half-MSE loss + L2 penalty (the bias is not penalized)
            loss = (1 / (2 * n_samples)) * np.sum(residual ** 2) + \
                   (self.l2_lambda / (2 * n_samples)) * np.sum(self.weights ** 2)
            self.loss_history.append(loss)

            # Gradients of the loss above w.r.t. weights and bias
            dw = (1 / n_samples) * np.dot(X.T, residual) + (self.l2_lambda / n_samples) * self.weights
            db = (1 / n_samples) * np.sum(residual)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict(self, X):
        """Return the linear predictions X @ weights + bias."""
        return np.dot(X, self.weights) + self.bias




## Question 2: Logistic Regression from Scratch (with Standardization and Regularization)

You are given a binary classification dataset.

### Tasks
1. Reuse your **manual StandardScaler**.
2. Implement **Logistic Regression using Gradient Descent**.
3. Use:
   - Sigmoid function
   - Binary Cross Entropy loss
4. Add **L2 Regularization**.
5. Report:
   - Training loss curve
   - Final accuracy

Do NOT use `sklearn`.


In [16]:

#Implement sigmoid function as told in the lectures 
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    Only exp(-|z|) is ever evaluated, so large-magnitude inputs cannot
    overflow (the naive form overflows for very negative z).

    Args:
        z: scalar or array-like of real values.

    Returns:
        Values in [0, 1] with the same shape as `z` (a NumPy scalar for
        scalar input).
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # always in (0, 1], never overflows
    # For z >= 0: 1 / (1 + e^-z); for z < 0 the equivalent e^z / (1 + e^z).
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d array to a scalar and is a no-op otherwise.
    return result[()]


In [17]:

# Implement Logistic Regression from scratch, and add the regularization term here as well.
class LogisticRegressionManual:
    """Binary logistic regression with optional L2 penalty, fit by batch gradient descent.

    Relies on the module-level `sigmoid` function.

    Attributes populated by `fit`:
        weights: learned coefficient vector of shape (n_features,).
        bias: learned intercept.
        loss_history: loss value recorded at the start of every epoch.
    """

    # NOTE: this must be the double-underscore `__init__`; with `_init_` the
    # constructor is never invoked and passing keyword arguments raises TypeError.
    def __init__(self, lr=0.01, epochs=1000, l2_lambda=0.0):
        """Store hyper-parameters.

        Args:
            lr: gradient-descent step size.
            epochs: number of full-batch update steps.
            l2_lambda: L2 penalty strength (0 disables regularization).
        """
        self.lr = lr
        self.epochs = epochs
        self.l2_lambda = l2_lambda
        self.loss_history = []

    def fit(self, X, y):
        """Fit weights and bias to (X, y).

        Args:
            X: array-like of shape (n_samples, n_features).
            y: array-like with n_samples labels in {0, 1}; flattened to 1-D.

        Returns:
            self.
        """
        X = np.asarray(X, dtype=float)
        # Flatten so a column-vector y cannot broadcast into an (n, n) residual.
        y = np.asarray(y, dtype=float).ravel()
        n_samples, n_features = X.shape

        self.weights = np.zeros(n_features)
        self.bias = 0.0
        # Training restarts from zero on every call, so the history must too.
        self.loss_history = []

        for _ in range(self.epochs):
            linear = np.dot(X, self.weights) + self.bias
            y_pred = sigmoid(linear)
            # Keep probabilities strictly inside (0, 1) so log() stays finite.
            y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

            # Binary cross-entropy loss + L2 penalty (the bias is not penalized)
            loss = -(1 / n_samples) * np.sum(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)) + \
                   (self.l2_lambda / (2 * n_samples)) * np.sum(self.weights ** 2)
            self.loss_history.append(loss)

            error = y_pred - y
            dw = (1 / n_samples) * np.dot(X.T, error) + (self.l2_lambda / n_samples) * self.weights
            db = (1 / n_samples) * np.sum(error)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

        return self

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of X."""
        return sigmoid(np.dot(X, self.weights) + self.bias)

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the class-1 probability exceeds `threshold`."""
        return (self.predict_proba(X) > threshold).astype(int)



## Question 3: K-Means Clustering from Scratch (Matrix Clustering)

You are given a **random matrix** `M` of shape `(n, m)`.

### Tasks
Implement K-Means clustering **from scratch** such that:

1. Input:
   - A random matrix `M`
   - Number of clusters `k`
2. Output:
   - `assignment_table`: a matrix of same shape as `M`, where each element stores the **cluster label**
   - `cookbook`: a dictionary (hashmap) where:
     - Key = cluster index
     - Value = list of **positions (i, j)** belonging to that cluster
   - `centroids`: array storing centroid values

You must cluster **individual elements**, not rows.


In [18]:

# Implement K-Means for matrix elements.
# NOTE: scikit-learn may be used for this task, since it will help us directly in our project.
def kmeans_matrix_manual(M, k, max_iters=100):
    '''
    Cluster the individual elements of a 2-D matrix with 1-D K-Means.

    Args:
    M: 2-D array-like of numeric values, shape (n, m)
    k: number of clusters, 1 <= k <= n * m
    max_iters: upper bound on assignment/update rounds

    Returns:
    assignment_table: same shape as M, contains cluster labels
    cookbook: dict -> cluster_id : list of (i, j) positions
    centroids: numpy array of centroid values

    Raises:
    ValueError: if k is not in the range 1..M.size
    '''
    M = np.asarray(M, dtype=float)
    n, m = M.shape
    data = M.ravel()

    if not 1 <= k <= data.size:
        raise ValueError(
            f"k must be between 1 and the number of elements ({data.size}), got {k}"
        )

    # 1. Initialize centroids from k distinct positions of the data.
    # A private RandomState(42) draws the same sample as seeding the global
    # RNG with 42 would, without clobbering the caller's global RNG state.
    rng = np.random.RandomState(42)
    centroids = rng.choice(data, k, replace=False).astype(float)

    # None means "no assignment made yet". Starting from an all-zero label
    # array would look converged whenever the first assignment is all zeros
    # (always true for k == 1), skipping every centroid update.
    labels = None

    for _ in range(max_iters):
        # 2. Assignment step: nearest centroid by absolute difference.
        # distances has shape (n_points, k).
        distances = np.abs(data[:, np.newaxis] - centroids)
        new_labels = np.argmin(distances, axis=1)

        # Converged once an assignment round changes nothing.
        if labels is not None and np.array_equal(labels, new_labels):
            break
        labels = new_labels

        # 3. Update step: move each centroid to the mean of its members;
        # an empty cluster keeps its previous centroid.
        for cluster in range(k):
            members = data[labels == cluster]
            if members.size > 0:
                centroids[cluster] = members.mean()

    if labels is None:
        # max_iters <= 0: still return a valid assignment for the initial centroids.
        labels = np.argmin(np.abs(data[:, np.newaxis] - centroids), axis=1)

    # Reshape results back to the matrix layout.
    assignment_table = labels.reshape(n, m)
    cookbook = {cluster: [] for cluster in range(k)}
    # ndenumerate walks the table in row-major order, yielding ((i, j), label).
    for position, label in np.ndenumerate(assignment_table):
        cookbook[int(label)].append(position)

    return assignment_table, cookbook, centroids




In [None]:

# 3. EXECUTION


# --- 1. Linear regression: standardize, fit, then plot loss curve and fit ---
print(" 1. Linear Regression ")
X_reg, y_reg = generate_regression_data()
scaler_reg = StandardScalerManual()
X_reg_scaled = scaler_reg.fit_transform(X_reg)

lin_reg = LinearRegressionManual(lr=0.1, epochs=500)
lin_reg.fit(X_reg_scaled, y_reg)
y_reg_pred = lin_reg.predict(X_reg_scaled)

plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
plt.plot(lin_reg.loss_history)
plt.title("LinReg Loss")
plt.xlabel("Iteration")
plt.ylabel("Loss")

# Plotted against the unscaled feature so the x-axis is in original units;
# the predictions themselves come from the standardized inputs.
plt.subplot(1, 2, 2)
plt.scatter(X_reg, y_reg, label="True")
plt.plot(X_reg, y_reg_pred, 'r', label="Predicted")
plt.title("LinReg Fit")
plt.xlabel("X")
plt.ylabel("y")
plt.legend()

plt.tight_layout()
plt.show()

# --- 2. Logistic regression: standardize, fit, report training accuracy ---
print("\n 2. Logistic Regression ")
X_cls, y_cls = generate_classification_data()
scaler_cls = StandardScalerManual()
X_cls_scaled = scaler_cls.fit_transform(X_cls)

log_reg = LogisticRegressionManual(lr=0.1, epochs=1000)
log_reg.fit(X_cls_scaled, y_cls)
acc = np.mean(log_reg.predict(X_cls_scaled) == y_cls)
print(f"Accuracy: {acc*100:.2f}%")

plt.figure(figsize=(6, 4))
plt.plot(log_reg.loss_history)
plt.title("LogReg Loss")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.show()

# --- 3. K-Means over the individual elements of a random integer matrix ---
print("\n 3. K-Means")
M_matrix = np.random.randint(0, 255, (10, 10))
assign, cook, cents = kmeans_matrix_manual(M_matrix, k=3)
print("Centroids:", np.round(cents, 2))
print("Assignments (5x5):\n", assign[:5, :5])


## Submission Guidelines
- Submit the completed `.ipynb` file.
- Clearly label all plots and outputs.
- Code readability and correctness matter.
- Partial credit will be given for logically correct implementations.

**Bonus**
- Compare convergence with and without standardization.
- Try different values of regularization strength.
