# Linear Regression
## Question 1
Make a class called `LinearRegression` that provides two methods: `fit` and `predict`. Try to implement it from scratch. If you get stuck, refer to the examples folder.

In [None]:
class LinearRegression:
    """Single-feature ordinary-least-squares regression.

    Fits the line ``y = m * x + b`` with the closed-form least-squares
    solution. After ``fit`` the slope is stored in ``m`` and the intercept
    in ``b``; both are ``None`` until then.
    """

    def __init__(self):
        self.m = None  # slope, set by fit()
        self.b = None  # intercept, set by fit()

    def fit(self, X, y):
        """Estimate the slope and intercept from paired observations.

        Args:
            X: Sized iterable of numeric feature values, one per sample.
            y: Sized iterable of numeric targets, same length as ``X``.

        Raises:
            ValueError: If the input is empty, if ``X`` and ``y`` differ in
                length, or if all values in ``X`` are identical.
        """
        n = len(X)
        if n == 0:
            raise ValueError("Input data is empty")
        if len(y) != n:
            raise ValueError("X and y must have the same length")

        mean_x = sum(X) / n
        mean_y = sum(y) / n

        numerator = 0
        denominator = 0

        # Walk the values themselves instead of indexing by position:
        # X[i] is a *label* lookup on containers such as a pandas Series,
        # which fails once the index is no longer 0..n-1.
        for x_i, y_i in zip(X, y):
            dx = x_i - mean_x
            numerator += dx * (y_i - mean_y)
            denominator += dx ** 2

        if denominator == 0:
            raise ValueError("Cannot fit model when all X values are same")

        self.m = numerator / denominator
        self.b = mean_y - self.m * mean_x

    def predict(self, X):
        """Return the fitted line evaluated at every value in ``X``.

        Args:
            X: Iterable of numeric feature values.

        Returns:
            A list with one prediction per input value.

        Raises:
            ValueError: If the model has not been fitted yet.
        """
        if self.m is None or self.b is None:
            raise ValueError("Model has not been fitted yet")

        return [self.m * x + self.b for x in X]


## Question 2

Use the dataset https://www.kaggle.com/datasets/quantbruce/real-estate-price-prediction (*).
1. Read it using pandas.
2. Check for **null values**.
3. For each of the columns (except the first and last), plot the column values in the X-axis against the last column of prices in the Y-axis.
4. Remove the unwanted columns.
5. Split the dataset into train and test data. Test data size = 25% of total dataset.
6. **Normalize** the X_train and X_test using MinMaxScaler from sklearn.preprocessing.
7. Fit the training data into the model created in question 1 and predict the testing data.
8. Use **mean squared error and R<sup>2</sup>** from sklearn.metrics as the evaluation criteria.
9. Fit the training data into the models of the same name provided by sklearn.linear_model and evaluate the predictions using MSE and R<sup>2</sup>.
10. Tune the hyperparameters of your models (learning rate, epochs) to achieve losses close to that of the sklearn models.

Note : (*) To solve this question, you may proceed in any of the following ways :
1. Prepare the notebook in Kaggle, download it and submit it separately with the other questions.
2. Download the dataset from kaggle. Upload it to the session storage in Colab.
3. Use Kaggle data directly in Colab. [Refer here](https://www.kaggle.com/general/74235). For this, you need to create a Kaggle API token. Before submitting, hide or remove the API token.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression as SklearnLinearRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Load the real-estate dataset from the Kaggle input directory.
df = pd.read_csv(
    "/kaggle/input/real-estate-price-prediction/Real estate.csv"
)

# A bare expression is only rendered when it is the last statement of a
# notebook cell, so print explicitly to make the preview and the per-column
# null counts visible.
print(df.head())
print(df.isnull().sum())

# The last column is the target; the first column is excluded from the
# features along with it.
target_column = df.columns[-1]
feature_columns = df.columns[1:-1]

# One scatter plot per feature against the target. Each iteration opens its
# own figure, so no figure is created up front (that would only render as an
# empty canvas).
for col in feature_columns:
    plt.figure()
    plt.scatter(df[col], df[target_column])
    plt.xlabel(col)
    plt.ylabel(target_column)
    plt.title(f"{col} vs {target_column}")
    plt.show()


X = df.iloc[:, 1:-1]   # features
y = df.iloc[:, -1]    # target

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=42
)

# Fit the scaler on the training rows only and reuse those min/max values
# for the test rows, so no test-set statistics leak into training.
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


class LinearRegressionScratch:
    """Linear regression trained with full-batch gradient descent.

    Args:
        lr: Step size applied to every gradient update.
        epochs: Number of passes over the whole training set.
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr, self.epochs = lr, epochs
        # Both parameters are created by fit().
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn weights and bias by minimising the mean squared error.

        Args:
            X: 2-D array exposing ``.shape`` and ``.T`` (samples x features).
            y: Target values, one per row of ``X``.
        """
        sample_count, feature_count = X.shape

        # Start from the all-zero model.
        self.weights, self.bias = np.zeros(feature_count), 0

        for _epoch in range(self.epochs):
            # Signed error of the current model on every sample.
            residual = (np.dot(X, self.weights) + self.bias) - y

            grad_w = (1 / sample_count) * np.dot(X.T, residual)
            grad_b = (1 / sample_count) * np.sum(residual)

            self.weights -= self.lr * grad_w
            self.bias -= self.lr * grad_b

    def predict(self, X):
        """Return ``X @ weights + bias`` for every row of ``X``."""
        linear_part = np.dot(X, self.weights)
        return linear_part + self.bias


# Train both regressors on the scaled training split, then score each of
# them on the held-out test split with MSE and R².

# Gradient-descent model written above.
custom_model = LinearRegressionScratch(lr=0.05, epochs=3000)
custom_model.fit(X_train_scaled, y_train)
y_pred_custom = custom_model.predict(X_test_scaled)

# Reference implementation from scikit-learn.
sk_model = SklearnLinearRegression()
sk_model.fit(X_train_scaled, y_train)
y_pred_sklearn = sk_model.predict(X_test_scaled)

mse_custom, r2_custom = (
    mean_squared_error(y_test, y_pred_custom),
    r2_score(y_test, y_pred_custom),
)
mse_sklearn, r2_sklearn = (
    mean_squared_error(y_test, y_pred_sklearn),
    r2_score(y_test, y_pred_sklearn),
)

print("Custom Linear Regression")
print("MSE:", mse_custom)
print("R²:", r2_custom)

print("\nSklearn Linear Regression")
print("MSE:", mse_sklearn)
print("R²:", r2_sklearn)


# Exhaustive search over learning rate x epoch count for the gradient-descent
# model, keeping the combination with the lowest MSE.
# NOTE: candidates are ranked by their MSE on the test split, so the reported
# "Best MSE" is an optimistic estimate rather than an unbiased one.
learning_rates = [0.001, 0.01, 0.05, 0.1]
epochs_list = [1000, 3000, 5000]

best_mse = float("inf")

# Flatten the two-level grid; learning rate varies slowest, as before.
search_grid = [
    (rate, n_epochs)
    for rate in learning_rates
    for n_epochs in epochs_list
]

for lr, epochs in search_grid:
    model = LinearRegressionScratch(lr=lr, epochs=epochs)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)

    # Only a strict improvement replaces the current best.
    if not mse < best_mse:
        continue
    best_mse, best_params = mse, (lr, epochs)

print("Best Parameters:")
print("Learning Rate:", best_params[0])
print("Epochs:", best_params[1])
print("Best MSE:", best_mse)
print("Sklearn MSE:", mse_sklearn)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Logistic Regression
## Question 3

The breast cancer dataset is a binary classification dataset commonly used in machine learning tasks. It is available in scikit-learn (sklearn) as part of its datasets module.
Here is an explanation of the breast cancer dataset's components:

* Features (X):

 * The breast cancer dataset consists of 30 numeric features representing different characteristics of the FNA images. These features include mean, standard error, and worst (largest) values of various attributes such as radius, texture, smoothness, compactness, concavity, symmetry, fractal dimension, etc.

* Target (y):

 * The breast cancer dataset is a binary classification problem, and the target variable (y) represents the diagnosis of the breast mass. It contains two classes:
    * 0: Represents a malignant (cancerous) tumor.
    * 1: Represents a benign (non-cancerous) tumor.

Complete the code given below in place of the "..."

1. Load the dataset from sklearn.datasets
2. Separate out the X and Y columns.
3. Normalize the X data using MinMaxScaler or StandardScaler.
4. Create a train-test-split. Take any suitable test size.

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the breast-cancer dataset bundled with scikit-learn.
data = load_breast_cancer()

X = data.data
y = data.target

print("X shape:", X.shape)
print("y shape:", y.shape)

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used for training (data leakage).
# stratify=y keeps the class ratio the same in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later code that refers to the fully scaled matrix still runs;
# it uses the training-set statistics as well.
X_scaled = scaler.transform(X)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)



5. Write code for the sigmoid function and Logistic regression.


In [None]:
import numpy as np

def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

    The value is computed from ``exp(-|z|)``, which always lies in (0, 1],
    so large-magnitude inputs never overflow (the naive formula overflows
    in ``exp`` for very negative ``z``).

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``z``.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    # For z >= 0 this is the textbook form; for z < 0 numerator and
    # denominator are both multiplied by exp(z) to keep them bounded.
    out = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves real
    # arrays unchanged.
    return out[()]

def sigmoid_derivative(z):
    """Return the derivative of the logistic function at ``z``.

    Uses the identity ``sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z))``.
    """
    s = sigmoid(z)
    return s * (1 - s)


class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Args:
        learning_rate: Step size applied to every gradient update.
        epochs: Number of passes over the whole training set.
    """

    def __init__(self, learning_rate=0.01, epochs=1000):
        self.lr = learning_rate
        self.epochs = epochs
        self.weights = None  # column vector (n_features, 1), set by fit()
        self.bias = None     # scalar intercept, set by fit()

    def fit(self, X, y):
        """Learn weights and bias from the training data.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of n_samples labels; reshaped to a column vector.

        Raises:
            ValueError: If ``X`` and ``y`` hold a different number of samples.
        """
        # Coerce to NumPy so lists and pandas objects (which have no
        # ``reshape``) are accepted as well as arrays.
        X = np.asarray(X)
        y = np.asarray(y).reshape(-1, 1)

        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        self.weights = np.zeros((n_features, 1))
        self.bias = 0

        for _ in range(self.epochs):
            z = np.dot(X, self.weights) + self.bias
            # Difference between predicted probability and label, (n_samples, 1).
            error = sigmoid(z) - y

            dw = (1 / n_samples) * np.dot(X.T, error)
            db = (1 / n_samples) * np.sum(error)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

    def predict(self, X):
        """Return hard 0/1 predictions using a 0.5 probability threshold.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            Integer array of shape (n_samples, 1).
        """
        z = np.dot(np.asarray(X), self.weights) + self.bias
        y_pred_probs = sigmoid(z)
        return (y_pred_probs >= 0.5).astype(int)


6. Fit your model on the dataset and make predictions.
7. Compare your model with the Sklearn Logistic Regression model. Try out all the different penalties.
8. Print accuracy_score in each case using sklearn.metrics .

In [None]:
from sklearn.linear_model import LogisticRegression as SkLogisticRegression
from sklearn.metrics import accuracy_score

# From-scratch model defined above.
custom_model = LogisticRegression(learning_rate=0.01, epochs=3000)
custom_model.fit(X_train, y_train)

y_pred_custom = custom_model.predict(X_test)


# scikit-learn without regularisation. The option is the Python object None:
# the string 'none' was deprecated and later removed, so current releases
# reject it.
sk_none = SkLogisticRegression(
    penalty=None,
    solver='lbfgs',
    max_iter=5000
)
sk_none.fit(X_train, y_train)
y_pred_none = sk_none.predict(X_test)


# L2 (ridge) penalty.
sk_l2 = SkLogisticRegression(
    penalty='l2',
    solver='lbfgs',
    max_iter=5000
)
sk_l2.fit(X_train, y_train)
y_pred_l2 = sk_l2.predict(X_test)


# L1 (lasso) penalty; lbfgs does not support it, hence liblinear.
sk_l1 = SkLogisticRegression(
    penalty='l1',
    solver='liblinear',
    max_iter=5000
)
sk_l1.fit(X_train, y_train)
y_pred_l1 = sk_l1.predict(X_test)


# Elastic-net penalty (equal L1/L2 mix); only the saga solver supports it.
sk_elastic = SkLogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.5,
    max_iter=5000
)
sk_elastic.fit(X_train, y_train)
y_pred_elastic = sk_elastic.predict(X_test)


print("Accuracy Scores:\n")

print("Custom Logistic Regression:",
      accuracy_score(y_test, y_pred_custom))

print("Sklearn Logistic Regression (No penalty):",
      accuracy_score(y_test, y_pred_none))

print("Sklearn Logistic Regression (L2 penalty):",
      accuracy_score(y_test, y_pred_l2))

print("Sklearn Logistic Regression (L1 penalty):",
      accuracy_score(y_test, y_pred_l1))

print("Sklearn Logistic Regression (ElasticNet penalty):",
      accuracy_score(y_test, y_pred_elastic))


9. For the best model in each case (yours and scikit-learn), print the classification_report using sklearn.metrics .
10. For the best model in each case (yours and scikit-learn), print the confusion_matrix using sklearn.metrics .

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-class precision/recall/F1 for the from-scratch model.
print("Classification Report — Custom Logistic Regression\n")
print(classification_report(y_test, y_pred_custom))

# Test-set predictions of every scikit-learn variant, keyed by display name.
sklearn_predictions = {
    "No penalty": y_pred_none,
    "L2 penalty": y_pred_l2,
    "L1 penalty": y_pred_l1,
    "ElasticNet penalty": y_pred_elastic
}

# Keep the variant with the highest accuracy; on a tie the earlier entry wins
# because only a strictly better score replaces the current best.
best_model_name, best_model_accuracy, best_model_predictions = None, 0, None

for name, preds in sklearn_predictions.items():
    acc = accuracy_score(y_test, preds)
    if not acc > best_model_accuracy:
        continue
    best_model_name, best_model_accuracy, best_model_predictions = (
        name,
        acc,
        preds,
    )

print(f"Best Sklearn Model: {best_model_name}")
print(f"Accuracy: {best_model_accuracy}\n")

print("Classification Report — Sklearn Logistic Regression\n")
print(classification_report(y_test, best_model_predictions))

# Confusion matrices for the same two models.
print("Confusion Matrix — Custom Logistic Regression\n")
print(confusion_matrix(y_test, y_pred_custom))

print("\nConfusion Matrix — Sklearn Logistic Regression\n")
print(confusion_matrix(y_test, best_model_predictions))


# KNN
## Question 4

How accurately can a K-Nearest Neighbors (KNN) model classify different types of glass based on a glass classification dataset consisting of 214 samples and 7 classes? Use the kaggle dataset "https://www.kaggle.com/datasets/uciml/glass".

Context: This is the Glass Identification Data Set from UCI. It contains 10 attributes, including an id. The response is the glass type (7 discrete values).

1. Load the data as you did in the 2nd question.
2. Extract the X and Y columns.
3. Split it into training and testing datasets.

In [None]:
from sklearn.model_selection import train_test_split

df = pd.read_csv("glass.csv")
# A bare expression is only rendered when it is the last statement of a
# notebook cell, so print the preview explicitly.
print(df.head())

# The last column is the class label.
target_column = df.columns[-1]

# Drop the leading column only when it really is a row identifier; slicing it
# off unconditionally would discard a genuine feature if the CSV has no id.
# NOTE(review): the Kaggle copy of glass.csv appears to start with the RI
# feature rather than an id column — confirm against the actual file.
first_column = df.columns[0]
non_feature_columns = [target_column]
if str(first_column).strip().lower() in {"id", "id number", "id_number"}:
    non_feature_columns.append(first_column)

X = df.drop(columns=non_feature_columns)
y = df[target_column]

print("X shape:", X.shape)
print("y shape:", y.shape)

# Hold out 25% of the rows; stratify=y keeps the class proportions the same
# in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


4. Define Euclidean distance.
5. Build the KNN model.
6. Fit the model on the training data. (Note: you may need to convert the data from pandas DataFrames to NumPy arrays. To do that, use `X = np.array(X)`, and so on.)

In [None]:
import numpy as np

def euclidean_distance(x1, x2):
    """Return the straight-line (L2) distance between points ``x1`` and ``x2``."""
    delta = x1 - x2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

from collections import Counter
import numpy as np

class KNN:
    """K-nearest-neighbours classifier using Euclidean distance.

    Args:
        k: Number of neighbours that vote on each prediction.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, k=5):
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k

    def fit(self, X, y):
        """Store the training data; KNN does no work until prediction time.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of n_samples class labels.

        Raises:
            ValueError: If ``X`` and ``y`` hold a different number of samples.
        """
        # Coerce to NumPy so rows and labels are addressed by position. On a
        # pandas Series ``y[i]`` is a label lookup, which fails or returns the
        # wrong row after a shuffled train/test split.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return the predicted label for every row of ``X`` as an array."""
        # np.asarray makes iteration yield rows even for a DataFrame, whose
        # plain iteration yields column names instead.
        predictions = [self._predict_single(x) for x in np.asarray(X)]
        return np.array(predictions)

    def _predict_single(self, x):
        """Return the majority label among the k training rows nearest to ``x``."""
        distances = [
            euclidean_distance(x, x_train)
            for x_train in self.X_train
        ]

        # Positions of the k smallest distances.
        k_indices = np.argsort(distances)[:self.k]

        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # Majority vote among the selected neighbours.
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]


7. Make predictions. Find their accuracy using accuracy_score. Try different k values. k=3 worked well in our case.
8. Compare with the sklearn model (from sklearn.neighbors import KNeighborsClassifier)

In [None]:
from sklearn.metrics import accuracy_score

import numpy as np

# The from-scratch KNN works on positional NumPy arrays, so convert the
# pandas splits once up front. These *_np names were previously used without
# ever being defined, which raised NameError.
X_train_np = np.asarray(X_train)
X_test_np = np.asarray(X_test)
y_train_np = np.asarray(y_train)
y_test_np = np.asarray(y_test)

# Try several neighbourhood sizes with the from-scratch classifier.
k_values = [1, 3, 5, 7, 9]

for k in k_values:
    knn = KNN(k=k)
    knn.fit(X_train_np, y_train_np)

    y_pred = knn.predict(X_test_np)
    acc = accuracy_score(y_test_np, y_pred)

    print(f"k = {k}, Accuracy = {acc:.4f}")

# Refit with k=3 for the head-to-head comparison.
knn_best = KNN(k=3)
knn_best.fit(X_train_np, y_train_np)

y_pred_custom = knn_best.predict(X_test_np)

custom_accuracy = accuracy_score(y_test_np, y_pred_custom)
print("Custom KNN Accuracy (k=3):", custom_accuracy)

from sklearn.neighbors import KNeighborsClassifier

# Same k and metric so the two implementations are directly comparable.
sk_knn = KNeighborsClassifier(
    n_neighbors=3,
    metric="euclidean"
)

sk_knn.fit(X_train_np, y_train_np)
y_pred_sklearn = sk_knn.predict(X_test_np)

sk_accuracy = accuracy_score(y_test_np, y_pred_sklearn)

print("Custom KNN Accuracy:", custom_accuracy)
print("Sklearn KNN Accuracy:", sk_accuracy)
