# In [1]:
import numpy as np
import pandas as pd
import random

# In [29]:
import numpy as np
import pandas as pd

class MyLineReg:
    """Linear regression fitted by (mini-batch) gradient descent on MSE.

    A bias column of ones is prepended to the features, so ``W[0]`` is the
    intercept and ``W[1:]`` are the feature coefficients.

    Parameters
    ----------
    n_iter:
        Number of gradient steps performed by ``fit``.
    learning_rate:
        Either a constant step size or a callable taking the 1-based
        iteration number and returning the step size for that iteration.
    W:
        Accepted for backward compatibility only; the value is ignored and
        the weights are re-initialised to ones on every ``fit`` call.
    metric:
        Optional key of ``MyLineReg.metrics`` ("mse", "mae", "rmse", "mape",
        "r2") evaluated on the full training data after fitting.
    reg:
        Optional regularisation: "l1", "l2" or "elasticnet". Any other value
        disables regularisation.
    l1_coef, l2_coef:
        Strengths of the L1 / L2 penalties.
    sgd_sample:
        ``None`` for full-batch descent, an ``int`` for an absolute batch
        size, or a ``float`` for a fraction of the training rows.
    random_state:
        Seed used for drawing the mini-batches.
    """

    def __init__(
        self,
        n_iter: int = 100,
        learning_rate: float = 0.1,
        W: np.array = None,
        metric: str = None,
        reg: str = None,
        l1_coef: float = 0.0,
        l2_coef: float = 0.0,
        sgd_sample = None,
        random_state: int = 42
    ):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        # The ``W`` argument is intentionally not stored: ``fit`` always
        # starts from a vector of ones.
        self.W = None
        self.metric = metric
        self.score = 0
        self.reg = reg
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.sgd_sample = sgd_sample
        self.random_state = random_state

    # The metric helpers below are deliberately plain functions (no ``self``):
    # they are collected in the ``metrics`` dispatch table while the class
    # body executes and are always called through that table.
    def mse(y_true, y_pred):
        """Mean squared error."""
        return np.mean((y_true - y_pred)**2)

    def mae(y_true, y_pred):
        """Mean absolute error."""
        return np.mean(np.abs(y_true - y_pred))

    def rmse(y_true, y_pred):
        """Root mean squared error."""
        return np.sqrt(np.mean((y_true - y_pred)**2))

    def mape(y_true, y_pred):
        """Mean absolute percentage error, in percent (divides by ``y_true``)."""
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    def r2(y_true, y_pred):
        """Coefficient of determination (divides by the variance of ``y_true``)."""
        SS_res = np.sum((y_true - y_pred)**2)
        SS_tot = np.sum((y_true - np.mean(y_true))**2)
        return 1 - (SS_res / SS_tot)

    # Metric name -> function(y_true, y_pred).
    metrics = {"mse": mse, "mae": mae, "rmse": rmse, "mape": mape, "r2": r2}

    def fit(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        verbose: int = False
    ) -> None:
        """Fit the weights by gradient descent.

        Parameters
        ----------
        X:
            Feature matrix; one row per observation.
        y:
            Target values, positionally matching the rows of ``X``.
        verbose:
            If truthy, print the loss (and the metric, when one is set)
            before training and after every ``verbose``-th iteration.

        Raises
        ------
        KeyError
            If ``metric`` is set to a name missing from ``metrics``.
        ValueError
            If ``sgd_sample`` resolves to a batch size outside
            ``1..len(X)``.
        """
        # A private generator yields the same sample sequence as seeding the
        # global ``random`` module, without clobbering the caller's RNG state.
        rng = random.Random(self.random_state)

        X = pd.concat([pd.Series(1, index=X.index, name='bias'), X], axis=1)
        # Convert once: all maths below is positional NumPy arithmetic.
        features = X.to_numpy(dtype=float)
        target = np.asarray(y, dtype=float)
        n_rows, n_weights = features.shape
        self.W = np.ones(n_weights)

        # Resolve the metric up front so an unknown name fails before training.
        metric_fn = self.metrics[self.metric] if self.metric else None

        # The batch size does not change between iterations.
        if self.sgd_sample is None:
            sample_size = None
        else:
            if isinstance(self.sgd_sample, float):
                sample_size = int(self.sgd_sample * n_rows)
            else:
                sample_size = self.sgd_sample
            if not 0 < sample_size <= n_rows:
                raise ValueError(
                    f"sgd_sample={self.sgd_sample!r} gives a batch of "
                    f"{sample_size} rows; expected between 1 and {n_rows}"
                )

        if verbose:
            start_loss = np.mean((features @ self.W - target) ** 2)
            message = f"start | loss: {start_loss}"
            if metric_fn is not None:
                message += f" | {self.metric}: {metric_fn(target, features @ self.W)}"
            print(message)

        for i in range(self.n_iter):
            if sample_size is None:
                X_batch, y_batch = features, target
            else:
                batch_idx = rng.sample(range(n_rows), sample_size)
                X_batch, y_batch = features[batch_idx], target[batch_idx]

            residual = X_batch @ self.W - y_batch
            gradient = 2 * (X_batch.T @ residual) / len(y_batch)

            # The penalty is applied to every weight, bias included.
            if self.reg == 'l1':
                gradient += self.l1_coef * np.sign(self.W)
            elif self.reg == 'l2':
                gradient += 2 * self.l2_coef * self.W
            elif self.reg == 'elasticnet':
                gradient += self.l1_coef * np.sign(self.W) + 2 * self.l2_coef * self.W

            if callable(self.learning_rate):
                lr = self.learning_rate(i + 1)
            else:
                lr = self.learning_rate

            self.W -= lr * gradient

            if verbose and (i + 1) % verbose == 0:
                # Loss is the batch MSE measured before this step's update;
                # the metric is evaluated on all rows with the updated weights.
                loss = np.mean(residual ** 2)
                message = f"{i + 1} | loss: {loss}"
                if metric_fn is not None:
                    message += f" | {self.metric}: {metric_fn(target, features @ self.W)}"
                print(message)

        if metric_fn is not None:
            self.score = metric_fn(target, features @ self.W)

    def get_coef(self):
        """Return the fitted feature weights, excluding the bias term."""
        return self.W[1:]

    def predict(self, X: pd.DataFrame):
        """Return predictions for ``X`` using the weights learned by ``fit``."""
        X = pd.concat([pd.Series(1, index=X.index, name='bias'), X], axis=1)
        y_pred = X @ self.W
        return y_pred

    def get_best_score(self):
        """Return the metric computed at the end of ``fit`` (0 if no metric was set)."""
        return self.score

    def __str__(self):
        return f"MyLineReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}"
