In [1]:
import itertools

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

### expand_basis

In [None]:

def expand_basis(X, poly_deg, include_sin, include_log):
    """ Builds the feature matrix from polynomial, sine and logarithm features

    The polynomial block always comes first; the optional sine and logarithm
    blocks are appended column-wise after it, in that order.

    Args:
        X (np.ndarray): Input data; 2-D, presumably of shape (N, D).
        poly_deg (int): Degree passed to `PolynomialFeatures`.
        include_sin (bool): If True, append `sin(3 * X)` element-wise.
        include_log (bool): If True, append `1 + log(max(X, 0) + 1)` element-wise.

    Returns:
        np.ndarray: Expanded feature matrix of shape (N, D').
    """
    feature_groups = [PolynomialFeatures(degree=poly_deg).fit_transform(X)]

    if include_sin:
        sine_block = np.sin(3 * X)
        feature_groups.append(sine_block)

    if include_log:
        # Negative entries are replaced by 0 so the log argument is never below 1.
        non_negative = np.where(X < 0, 0, X)
        log_block = 1 + np.log(non_negative + 1)
        feature_groups.append(log_block)

    return np.hstack(feature_groups)


def expand_poly(X, p):
    """ Expands the polynomial basis of the input data

    Produces a bias column of ones followed by the element-wise powers
    ``X ** 1`` through ``X ** p`` (no cross terms between features).

    Args:
        X (np.ndarray): Input data of shape (N, D).
        p (int): Polynomial Degree, i.e. the highest power generated. Must be >= 0.

    Returns:
        (np.ndarray): Expanded polynomial features of shape (N, 1 + p * D).

    Raises:
        ValueError: If `p` is negative.
    """
    if p < 0:
        raise ValueError(f"Polynomial degree must be non-negative, got {p}")

    N, _ = X.shape

    # Powers run from 1 to p inclusive; the previous loop went one degree too far.
    power_blocks = [X ** degree for degree in range(1, p + 1)]

    return np.hstack([np.ones((N, 1))] + power_blocks)


### Utils

In [None]:

def read_data(random_state=None):
    """ Reads the data for training and test

    It extracts training data inputs and labels from `traindata.txt` and test data inputs
    from `testinputs.txt`. Both files are parsed as tables whose columns are separated by
    three spaces. The training rows are shuffled before being split into inputs (all but the
    last of the 9 columns) and labels (the last column).

    Args:
        random_state (int, optional): Seed forwarded to `DataFrame.sample` so the shuffle is
            reproducible. The default `None` keeps the previous unseeded behaviour.

    Returns:
        np.ndarray: Input training data of shape (N, D).
        np.ndarray: Training data labels of shape (N, 1).
        np.ndarray: Input test data of shape (N', D).

    """
    df_train = pd.read_csv("traindata.txt", sep="   ", names=range(9), engine="python")

    # Full-length sample without replacement == row shuffle.
    df_train = df_train.sample(len(df_train), random_state=random_state)

    X_train = df_train.iloc[:, :-1].values
    y_train = df_train.iloc[:, -1].values.reshape(-1, 1)

    X_test = pd.read_csv("testinputs.txt", sep="   ", names=range(8), engine="python").values
    return X_train, y_train, X_test


def model_fit(Z, y):
    """ Fits Linear Regression model on the data

    Solves the least-squares problem ``min_w ||Z w - y||^2`` with `np.linalg.lstsq`
    instead of explicitly inverting ``Z.T @ Z``. For a full-rank design this gives the
    same weights as the normal equations; for a rank-deficient design (e.g. collinear
    features) it returns the minimum-norm solution rather than raising or producing
    numerically meaningless weights.

    Args:
        Z (np.ndarray): Feature engineered inputs of shape (N, D').
        y (np.ndarray): Corresponding data labels of shape (N, 1).

    Returns:
        np.ndarray: Weight for fitted linear regression model of shape (D', 1).

    """
    # lstsq returns (solution, residuals, rank, singular_values); only the solution is needed.
    w, *_ = np.linalg.lstsq(Z, y, rcond=None)
    return w


def all_train_fit(Xtrain, ytrain, basis):
    """ Fits the linear regression weights for a given basis on all training rows

    Args:
        Xtrain: Input training data of shape (N, D).
        ytrain: Corresponding data labels of shape (N, 1).
        basis: Arguments unpacked into `expand_basis` after `Xtrain`, i.e.
            (poly_degree, include_sin, include_log).

    Returns:
        np.ndarray: Weight for fitted linear regression model of shape (D', 1).

    """
    # Expand the features with the chosen basis, then solve for the weights.
    return model_fit(expand_basis(Xtrain, *basis), ytrain)


def model_predict(Xtest, w_ls, basis):
    """ Predicts labels for the test inputs using fitted linear regression weights

    Args:
        Xtest: Input test data of shape (N, D).
        w_ls: Weight for fitted linear regression model of shape (D', 1).
        basis: Arguments unpacked into `expand_basis` after `Xtest`, i.e.
            (poly_degree, include_sin, include_log).

    Returns:
        np.ndarray: Labels predicted by the linear regression model on test data.

    """
    # Apply the same feature expansion used at fit time, then the linear map.
    return expand_basis(Xtest, *basis) @ w_ls


def mse(y_true, y_pred):
    """ (float) Mean of the squared element-wise differences between truth and prediction """
    residuals = y_true - y_pred
    squared_errors = residuals ** 2
    return squared_errors.mean()

### cross validation

In [None]:

def cross_validation(Z, y, K):
    """ Applies cross validation to the data

    Runs K-fold cross validation: the rows are split, in their given order, into `K`
    contiguous folds; each fold is held out once while the model is fitted on the
    remaining rows, and the held-out MSE is averaged over the folds.

    Folds are built with `np.array_split`, so when N is not divisible by K the first
    `N mod K` folds get one extra row and every row is validated exactly once.
    (When N is divisible by K the folds are identical to equal-length chunks.)

    Args:
        Z (np.ndarray): Feature engineered data of shape (N, D').
        y (np.ndarray): Regression labels array of shape (N, 1) .
        K (int): Number of folds to run cross validation on; must satisfy 2 <= K <= N.

    Returns:
        float: mean MSE error for cross validation.

    Raises:
        ValueError: If `K` is smaller than 2 or larger than the number of rows, since
            that would leave an empty training or validation set.
    """
    n_samples = len(Z)
    if not 2 <= K <= n_samples:
        raise ValueError(f"K must be between 2 and the number of samples ({n_samples}), got {K}")

    fold_losses = []

    for val_idx in np.array_split(np.arange(n_samples), K):
        # Everything outside the held-out fold is used for fitting.
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[val_idx] = False

        w = model_fit(Z[train_mask], y[train_mask])

        y_val_preds = Z[val_idx] @ w
        fold_losses.append(mse(y[val_idx], y_val_preds))

    return sum(fold_losses) / K


### basis_expansion_chooser

In [None]:

def basis_expansion_chooser(X, y, K=5):
    """ Chooses the basis with the lowest cross validation error

    Evaluates every combination of polynomial degree 1-4, with/without sine features and
    with/without log features, and keeps the one with the smallest mean K-fold MSE.
    Each candidate and its loss is printed, followed by a summary line for the winner.

    Args:
        X (np.ndarray): Input training data of shape (N, D).
        y (np.ndarray): Training data labels of shape (N, 1).
        K (int): Number of cross validation folds. Defaults to 5.

    Returns:
        tuple: Basis with the least cross validation MSE loss with values
        (poly_degree, include_sin, include_log), ready to be unpacked into `expand_basis`.
        `None` if no candidate produced a loss below infinity (e.g. all losses were NaN).
    """
    least_R = np.inf
    basis = None

    poly_deg_ls = range(1, 5)
    include_sin_ls = [True, False]
    include_log_ls = [True, False]

    # The product keeps the original evaluation order (poly degree varies fastest), but each
    # candidate is stored in `expand_basis` argument order so the returned tuple can be
    # unpacked directly by `all_train_fit` / `model_predict`.
    for include_log, include_sin, poly_deg in itertools.product(include_log_ls, include_sin_ls, poly_deg_ls):
        _basis = (poly_deg, include_sin, include_log)

        Z = expand_basis(X, *_basis)
        mean_R = cross_validation(Z, y, K)

        if mean_R < least_R:
            least_R = mean_R
            basis = _basis

        print(_basis, "MSE: ", mean_R)
    print(f"Minimal MSE basis: {basis} Least MSE Loss: {least_R}")
    return basis


In [19]:
# Training MSE of the fitted model, written out explicitly.
# `ls_predict` is not defined in this notebook; the prediction function is `model_predict`.
# NOTE(review): `Xtrain`, `ytrain`, `w_ls` and `basis` are not assigned in any visible cell;
# presumably they come from `read_data()`, `basis_expansion_chooser(...)` and
# `all_train_fit(...)` -- add that cell above so Restart & Run All works.
((ytrain - model_predict(Xtrain, w_ls, basis)) ** 2).mean()

43.12106775016496

In [18]:
# Training MSE via the `mse` helper.
# `ls_predict` is not defined in this notebook; the prediction function is `model_predict`.
mse(ytrain, model_predict(Xtrain, w_ls, basis))

43.12106775016496

In [None]:
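# NOTE(review): the commented-out experiments in this cell appear to target an older API
# (`ls_predict`, and a single integer polynomial degree as the basis); they would not run
# against the current `model_predict` / `all_train_fit`, which unpack a 3-tuple basis.
# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 5), 5))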

# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 2), 2))

# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 1), 1))

# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 3), 3))

# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 4), 4))

### Sklearn

In [None]:
# Baseline: scikit-learn linear regression fitted on the un-expanded inputs,
# then used to predict the test inputs.
# NOTE(review): relies on `Xtrain`, `ytrain` and `Xtest` existing in the kernel;
# presumably the outputs of `read_data()` -- confirm they are assigned above.
model = LinearRegression()
model.fit(Xtrain, ytrain)
ytest = model.predict(Xtest)

In [None]:
# In-sample predictions of the baseline model, used for the training MSE in the next cell.
ytrain_ = model.predict(Xtrain)

In [None]:
# Training MSE of the scikit-learn baseline.
mse(ytrain, ytrain_)

In [None]:
from sklearn.model_selection import KFold

In [None]:
# K-fold splitter with scikit-learn's default settings (no arguments are overridden).
cv = KFold()

In [None]:
# Re-read the training table (9 columns, three-space separated) as a plain array.
# Unlike `read_data`, the rows are not shuffled here; the cells below treat the
# last column as the label.
df_train = pd.read_csv("traindata.txt", sep="   ", names=range(9), engine="python")
data_train = df_train.values

In [None]:
# K-fold evaluation of plain linear regression on the raw features.
# Prints "<train MSE> <held-out MSE>" once per fold.
# NOTE: this loop rebinds the notebook-level names `model`, `X_train`, `y_train`,
# `X_test` and `y_test`; after it finishes they refer to the last fold only.
for train_idx, test_idx in cv.split(data_train):
    d_train = data_train[train_idx]
    d_test = data_train[test_idx]
    # Last column is the label, everything before it is a feature.
    X_train, y_train = d_train[:, :-1], d_train[:, -1]
    X_test, y_test = d_test[:, :-1], d_test[:, -1]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    mse_train = mse(y_train, model.predict(X_train))
    mse_test = mse(y_test, model.predict(X_test))
    print(mse_train, mse_test)

In [None]:
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer

In [None]:
# Hand-built feature table for the scikit-learn experiment:
#   1. degree-2 polynomial features of the inputs,
#   2. the sine of those polynomial features,
#   3. log(x + 10) of the raw inputs,
# with the label re-attached as the last column.
feature_eng1 = PolynomialFeatures(degree=2)
feature_eng2 = FunctionTransformer(np.sin)
# NOTE(review): assumes every input value is greater than -10, otherwise the log
# yields NaN/-inf -- TODO confirm against the data.
feature_eng3 = FunctionTransformer(lambda x: np.log(x + 10))

data_train1 = feature_eng1.fit_transform(data_train.copy()[:, :-1])
# NOTE(review): the sine is taken of the polynomial features (data_train1), not of the
# raw inputs as `expand_basis` does with sin(3 * X) -- confirm this is intended.
data_train2 = feature_eng2.fit_transform(data_train1)
# data_train3 = feature_eng2.fit_transform(data_train2)

data_train3 = feature_eng3.fit_transform(data_train.copy()[:, :-1])

# Column-wise concatenation; the label column goes last so the CV loop can slice it off.
data_train_ = np.hstack([
    data_train1,
    data_train2, 
    data_train3, 
    data_train[:, -1].reshape(-1, 1)])

In [None]:
# Same K-fold evaluation as for the raw features, now on the engineered table `data_train_`.
# Prints "<train MSE> <held-out MSE>" once per fold.
# NOTE: rebinds the notebook-level names `model`, `X_train`, `y_train`, `X_test`, `y_test`.
for train_idx, test_idx in cv.split(data_train_):
    d_train = data_train_[train_idx]
    d_test = data_train_[test_idx]
    # Last column is the label, everything before it is a feature.
    X_train, y_train = d_train[:, :-1], d_train[:, -1]
    X_test, y_test = d_test[:, :-1], d_test[:, -1]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    mse_train = mse(y_train, model.predict(X_train))
    mse_test = mse(y_test, model.predict(X_test))
    print(mse_train, mse_test)

In [None]:
# Shape of the degree-2 polynomial feature matrix.
data_train1.shape

In [None]:
# Shape of the raw training table (features plus the label column).
data_train.shape

In [None]:
import math

In [None]:
# Scratch arithmetic: number of 2-element subsets of 5 items.
# NOTE(review): presumably a sanity check on polynomial feature counts -- confirm or remove.
math.comb(5, 2)

In [None]:
# Scratch arithmetic: C(3,2) + C(3,1) + C(3,0).
# NOTE(review): purpose not stated in the notebook -- confirm or remove.
math.comb(3, 2) + math.comb(3, 1) + math.comb(3, 0)

In [None]:
# Scratch arithmetic. NOTE(review): purpose not stated in the notebook -- confirm or remove.
3 * 3 - 3 * 2