In [None]:
import itertools

import numpy as np
import pandas as pd

from sklearn.preprocessing import PolynomialFeatures

### Data

In [None]:
def read_data(random_state=None):
    """Load the training set and the unlabeled test inputs from disk.

    ``traindata.txt`` is parsed with nine columns separated by three spaces;
    every column but the last is treated as a feature and the last one as the
    target.  ``testinputs.txt`` is parsed with eight columns.  The training
    rows are shuffled before being split into features and target, because
    ``cross_validation`` cuts contiguous folds in row order.

    Parameters
    ----------
    random_state : int, numpy random generator or None, default None
        Seed forwarded to ``DataFrame.sample``.  ``None`` keeps the original
        behaviour (a different shuffle on every call); pass an integer to make
        the fold assignment, and therefore the selected basis, reproducible.

    Returns
    -------
    X_train : ndarray of shape (n_train, 8)
    y_train : ndarray of shape (n_train, 1)
    X_test : ndarray of shape (n_test, 8)
    """
    df_train = pd.read_csv("traindata.txt", sep="   ", names=range(9), engine="python")
    # Sampling all rows without replacement is a shuffle.
    df_train = df_train.sample(n=len(df_train), random_state=random_state)
    X_train = df_train.iloc[:, :-1].values
    # Keep the target as a column vector so it can be sliced and stacked like X.
    y_train = df_train.iloc[:, -1].values.reshape(-1, 1)

    X_test = pd.read_csv("testinputs.txt", sep="   ", names=range(8), engine="python").values
    return X_train, y_train, X_test

### model_fit

In [None]:
def model_fit(Z, y):
    """Fit ordinary least-squares weights for the design matrix ``Z``.

    Parameters
    ----------
    Z : ndarray of shape (n_samples, n_features)
        Design matrix (already basis-expanded).
    y : ndarray of shape (n_samples, n_targets)
        Targets, one column per output.

    Returns
    -------
    w : ndarray of shape (n_features, n_targets)
        Weights minimising ``||Z @ w - y||``.  When ``Z`` is rank deficient
        the minimum-norm solution is returned.
    """
    # Solve the least-squares problem on Z directly instead of forming
    # inv(Z.T @ Z): the explicit inverse raises LinAlgError for collinear
    # columns and squares the condition number of the problem.
    w, *_ = np.linalg.lstsq(Z, y, rcond=None)
    return w

### cross validation

In [None]:
def cross_validation(Z, y, K):
    """Estimate the held-out MSE of the least-squares model with K-fold CV.

    The rows are cut into ``K`` contiguous folds in their current order (no
    shuffling happens here).  Each fold is held out once while ``model_fit``
    is trained on the remaining rows, and the mean squared error on the
    held-out fold is recorded.

    Parameters
    ----------
    Z : ndarray of shape (n_samples, n_features)
        Design matrix.
    y : ndarray of shape (n_samples, n_targets)
        Targets; must be 2-D because it is sliced and stacked like ``Z``.
    K : int
        Number of folds, ``2 <= K <= n_samples``.

    Returns
    -------
    float
        Average of the ``K`` per-fold mean squared errors.

    Raises
    ------
    ValueError
        If ``K`` is outside ``[2, n_samples]``; otherwise a fold (or the
        training split) would be empty and the score would be meaningless.
    """
    n_samples = len(Z)
    if not 2 <= K <= n_samples:
        raise ValueError(
            f"K must satisfy 2 <= K <= n_samples ({n_samples}), got {K}"
        )

    # Balanced fold sizes: the first n_samples % K folds take one extra row,
    # so every row is validated exactly once instead of dropping the
    # remainder from all validation folds.
    fold_sizes = np.full(K, n_samples // K, dtype=int)
    fold_sizes[: n_samples % K] += 1
    fold_stops = np.cumsum(fold_sizes)
    fold_starts = fold_stops - fold_sizes

    fold_errors = []
    for test_start, test_stop in zip(fold_starts, fold_stops):
        Z_test = Z[test_start:test_stop, :]
        y_test = y[test_start:test_stop, :]

        # Training rows are everything before and after the held-out slice.
        Z_train = np.vstack((Z[:test_start, :], Z[test_stop:, :]))
        y_train = np.vstack((y[:test_start, :], y[test_stop:, :]))

        w = model_fit(Z_train, y_train)
        fold_errors.append(((Z_test @ w - y_test) ** 2).mean())

    return np.mean(fold_errors)

### expand_basis

In [None]:
def expand_basis(X, poly_deg, include_sin, include_log):
    """Build the design matrix for a chosen basis expansion.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Raw inputs.
    poly_deg : int
        Degree passed to ``PolynomialFeatures``; with its default settings
        the polynomial block includes a constant (bias) column.
    include_sin : bool
        If true, append ``sin(3 * X)`` element-wise.
    include_log : bool
        If true, append ``1 + log(max(X, 0) + 1)`` element-wise; negative
        inputs are clamped to zero so the logarithm stays defined.

    Returns
    -------
    ndarray
        The selected feature groups stacked column-wise, polynomial block
        first, then the sine block, then the log block.
    """
    # Note: no FunctionTransformer objects are built here. They were unused,
    # and the name is not imported before this function is first called, so
    # constructing them raised NameError on a fresh kernel.
    Z_ls = [PolynomialFeatures(degree=poly_deg).fit_transform(X)]

    if include_sin:
        Z_ls.append(np.sin(3 * X))

    if include_log:
        Z_ls.append(1 + np.log(np.where(X < 0, 0, X) + 1))

    return np.hstack(Z_ls)


def expand_poly(X, p):
    """Expand ``X`` with element-wise powers up to degree ``p`` plus a bias.

    No cross terms are produced: each input column is simply raised to the
    powers ``1, 2, ..., p``.

    Parameters
    ----------
    X : ndarray of shape (N, dim)
        Raw inputs.
    p : int
        Highest power to include.  ``p = 0`` yields only the bias column.

    Returns
    -------
    ndarray of shape (N, 1 + p * dim)
        ``[1, X, X**2, ..., X**p]`` stacked column-wise.
    """
    N, dim = X.shape
    # Powers run from 1 to p inclusive; the previous loop went one power too
    # far and returned a degree p + 1 expansion.
    powers = [X ** degree for degree in range(1, p + 1)]
    return np.hstack([np.ones((N, 1))] + powers)


### basis_expansion_chooser

In [None]:
def basis_expansion_chooser(X, y, K=5):
    """Pick the basis expansion with the lowest cross-validated MSE.

    Every combination of polynomial degree 1-4, with or without the sine
    block, with or without the log block, is expanded with ``expand_basis``
    and scored with ``cross_validation``.  Each candidate and its score are
    printed as they are evaluated.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Raw training inputs.
    y : ndarray of shape (n_samples, n_targets)
        Training targets.
    K : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple
        ``(poly_deg, include_sin, include_log)`` of the best candidate, in
        the positional order ``expand_basis`` expects, so it can be unpacked
        directly with ``expand_basis(X, *basis)``.

    Raises
    ------
    ValueError
        If no candidate produced a score below infinity (for example, all
        scores were NaN).
    """
    least_R = np.inf
    basis = None

    poly_deg_ls = range(1, 5)
    include_sin_ls = [True, False]
    include_log_ls = [True, False]

    # The iteration order (log outermost, degree innermost) is unchanged so
    # ties are broken as before; only the stored tuple is put into
    # expand_basis argument order.
    for include_log, include_sin, poly_deg in itertools.product(
        include_log_ls, include_sin_ls, poly_deg_ls
    ):
        candidate = (poly_deg, include_sin, include_log)
        Z = expand_basis(X, *candidate)
        mean_R = cross_validation(Z, y, K)

        if mean_R < least_R:
            least_R = mean_R
            basis = candidate

        print(candidate, "MSE: ", mean_R)

    if basis is None:
        raise ValueError("no basis expansion produced a finite cross-validation MSE")
    return basis

### all_train_fit

In [None]:
def all_train_fit(Xtrain, ytrain, basis):
    """Fit the least-squares weights on the complete training set.

    ``basis`` is unpacked into the positional arguments that follow ``X`` in
    ``expand_basis``; the expanded matrix and ``ytrain`` are then handed to
    ``model_fit``, whose weight array is returned unchanged.
    """
    return model_fit(expand_basis(Xtrain, *basis), ytrain)

### predict

In [None]:
def ls_predict(Xtest, w_ls, basis):
    """Predict targets for ``Xtest`` with fitted least-squares weights.

    ``Xtest`` is expanded through ``expand_basis`` using the unpacked
    ``basis`` and the result is multiplied by ``w_ls``.  ``basis`` must be
    the same one the weights were fitted with, otherwise the column layout
    of the expanded matrix will not match ``w_ls``.
    """
    design = expand_basis(Xtest, *basis)
    return design @ w_ls

### main

In [None]:
# Load the shuffled training set and the unlabeled test inputs.
Xtrain, ytrain, Xtest = read_data()
# Select the basis expansion with the lowest cross-validated MSE.
basis = basis_expansion_chooser(Xtrain, ytrain)
# Refit on all training rows with the selected basis, then predict the test inputs.
w_ls = all_train_fit(Xtrain, ytrain, basis)
ytest_preds = ls_predict(Xtest, w_ls, basis)

print("Minimal MSE basis: ", basis)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Training MSE of the selected model. The predictions must use the same
# basis that w_ls was fitted with; a bare integer cannot be unpacked into
# expand_basis arguments.
mse(ytrain, ls_predict(Xtrain, w_ls, basis))

In [None]:
# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 5), 5))

# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 2), 2))

# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 1), 1))

# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 3), 3))

# mse(ytrain, ls_predict(Xtrain, all_train_fit(Xtrain, ytrain, 4), 4))

### Sklearn

In [None]:
# Baseline: scikit-learn linear regression on the raw features (no basis expansion).
model = LinearRegression()
model.fit(Xtrain, ytrain)
ytest = model.predict(Xtest)

In [None]:
# In-sample predictions of the baseline model.
ytrain_ = model.predict(Xtrain)

In [None]:
# Training MSE of the baseline model.
mse(ytrain, ytrain_)

In [None]:
from sklearn.model_selection import KFold

In [None]:
# K-fold splitter with scikit-learn's default settings
# (NOTE(review): 5 unshuffled folds in current releases — confirm for the installed version).
cv = KFold()

In [None]:
# Reload the training file in its on-disk row order (read_data shuffles its copy)
# and keep it as one array whose last column is the target.
df_train = pd.read_csv("traindata.txt", sep="   ", names=range(9), engine="python")
data_train = df_train.values

In [None]:
# Per-fold train/test MSE of plain linear regression on the raw features.
for train_idx, test_idx in cv.split(data_train):
    # Select the fold rows first, then peel the target off the last column.
    d_train, d_test = data_train[train_idx], data_train[test_idx]
    X_train, y_train = d_train[:, :-1], d_train[:, -1]
    X_test, y_test = d_test[:, :-1], d_test[:, -1]

    model = LinearRegression().fit(X_train, y_train)

    mse_train = mse(y_train, model.predict(X_train))
    mse_test = mse(y_test, model.predict(X_test))
    print(mse_train, mse_test)

In [None]:
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer

In [None]:
# Three feature transformers: degree-2 polynomial terms, element-wise sine,
# and element-wise log(x + 10).
feature_eng1 = PolynomialFeatures(degree=2)
feature_eng2 = FunctionTransformer(np.sin)
# assumes every raw feature is greater than -10 so the log stays finite — TODO confirm
feature_eng3 = FunctionTransformer(lambda x: np.log(x + 10))

# Polynomial expansion of the raw features (all columns except the target).
data_train1 = feature_eng1.fit_transform(data_train.copy()[:, :-1])
# The sine is applied to the polynomial features, not to the raw inputs.
data_train2 = feature_eng2.fit_transform(data_train1)
# data_train3 = feature_eng2.fit_transform(data_train2)

# The log transform is applied to the raw features.
data_train3 = feature_eng3.fit_transform(data_train.copy()[:, :-1])

# Stack all engineered blocks and put the target back as the last column,
# matching the layout of data_train.
data_train_ = np.hstack([
    data_train1,
    data_train2, 
    data_train3, 
    data_train[:, -1].reshape(-1, 1)])

In [None]:
# Per-fold train/test MSE of linear regression on the engineered features.
for train_idx, test_idx in cv.split(data_train_):
    # Select the fold rows first, then peel the target off the last column.
    d_train, d_test = data_train_[train_idx], data_train_[test_idx]
    X_train, y_train = d_train[:, :-1], d_train[:, -1]
    X_test, y_test = d_test[:, :-1], d_test[:, -1]

    model = LinearRegression().fit(X_train, y_train)

    mse_train = mse(y_train, model.predict(X_train))
    mse_test = mse(y_test, model.predict(X_test))
    print(mse_train, mse_test)

In [None]:
# Rows and columns of the degree-2 polynomial feature matrix.
data_train1.shape

In [None]:
# Rows and columns of the raw training array (features plus target column).
data_train.shape

In [None]:
import math

In [None]:
# Scratch check: number of ways to choose 2 items out of 5.
math.comb(5, 2)

In [None]:
# Scratch check: sum of the binomial coefficients C(3, 2), C(3, 1) and C(3, 0).
math.comb(3, 2) + math.comb(3, 1) + math.comb(3, 0)

In [None]:
# Scratch arithmetic (evaluates to 3).
3 * 3 - 3 * 2