In [1]:
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
import pandas as pd
from tqdm.notebook import tqdm

from scipy.special import expit
import numpy as np
import torch

from sklearn.linear_model import (LogisticRegression, LinearRegression)
from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.metrics import classification_report
from sklearn.model_selection import ParameterGrid


from torchvision import datasets
from torchvision import transforms

# !pip install ipympl
# %matplotlib widget

import plotly.express as px



In [2]:
from sklearn.datasets import load_diabetes
# ds = load_diabetes(as_frame=True, scaled=False)
# x,y,df = ds.data, ds.target, ds.frame
# df

## Diabetes dataset

Ten baseline variables (age, sex, body mass index, average blood pressure, and six blood serum measurements) were obtained for each of n = 442 diabetes patients, together with the response of interest: a quantitative measure of disease progression one year after baseline.

All features are real-valued, and the target is also a real number.

In [3]:
# sns.pairplot(df, hue='target', height=2)

In [31]:
# Load the scaled diabetes frame and shuffle the rows once.
# The shuffle is seeded so that the positional train/test split made further
# down (and every number reported after it) is reproducible on Restart & Run All.
ds = (
    load_diabetes(scaled=True, as_frame=True).frame
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Pull the regression target out before it is dropped from the feature frame.
y = ds['target'].values.copy()
ds = ds.drop(columns=['target'])

# Constant column acting as the intercept term; it ends up as the last feature,
# so the column order is the ten original features followed by 'ones'.
ds['ones'] = 1.
x = ds.values

In [32]:
# Positional hold-out split of the already-shuffled data:
# the first 333 rows are used for training, the remaining rows for testing.
x_tr, x_ts = x[:333], x[333:]
y_tr, y_ts = y[:333], y[333:]

## Linear (SVD)

In [34]:
# Thin SVD of the training design matrix.
# np.linalg.svd returns (u, s, vh); in this notebook's naming V receives the
# left singular vectors, d the singular values and U the transposed right
# singular vectors, i.e. X = V @ diag(d) @ U.
X = x_tr
V, d, U = np.linalg.svd(X, full_matrices=False)
D = np.diag(d)

# Eigenvalues of X.T @ X are the squared singular values.
lambd = np.square(d)

# Condition number of X.T @ X. This is not the last expression of the cell,
# so its value is computed but not displayed.
lambd.max() / lambd.min()

# NOTE(review): singular values are sorted in descending order and belong to
# principal directions, not to individual features, so this pairing with the
# column names is purely positional.
list(zip(lambd, ds.columns))

[(333.004041292454, 'age'),
 (3.0885898261152747, 'sex'),
 (1.138827722894358, 'bmi'),
 (0.9140671950874255, 'bp'),
 (0.7187234334705139, 's1'),
 (0.49339777289040637, 's2'),
 (0.45919693154711405, 's3'),
 (0.42557191249660087, 's4'),
 (0.2960220789914487, 's5'),
 (0.05731014471293816, 's6'),
 (0.006871265017121119, 'ones')]

In [37]:
# Order the test samples by their true target so the curves are easy to compare.
sorted_indices = np.argsort(y_ts)
x_ts_sr = x_ts[sorted_indices]
y_ts_sr = y_ts[sorted_indices]

# Values shared by every curve: the x positions and the projection of the
# training targets onto the left singular vectors (independent of tau).
positions = list(range(len(y_ts)))
projected_targets = V.T @ y_tr

out_df = [pd.DataFrame({'y': list(y_ts_sr), 't': ['real']*len(y_ts), 'x': positions})]
for tau in [0, 1e-5, 1e-4, 1e-3, 0.01, 0.1, 1]:
    # Ridge solution through the SVD X = V diag(d) U:
    #   W = U.T @ diag(d / (d**2 + tau)) @ V.T @ y
    # The diagonal factor is applied elementwise, so no dense matrix has to be
    # built and inverted; tau = 0 gives the ordinary least-squares solution.
    shrinkage = d / (d**2 + tau)
    W = U.T @ (shrinkage * projected_targets)

    out_df.append(pd.DataFrame({'y': list(x_ts_sr@W), 't': [f'{tau=}']*len(y_ts), 'x': positions}))

px.line(pd.concat(out_df), y='y', color='t', x='x').show()

## Perceptron

In [38]:
# Pair every float32 feature row with its float32 target so the resulting
# tuples can be passed directly to torch.utils.data.DataLoader.
train_set, test_set = (
    tuple(zip(features.astype(np.float32), targets.astype(np.float32)))
    for features, targets in ((x_tr, y_tr), (x_ts, y_ts))
)

In [49]:
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error

class Perceptron(torch.nn.Module):
    """Fully connected regression network.

    The network is a stack of ``num_layers`` hidden stages, each made of
    ``Linear -> ReLU -> Dropout(p)``, followed by a final ``Linear`` layer
    registered under the name ``'regressor'``. With ``num_layers=0`` the
    model is a single linear layer.

    Args:
        input_dim: number of input features.
        num_layers: number of hidden stages.
        hidden_dim: width of every hidden stage.
        output_dim: width of the final linear layer.
        p: dropout probability used in every hidden stage.
    """

    def __init__(self, input_dim=11, num_layers=0,
                 hidden_dim=11, output_dim=1, p=0.0):
        super().__init__()

        self.layers = torch.nn.Sequential()
        in_features = input_dim

        for idx in range(num_layers):
            stage = (
                (f'layer{idx}', torch.nn.Linear(in_features, hidden_dim)),
                (f'relu{idx}', torch.nn.ReLU()),
                (f'dropout{idx}', torch.nn.Dropout(p=p)),
            )
            for name, module in stage:
                self.layers.add_module(name, module)
            # Every stage after the first one consumes the hidden width.
            in_features = hidden_dim

        self.layers.add_module('regressor',
                               torch.nn.Linear(in_features, output_dim))

    @property
    def device(self):
        """Device of the first parameter, or ``None`` if there are no parameters."""
        first_param = next(self.parameters(), None)
        return None if first_param is None else first_param.device

    def forward(self, input):
        """Apply the layer stack and squeeze the last dimension of the result."""
        stacked_output = self.layers(input)
        return stacked_output.squeeze(-1)

def testing(model, dataset):
    generator = torch.utils.data.DataLoader(dataset, batch_size=64)
    pred = []
    real = []
    for x, y in generator:
        x = x.view([-1, 11]).to(device)  # Modify view to match the new input dimension
        y = y.to(device)
        pred.extend(model(x).cpu().detach().numpy().tolist())
        real.extend(y.cpu().numpy().tolist())
    return real, pred
    return mean_squared_error(real, pred)

def trainer(model, dataset, loss_function, optimizer, epochs, batch_size=64):
    """Train ``model`` in place with mini-batch gradient steps.

    The batches are moved to the device that holds the model's parameters
    (CPU when the model has no parameters), so the function does not rely on
    any module-level ``device`` variable. The train/eval mode of ``model`` is
    left untouched; call ``model.train()`` beforehand.

    Args:
        model: ``torch.nn.Module`` returning one prediction per sample.
        dataset: anything accepted by ``torch.utils.data.DataLoader`` that
            yields ``(features, target)`` pairs.
        loss_function: callable ``loss_function(output, target)`` returning a
            scalar tensor.
        optimizer: optimizer built over ``model``'s parameters.
        epochs: number of full passes over ``dataset``.
        batch_size: number of samples per batch.

    Returns:
        None. The parameters of ``model`` are updated in place.
    """
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # One loader is enough: with shuffle=True a new permutation is drawn every
    # time the loader is iterated, i.e. once per epoch.
    generator = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    for _ in range(epochs):
        for x, y in generator:
            optimizer.zero_grad()
            # Flatten each sample to one feature vector instead of assuming
            # a fixed number of features.
            x = x.reshape(len(x), -1).to(run_device)
            y = y.to(run_device)
            loss = loss_function(model(x), y)
            loss.backward()
            optimizer.step()

# Weight initialisation, batch shuffling and dropout all draw from torch's
# global RNG; seeding it makes the reported test error repeatable between runs
# (GPU kernels may still introduce small non-deterministic differences).
torch.manual_seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Perceptron(num_layers=5, p=0.05)
model.to(device)

# Training mode keeps dropout active while fitting.
_ = model.train()
trainer(model=model,
        dataset=train_set,
        loss_function=torch.nn.MSELoss(),
        optimizer=torch.optim.Adam(model.parameters(), lr=0.01),
        epochs=500)

# Evaluation mode disables dropout before scoring the held-out samples.
_ = model.eval()
real, pred = testing(model, test_set)
print(mean_squared_error(real, pred))

# Targets and predictions in test-set order, drawn as two lines on one figure.
px.line(
    {'y': [float(i) for i in real] + [float(i) for i in pred],
     'color': ['real']*len(real) + ['pred']*len(pred),
     'x': list(range(len(real)))*2},
    x='x', y='y', color='color',
    title='Perceptron: real vs predicted target (test set)')

3625.4448638405197


In [44]:
# Reorder both series by ascending true target so that the prediction curve
# can be read against a monotone reference curve.
sorted_indices = np.argsort(np.array(real))
real, pred = (np.array(series)[sorted_indices] for series in (real, pred))

px.line(
    {
        'y': [float(value) for value in (*real, *pred)],
        'color': ['real'] * len(real) + ['pred'] * len(pred),
        'x': [*range(len(real))] * 2,
    },
    x='x', y='y', color='color',
)

## SVM

In [63]:
from sklearn.svm import SVC, SVR

# Support-vector regressor with a polynomial kernel; every other
# hyper-parameter keeps its scikit-learn default.
# Note: from here on the name `model` refers to this SVR.
model = SVR(kernel='poly')
_ = model.fit(X=x_tr, y=y_tr)

{'C': 1.0,
 'cache_size': 200,
 'coef0': 0.0,
 'degree': 3,
 'epsilon': 0.1,
 'gamma': 'scale',
 'kernel': 'poly',
 'max_iter': -1,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [62]:
# Order the test samples by their true target, as in the linear-model plot,
# so that the prediction curve is compared against a monotone reference.
sorted_indices = np.argsort(y_ts)
x_ts_sr = x_ts[sorted_indices]
y_ts_sr = y_ts[sorted_indices]

positions = list(range(len(y_ts)))

# Plot the sorted targets and the predictions for the same sorted samples
# (the sorted arrays were previously computed but never used).
out_df = [
    pd.DataFrame({'y': list(y_ts_sr), 't': ['real']*len(y_ts), 'x': positions}),
    pd.DataFrame({'y': list(model.predict(x_ts_sr)), 't': ['pred']*len(y_ts), 'x': positions}),
]

px.line(pd.concat(out_df), y='y', color='t', x='x').show()