In [22]:
# For data handling
import numpy as np
import pandas as pd

# For linear models
from sklearn import linear_model
# Weak learners for Gradient Boosting
from sklearn.ensemble import RandomForestRegressor
# Scaler hyperparams
from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split as tts, KFold, GridSearchCV
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from scipy.spatial.distance import cdist
import collections
import warnings


# testing
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# For making SKlearn compliant funcs
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

# Extreme gradients
import xgboost

In [2]:
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Question 1:

## Gradient Boosting w/ Locally Weighted Regression Class

In [3]:
# Defining Kernel Types, Weight function

In [4]:
def Gaussian(x):
  return np.where(np.abs(x)>4,0,1/(np.sqrt(2*np.pi))*np.exp(-1/2*x**2))
def Tricubic(x):
  return np.where(np.abs(x)>1,0,(1-np.abs(x)**3)**3)
def Epanechnikov(x):
  return np.where(np.abs(x)>1,0,3/4*(1-np.abs(x)**2))
def Quartic(x):
  return np.where(np.abs(x)>1,0,15/16*(1-np.abs(x)**2)**2)

In [5]:
def weight_function(u,v,kern=Gaussian,tau=0.5):
    return kern(cdist(u, v, metric='euclidean')/(2*tau))

In [None]:
# LOWESS class w/o triangularization from lecture

In [10]:
# Lowess class from lecture

class Lowess:
    def __init__(self, kernel = Gaussian, tau=0.05):
        self.kernel = kernel
        self.tau = tau

    def fit(self, x, y):
        kernel = self.kernel
        tau = self.tau
        self.xtrain_ = x
        self.yhat_ = y

    def predict(self, x_new):
        check_is_fitted(self)
        x = self.xtrain_
        y = self.yhat_
        lm = linear_model.Ridge(alpha=0.001)
        w = weight_function(x,x_new,self.kernel,self.tau)

        if np.isscalar(x_new):
          lm.fit(np.diag(w)@(x.reshape(-1,1)),np.diag(w)@(y.reshape(-1,1)))
          yest = lm.predict([[x_new]])[0][0]
        else:
          n = len(x_new)
          yest_test = np.zeros(n)
          for i in range(n):
            lm.fit(np.diag(w[:,i])@x,np.diag(w[:,i])@y)
            yest_test[i] = lm.predict(x_new[i].reshape(1,-1))
        return yest_test

In [11]:
# Getting Data (concrete dataset)

In [12]:
data = pd.read_csv('/content/drive/MyDrive/ML2023/data/concrete.csv')

In [13]:
x = data.loc[:,'cement':'age'].values
y = data['strength'].values

In [14]:
# Gradient Boosting Class

In [15]:
class GradientBoosting_LOWESS:
  def __init__(self, x, y, scaler=StandardScaler(), num_steps=1, n_estimators=150, max_depth=7, tau=0.9):
    self.x_train = scaler.fit_transform(x)
    self.y_train = y
    self.fitted = False
    self._scaler = scaler
    self._num_steps = num_steps
    self._n_estimators = n_estimators
    self._max_depth = max_depth
    self._tau = tau

  def is_fitted(self):
    return self.fitted

  def fit(self, x, y):

    x_train = self.x_train
    y_train = self.y_train

    # Will hold the number of weak learners per the number of boosting steps
    self._weak_learners = []
    # Strong learner (Lowess)
    self.regressor = Lowess(kernel=Gaussian, tau=self._tau)
    self.regressor.fit(x_train, y_train)
    yhat_train = self.regressor.predict(x_train)
    residuals_regressor = y_train.flatten() - yhat_train
    # Creating the weak learners
    for i in range(self._num_steps):
      self._weak_learners.append(RandomForestRegressor(n_estimators=self._n_estimators, max_depth=self._max_depth))

    # For each weak learner, train it on the residuals of the previous model
    cur_residuals = residuals_regressor
    for weak_learner in self._weak_learners:
      # Train the current weak learner on the current residuals
      weak_learner.fit(x_train, cur_residuals)
      # Get the new residuals to train the next weak learner on
      cur_yhat_train = weak_learner.predict(x_train)
      cur_residuals = cur_residuals - cur_yhat_train

  def predict(self, x_new):
    # Transform new x w/ scaler
    x_new = self._scaler.fit_transform(x_new)
    # Get strong model predictions
    yhat_regressor = self.regressor.predict(x_new)

    # For each weak learner, append to their predictions to the result
    weak_residuals = []
    for weak_learner in self._weak_learners:
      weak_residuals.append(weak_learner.predict(x_new))

    prediction = yhat_regressor
    for item in weak_residuals:
      prediction += item


    return prediction

## Showing results of Xtreme Boosting

In [17]:
mse_xtreme = []
scaler = StandardScaler()
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = x[idxtest]
  xtrain = scaler.fit_transform(xtrain)
  xtest = scaler.transform(xtest)

  model_xgboost = xgboost.XGBRFRegressor(n_estimators=200,max_depth=7)

  model_xgboost.fit(xtrain, ytrain)

  mse_xtreme.append(mse(ytest,model_xgboost.predict(xtest)))

print('The Cross-validated Mean Squared Error for Extreme Gradient Boosting is : '+str(np.mean(mse_xtreme)))

The Cross-validated Mean Squared Error for Extreme Gradient Boosting is : 31.453653856769517


## Showing results of my tuned Gradient Booster Class

I found these parameters after doing KFold tuning with the max_depth, number of steps, number of estimators, scaler type (and number of quantiles for Quantile Transformer) and tau value.

In [18]:
mse_gd = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)


hyperparams = dict()
mse_gd = []
for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain]
  ytest = y[idxtest]
  xtest = x[idxtest]

  model = GradientBoosting_LOWESS(xtrain, ytrain, max_depth=7, num_steps=3, n_estimators=250, scaler=QuantileTransformer(n_quantiles=38), tau=0.2)

  model.fit(xtrain, ytrain)
  output = model.predict(xtest)

  mse_gd.append(mse(ytest,output))

print("MSE for GradientBoosting")
print(np.mean(mse_gd))

MSE for GradientBoosting
31.012359652900688


## Examples of Hyperparam Tuning

This is a summary of the tuning I did. It started getting messy so this is a cleaned up version demonstrating the effects of the different hyperparameters.

### Effect Of Different scaler types

In [19]:
mse_gd = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

scalers = [MinMaxScaler(),StandardScaler(),QuantileTransformer(n_quantiles=39)]

hyperparams = dict()
for scaler in scalers:
    mse_gd = []
    for idxtrain, idxtest in kf.split(x):
      xtrain = x[idxtrain]
      ytrain = y[idxtrain]
      ytest = y[idxtest]
      xtest = x[idxtest]

      model = GradientBoosting_LOWESS(xtrain, ytrain, num_steps=3, n_estimators=250, scaler=scaler, tau=0.2)

      model.fit(xtrain, ytrain)
      output = model.predict(xtest)

      mse_gd.append(mse(ytest,output))

    print("Scaler: " + str(scaler))
    print("Average MSE across KFolds: ", np.mean(mse_gd))

Scaler: MinMaxScaler()
Average MSE across KFolds:  44.18563060176424
Scaler: StandardScaler()
Average MSE across KFolds:  133.7455194065987
Scaler: QuantileTransformer(n_quantiles=39)
Average MSE across KFolds:  31.26649405991544


### Effect of different numbers of quantiles

In [23]:
mse_gd = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

quantiles = [10,20,30,40,50,60,70,80,90,100]

hyperparams = dict()
for quantile in quantiles:
    mse_gd = []
    for idxtrain, idxtest in kf.split(x):
      xtrain = x[idxtrain]
      ytrain = y[idxtrain]
      ytest = y[idxtest]
      xtest = x[idxtest]

      model = GradientBoosting_LOWESS(xtrain, ytrain, num_steps=3, scaler=QuantileTransformer(n_quantiles=quantile), tau=0.2)

      model.fit(xtrain, ytrain)
      output = model.predict(xtest)

      mse_gd.append(mse(ytest,output))

    print("Quantile #: " + str(quantile))
    print("Average MSE across KFolds: ", np.mean(mse_gd))

Quantile #: 10
Average MSE across KFolds:  31.895061550653175
Quantile #: 20
Average MSE across KFolds:  31.367150675377985
Quantile #: 30
Average MSE across KFolds:  31.1919539540773
Quantile #: 40
Average MSE across KFolds:  31.978771450104052
Quantile #: 50
Average MSE across KFolds:  31.780451181835712
Quantile #: 60
Average MSE across KFolds:  31.26475397016718
Quantile #: 70
Average MSE across KFolds:  31.539284454380144
Quantile #: 80
Average MSE across KFolds:  32.206149644466294
Quantile #: 90
Average MSE across KFolds:  31.16484862685557
Quantile #: 100
Average MSE across KFolds:  31.516660572901692


### Effect of different taus

In [24]:
mse_gd = []
kf = KFold(n_splits=10,shuffle=True,random_state=1234)

taus = [.1,.2,.3,.4,.5,.6,.7,.8,.9, 1.0]
hyperparams = dict()
for tau in tqdm(taus):
    mse_gd = []
    for idxtrain, idxtest in kf.split(x):
      xtrain = x[idxtrain]
      ytrain = y[idxtrain]
      ytest = y[idxtest]
      xtest = x[idxtest]

      model = GradientBoosting_LOWESS(xtrain, ytrain, max_depth=7, num_steps=3, n_estimators=250, scaler=QuantileTransformer(n_quantiles=38), tau=tau)

      model.fit(xtrain, ytrain)
      output = model.predict(xtest)

      mse_gd.append(mse(ytest,output))

    print("Tau value: " + str(tau))
    print("Average MSE across KFolds: ", np.mean(mse_gd))

 10%|█         | 1/10 [01:10<10:37, 70.78s/it]

Tau value: 0.1
Average MSE across KFolds:  35.36241348359499


 20%|██        | 2/10 [02:22<09:29, 71.15s/it]

Tau value: 0.2
Average MSE across KFolds:  31.309926744857552


 30%|███       | 3/10 [03:34<08:23, 71.87s/it]

Tau value: 0.3
Average MSE across KFolds:  33.759599519821954


 40%|████      | 4/10 [04:46<07:09, 71.63s/it]

Tau value: 0.4
Average MSE across KFolds:  33.46264606610201


 50%|█████     | 5/10 [05:58<05:59, 71.96s/it]

Tau value: 0.5
Average MSE across KFolds:  33.10789228951623


 60%|██████    | 6/10 [07:09<04:46, 71.56s/it]

Tau value: 0.6
Average MSE across KFolds:  32.75381779337831


 70%|███████   | 7/10 [08:20<03:34, 71.35s/it]

Tau value: 0.7
Average MSE across KFolds:  32.48706474716718


 80%|████████  | 8/10 [09:32<02:23, 71.56s/it]

Tau value: 0.8
Average MSE across KFolds:  32.46664821219609


 90%|█████████ | 9/10 [10:42<01:11, 71.14s/it]

Tau value: 0.9
Average MSE across KFolds:  32.48652291387075


100%|██████████| 10/10 [11:52<00:00, 71.21s/it]

Tau value: 1.0
Average MSE across KFolds:  32.42022146276115





# Question 2:

## Example of KNN Regression Class w/ uSearch

In [27]:
from usearch.index import search, MetricKind, Matches, BatchMatches

In [26]:
!pip install usearch

Collecting usearch
  Downloading usearch-2.9.0-cp310-cp310-manylinux_2_28_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: usearch
Successfully installed usearch-2.9.0


In [29]:
class KNN_Regression:
  def __init__(self, k=2):
    self._k = k

  def fit (self, x, y):
    self.xtrain_ = x
    self.ytrain_ = y

  def predict(self, x_new):
    check_is_fitted(self)
    x = self.xtrain_
    y = self.ytrain_

    # For each observation in x_new, find its closest neighbors via uSearch Matches
    # Subset the training data to only the n closest neighbors (number of neighbors)
    # Train a Ridge model on this subset of data
    # The prediction for this observation is the prediction of this Ridge model
    if np.isscalar(x_new):
      lm = linear_model.Ridge(alpha=0.02)
      one_in_many: Matches = search(xtrain, x_new, self._k, MetricKind.L2sq, exact=True)
      nearest_x = xtrain[one_in_many.keys]
      nearest_y = ytrain[one_in_many.keys]
      lm.fit(nearest_x, nearest_y)
      return lm.predict(x_new.reshape(1,-1))
    else:
      y_est = []
      for vector in x_new:
        lm = linear_model.Ridge(alpha=0.02)
        one_in_many: Matches = search(xtrain, vector, self._k, MetricKind.L2sq, exact=True)
        nearest_x = x[one_in_many.keys]
        nearest_y = y[one_in_many.keys]
        lm.fit(nearest_x, nearest_y)
        pred = lm.predict(vector.reshape(1,-1))
        y_est.append(pred)
      return y_est


In [33]:
scaler = QuantileTransformer(n_quantiles=30)
kf = KFold(n_splits=10,shuffle=True,random_state=1234)
k_vals = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]

for k_val in k_vals:
  mse_k = []
  for idxtrain, idxtest in kf.split(x):
    xtrain = x[idxtrain]
    ytrain = y[idxtrain]
    ytest = y[idxtest]
    xtest = x[idxtest]
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.transform(xtest)

    model = KNN_Regression(k=k_val)
    model.fit(xtrain, ytrain)
    output = model.predict(xtest)
    mse_k.append(mse(ytest,output))

  print("The current number of neighbors is ", k_val)
  print("Average MSE across KFolds: ", np.mean(mse_k))

The current number of neighbors is  1
Average MSE across KFolds:  45.03599980582525
The current number of neighbors is  2
Average MSE across KFolds:  32.241090652921365
The current number of neighbors is  3
Average MSE across KFolds:  27.24198118112987
The current number of neighbors is  4
Average MSE across KFolds:  26.365188572902678
The current number of neighbors is  5
Average MSE across KFolds:  24.848138331403526
The current number of neighbors is  6
Average MSE across KFolds:  23.903780929707043
The current number of neighbors is  7
Average MSE across KFolds:  23.95452595592333
The current number of neighbors is  8
Average MSE across KFolds:  23.529809835269877
The current number of neighbors is  9
Average MSE across KFolds:  23.3055924114512
The current number of neighbors is  10
Average MSE across KFolds:  23.2065687892969
The current number of neighbors is  11
Average MSE across KFolds:  23.511165788647208
The current number of neighbors is  12
Average MSE across KFolds:  23.