In [1]:
# Libraries of functions need to be imported
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cdist
from sklearn.utils.validation import check_is_fitted

## Question 1

Create your class that implements the Gradient Boosting concept, based on the locally weighted regression method (Lowess class), and that allows a user-prescribed number of boosting steps. The class you develop should have all the mainstream useful options, including “fit,” “is_fitted”,  and “predict,” methods.  Show applications with real data for regression, 10-fold cross-validations and compare the effect of different scalers, such as the “StandardScaler”, “MinMaxScaler”, and the “QuantileScaler”.  In the case of the “Concrete” data set, determine a choice of hyperparameters that yield lower MSEs for your method when compared to the eXtream Gradient Boosting library.

In [2]:
# Gaussian Kernel
def Gaussian(w):
  return np.where(w>4,0,1/(np.sqrt(2*np.pi))*np.exp(-1/2*w**2))

# Tricubic Kernel
def Tricubic(w):
  return np.where(w>1,0,70/81*(1-w**3)**3)

# Quartic Kernel
def Quartic(w):
  return np.where(w>1,0,15/16*(1-w**2)**2)

# Epanechnikov Kernel
def Epanechnikov(w):
  return np.where(w>1,0,3/4*(1-w**2))

In [3]:
class GradBoostedLowess():

  def __init__(self, kernel = Gaussian, tau = .02):
    self.tau = tau
    self.kernel = kernel

  def fit(self, x, y):
      kernel = self.kernel
      tau = self.tau
      self.xtrain_ = x
      self.yhat_ = y

  def is_fitted(self):
    if self.xtrain_ != None:
      return True
    else:
      return False

  def predict(self, x_new, boosts = 0):
      if boosts == 0:
        return self.single_predict(x_new)
      else:
        final_preds = np.zeros(x_new.shape[0])
        resids = self.yhat_
        for i in range(boosts):
          #model = GradBoostedLowess()
          #model.fit(self.xtrain_, resids)
          self.fit(self.xtrain_, resids)
          #new_preds = model.single_predict(self.xtrain_)
          new_preds = self.single_predict(self.xtrain_)
          #final_preds = final_preds + model.single_predict(x_new)
          final_preds = final_preds + self.single_predict(x_new)
          resids = resids - new_preds

        #model = GradBoostedLowess()
        #model.fit(self.xtrain_, resids)
        self.fit(self.xtrain_, resids)
        #new_preds = model.single_predict(x_new)
        new_preds = self.single_predict(x_new)
        final_preds = final_preds + new_preds
        return final_preds

  def single_predict(self, x_new):
      check_is_fitted(self)
      x = self.xtrain_
      y = self.yhat_
      lm = linear_model.Ridge(alpha=0.001)
      w = self.kernel(cdist(x, x_new, metric='euclidean')/(2*self.tau))

      if np.isscalar(x_new):
        lm.fit(np.diag(w)@(x.reshape(-1,1)),np.diag(w)@(y.reshape(-1,1)))
        yest = lm.predict([[x_new]])[0][0]
      else:
        n = len(x_new)
        yest_test = []
        #Looping through all x-points
        for i in range(n):
          lm.fit(np.diag(w[:,i])@x,np.diag(w[:,i])@y)
          yest_test.append(lm.predict([x_new[i]]))
      return np.array(yest_test).flatten()


In [4]:
data = pd.read_csv('drive/MyDrive/Adv. App. Machine Learning/concrete.csv')

In [5]:
data.head()

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [6]:
x = data.drop(columns = ['strength']).values
y = data['strength'].values

In [7]:
mse_SScale = []
mse_MMScale = []
mse_QScale = []
mse_XGBoost = []
SScale = StandardScaler()
MMScale = MinMaxScaler()
QScale = QuantileTransformer(n_quantiles = 300)
kf = KFold(n_splits = 10, shuffle = True, random_state = 7)

model_XG = XGBRegressor(objective ='reg:squarederror',n_estimators=100,reg_lambda=20,alpha=1,gamma=10,max_depth=3)
model_SS = GradBoostedLowess(tau=.3)
model_MM = GradBoostedLowess(tau=.3)
model_QS = GradBoostedLowess(tau=.3)

for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain].ravel()
  ytest = y[idxtest].ravel()
  xtest = x[idxtest]

  xtrain_S = SScale.fit_transform(xtrain)
  xtest_S = SScale.transform(xtest)
  xtrain_M = MMScale.fit_transform(xtrain)
  xtest_M = MMScale.transform(xtest)
  xtrain_Q = QScale.fit_transform(xtrain)
  xtest_Q = QScale.transform(xtest)

  model_SS.fit(xtrain_S, ytrain)
  S_pred = model_SS.predict(xtest_S, boosts = 3)
  model_MM.fit(xtrain_M, ytrain)
  M_pred = model_MM.predict(xtest_M, boosts = 3)
  model_QS.fit(xtrain_Q, ytrain)
  Q_pred = model_QS.predict(xtest_Q, boosts = 3)
  model_XG.fit(xtrain_S, ytrain)
  X_pred = model_XG.predict(xtest_S)

  mse_SScale.append(mse(ytest, S_pred))
  mse_MMScale.append(mse(ytest,M_pred))
  mse_QScale.append(mse(ytest,Q_pred))
  mse_XGBoost.append(mse(ytest,X_pred))

print('The Cross-validated Mean Squared Error for StandardScaler is: '+str(np.mean(mse_SScale)))
print('The Cross-validated Mean Squared Error for MinMaxScaler is: '+str(np.mean(mse_MMScale)))
print('The Cross-validated Mean Squared Error for QuantileTransformer is: '+str(np.mean(mse_QScale)))
print('The Cross-validated Mean Squared Error for XGBoost method: '+str(np.mean(mse_XGBoost)))

The Cross-validated Mean Squared Error for StandardScaler is: 40.28905266637976
The Cross-validated Mean Squared Error for MinMaxScaler is: 47.60993849907447
The Cross-validated Mean Squared Error for QuantileTransformer is: 27.37296985378024
The Cross-validated Mean Squared Error for XGBoost method: 21.823977435435875


In [8]:
mse_GBLow = []
mse_XGBoost = []
scale = StandardScaler()

kf = KFold(n_splits = 10, shuffle = True, random_state = 7)
x = data.drop(columns = ['strength']).values
y = data['strength'].values

model_XG = XGBRegressor(objective ='reg:squarederror',n_estimators=100,reg_lambda=20,alpha=1,gamma=10,max_depth=3)
model = GradBoostedLowess(tau=5, kernel = Gaussian)

for idxtrain, idxtest in kf.split(x):
  xtrain = x[idxtrain]
  ytrain = y[idxtrain].ravel()
  ytest = y[idxtest].ravel()
  xtest = x[idxtest]

  xtrain = scale.fit_transform(xtrain)
  xtest = scale.transform(xtest)

  model.fit(xtrain_S, ytrain)
  pred = model.predict(xtest, boosts = 10)
  model_XG.fit(xtrain_S, ytrain)
  X_pred = model_XG.predict(xtest)

  mse_GBLow.append(mse(ytest, pred))
  mse_XGBoost.append(mse(ytest,X_pred))

print('The Cross-validated Mean Squared Error for our class is: '+str(np.mean(mse_GBLow)))
print('The Cross-validated Mean Squared Error for XGBoost method: '+str(np.mean(mse_XGBoost)))

The Cross-validated Mean Squared Error for our class is: 139.94009515907564
The Cross-validated Mean Squared Error for XGBoost method: 147.83071136774723


Hooray! We found hyperparameters for our model that yield a lower MSE than the XGBoost method

## Question 2

Based on the Usearch library, create your own class that computes the k_Nearest Neighbors for Regression.

In [9]:
!pip install usearch



In [10]:
import usearch
from usearch.index import search, MetricKind, Matches, BatchMatches
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

In [11]:
class K_Nearest_Neighbors():

  def __init__(self, k):
    self.k = k

  def fit(self, x, y):
    self.x = x
    self.y = y
    self.full_set =  np.array(zip(x, y))

  def predict(self, x_new):
    predictions = []
    for row in x_new:
      neighbors = self._get_neighbors(row)
      row_preds = []
      for index in neighbors:
        row_preds.append(self.y[index])
      predictions.append(Counter(row_preds).most_common(1)[0][0])
    return predictions

  def _dist_calc(self, row):
    distances: Matches = search(self.x, row, self.x.shape[0], MetricKind.L2sq, exact=True)
    return distances

  def _get_neighbors(self, new_row):
    neighbors = []
    distances = self._dist_calc(new_row)
    for i in range(self.k):
      neighbors.append(distances.to_list()[i][0])
    return neighbors

In [12]:
data = pd.read_csv('drive/MyDrive/Adv. App. Machine Learning/mobile.csv')

In [13]:
data

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [14]:
x = data.drop(columns = ['price_range']).values
y = data['price_range'].values

In [15]:
scale = StandardScaler()

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 7)
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

model = K_Nearest_Neighbors(k=250)
model.fit(xtrain, ytrain)
pred = model.predict(xtest)
print("The accuracy is: " + str(accuracy_score(ytest, pred)))

The accuracy is: 0.72


## Question 3

Host your project on your GitHub page.

https://willcameron2002.github.io/DATA441/