The RANSACRegressor is a robust regression algorithm that fits a linear model to the data while ignoring outliers. It achieves this by iteratively fitting the model to random subsets of the data (inliers) and identifying the data points that are close to the fitted model (consensus set). The final model is then fit to the consensus set.

In [8]:
import numpy as np
import pandas as pd
train=pd.read_csv('IndoreHP_Train.csv')
test=pd.read_csv('IndoreHP_Test.csv')

X=train.drop(columns=['MEDV'])
y=train['MEDV']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=42)


from sklearn.preprocessing import MinMaxScaler
# Perform Min-Max scaling on the features
scaler = MinMaxScaler(feature_range=(0,1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
test=scaler.transform(test)

In [22]:
from sklearn.linear_model import RANSACRegressor, TheilSenRegressor, HuberRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [10]:
# RANSAC Regression
ransac = RANSACRegressor()
ransac_params = {'residual_threshold': np.linspace(1, 10, 10)}
ransac_grid = GridSearchCV(ransac, ransac_params, scoring='neg_mean_squared_error', cv=5)
ransac_grid.fit(X_train, y_train)
ransac_best = ransac_grid.best_estimator_
ransac_pred = ransac_best.predict(X_test)
ransac_mse = mean_squared_error(y_test, ransac_pred)
ransac_r2 = r2_score(y_test, ransac_pred)

In [11]:
# Theil-Sen Regression
theilsen = TheilSenRegressor()
theilsen_params = {'max_subpopulation': [10, 20, 30]}
theilsen_grid = GridSearchCV(theilsen, theilsen_params, scoring='neg_mean_squared_error', cv=5)
theilsen_grid.fit(X_train, y_train)
theilsen_best = theilsen_grid.best_estimator_
theilsen_pred = theilsen_best.predict(X_test)
theilsen_mse = mean_squared_error(y_test, theilsen_pred)
theilsen_r2 = r2_score(y_test, theilsen_pred)

In [12]:
huber = HuberRegressor()
huber_params = {'epsilon': np.linspace(1.1, 2.0, 5)}
huber_grid = GridSearchCV(huber, huber_params, scoring='neg_mean_squared_error', cv=5)
huber_grid.fit(X_train, y_train)
huber_best = huber_grid.best_estimator_
huber_pred = huber_best.predict(X_test)
huber_mse = mean_squared_error(y_test, huber_pred)
huber_r2 = r2_score(y_test, huber_pred)

In [13]:
# Print results
print("RANSAC Regression:")
print("Best Parameters:", ransac_best)
print("Mean Squared Error:", ransac_mse)
print("R^2 Score:", ransac_r2)
print()

print("Theil-Sen Regression:")
print("Best Parameters:", theilsen_best)
print("Mean Squared Error:", theilsen_mse)
print("R^2 Score:", theilsen_r2)
print()

print("Huber Regression:")
print("Best Parameters:", huber_best)
print("Mean Squared Error:", huber_mse)
print("R^2 Score:", huber_r2)

RANSAC Regression:
Best Parameters: RANSACRegressor(residual_threshold=9.0)
Mean Squared Error: 18.093565944292575
R^2 Score: 0.7538117687401666

Theil-Sen Regression:
Best Parameters: TheilSenRegressor(max_subpopulation=30)
Mean Squared Error: 20.659151915862896
R^2 Score: 0.7189033889088723

Huber Regression:
Best Parameters: HuberRegressor(epsilon=1.55)
Mean Squared Error: 13.810801754078668
R^2 Score: 0.812084756173266


Huber Regression works better then others, generally. Let's fine it a little bit more and train is properly on the dataset we have.

In [14]:
from skopt import BayesSearchCV

In [17]:
# Define the parameter search space
param_space = {
    'epsilon': (1.0, 10.0),
    'alpha': (1e-5, 1e-1, 'log-uniform'),
    'max_iter': (100, 1000)
}

In [18]:
# Perform hyperparameter tuning with Bayesian optimization
huber = HuberRegressor()
huber_opt = BayesSearchCV(huber, param_space, scoring='neg_mean_squared_error', n_iter=20, cv=5)
huber_opt.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [19]:
# Get the best estimator and make predictions
huber_best = huber_opt.best_estimator_
huber_pred = huber_best.predict(X_test)
huber_mse = mean_squared_error(y_test, huber_pred)
huber_r2 = r2_score(y_test, huber_pred)

In [20]:
# Print results
print("Huber Regression (with Bayesian Optimization):")
print("Best Parameters:", huber_opt.best_params_)
print("Mean Squared Error:", huber_mse)
print("R^2 Score:", huber_r2)

Huber Regression (with Bayesian Optimization):
Best Parameters: OrderedDict([('alpha', 0.0003187733198338806), ('epsilon', 1.522618862156072), ('max_iter', 522)])
Mean Squared Error: 13.832234236784991
R^2 Score: 0.8117931373168602


In [23]:
mae = mean_absolute_error(y_test, huber_pred)
print("Validation MAE:", mae)

Validation MAE: 2.9203928514224633


In [21]:
y_pred1=huber_best.predict(test)

submission=pd.read_csv('IndoreHP_Sample.csv')
submission['MEDV']=y_pred1
submission.to_csv('./submission2.csv', index=False)
submission.head()

Unnamed: 0,ID,MEDV
0,400,27.215499
1,401,16.864044
2,402,34.623084
3,403,27.964253
4,404,32.787391
