<a href="https://colab.research.google.com/github/tawaqalt/arbritrary/blob/master/Tawakalitu_Yusuf_Ensemble__Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing relevant libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.svm import SVR
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

In [2]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
#using GRlivArea as explanatory variable and Salesprice as the dependent variable
X = df[['GrLivArea', 'YearBuilt']]
y = df['SalePrice']

In [4]:
X.shape, y.shape

((1460, 2), (1460,))

In [5]:
#splitting data into train and test spits
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state =0, shuffle=True)

In [6]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1168, 2), (292, 2), (1168,), (292,))

# [Problem 1] Scratch implementation of blending

# Example 1

In [7]:
#Trying out different models
#standardizing the data
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
#instantiating the logistic regression model
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print('mean squared error of Logistic regression: ', mean_squared_error(y_test, y_pred_lr))

#instantiating the SVR model
svr = SVR()
svr.fit(X_train, y_train)
y_pred_svr = svr.predict(X_test)
print('mean squared error of SVR: ', mean_squared_error(y_test, y_pred_svr))


#instantiating the DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
y_pred_dtr = dtr.predict(X_test)
print('mean squared error of DecisionTreeRegressor: ', mean_squared_error(y_test, y_pred_dtr))

mean squared error of Logistic regression:  3836195261.0
mean squared error of SVR:  7221625115.268777
mean squared error of DecisionTreeRegressor:  3170580008.480974


In [8]:
#blending of the models
#stacking the models
mean_pred = np.vstack((y_pred_lr, y_pred_svr, y_pred_dtr))

mean_pred = y_pred_lr*0.2 + y_pred_svr*0.3 + y_pred_dtr*0.5
print('mean squared error of blending: ', mean_squared_error(y_test, mean_pred))

mean squared error of blending:  2745361518.8375845


# Example 2

In [9]:
#Instantiating Lasso
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
print('mse of lasso: ', mean_squared_error(y_test, y_pred_lasso))

#instantiating the SVR
svr_n = SVR()
svr_n.fit(X_train, y_train)
y_pred_svr_n = svr_n.predict(X_test)
print('mse of SVR: ', mean_squared_error(y_test, y_pred_svr_n))

#instantiating the Ridge
ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)
print('mse of ridge: ', mean_squared_error(y_test, y_pred_ridge))

#stacking the model
mean_pred_n = np.vstack((y_pred_lasso, y_pred_svr_n, y_pred_ridge))

mean_pred = y_pred_lasso*0.4 + y_pred_svr_n*0.2 + y_pred_ridge*0.4
print('mean squared error of blending: ', mean_squared_error(y_test, mean_pred))

mse of lasso:  2942065459.3163877
mse of SVR:  7221625115.268777
mse of ridge:  2941990621.0942945
mean squared error of blending:  2959122142.8199334


# Example 3

In [10]:
# Instantiating the Decision tree
dtr_n = DecisionTreeRegressor()
dtr_n.fit(X_train, y_train)
y_pred_dtr_n = dtr_n.predict(X_test)
print('mse of DecisionTreeRegressor: ', mean_squared_error(y_test, y_pred_dtr_n))

#Instantiating the SVR
svr_nn = SVR()
svr_nn.fit(X_train, y_train)
y_pred_svr_nn = svr_nn.predict(X_test)
print('mse of SVR: ', mean_squared_error(y_test, y_pred_svr_nn))

#Instantiating the Random Forest Model
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print('mse of RandomForestRegressor: ', mean_squared_error(y_test, y_pred_rf))

mean_pred = np.vstack((y_pred_dtr_n, y_pred_svr_nn, y_pred_rf))

mean_pred = y_pred_dtr_n*0.2 + y_pred_svr_nn*0.1 + y_pred_rf*0.6
print('mean squared error of blending: ', mean_squared_error(y_test, mean_pred))

mse of DecisionTreeRegressor:  3241093707.1898785
mse of SVR:  7221625115.268777
mse of RandomForestRegressor:  1801967333.890562
mean squared error of blending:  2435781124.263607


# [Problem 2] Scratch implementation of bagging

# Using the Bagging method alone vs using with other models

In [21]:
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.tree import DecisionTreeRegressor

class Bagging(BaseEstimator, RegressorMixin):
    def __init__(self, n_estimators=10, random_state=None):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.base_models = [DecisionTreeRegressor(random_state=self.random_state) for _ in range(self.n_estimators)]

    def fit(self, X, y):
        for model in self.base_models:
            # Use iloc to access elements by integer position
            indices = np.random.choice(len(X), size=len(X), replace=True)
            X_sampled, y_sampled = X[indices], y.iloc[indices]
            model.fit(X_sampled, y_sampled)

    def predict(self, X):
        predictions = np.array([model.predict(X) for model in self.base_models])
        return np.mean(predictions, axis=0)

bagging_model = Bagging(n_estimators=10, random_state=42)
bagging_model.fit(X_train, y_train)
y_pred = bagging_model.predict(X_test)

print('mean squared error of bagging: ', mean_squared_error(y_test, y_pred))

mean squared error of bagging:  1810897775.501013


In [12]:
# Instantiate a base decision tree classifier
base_model = DecisionTreeRegressor(max_depth=5)

# Instantiate the bagging classifier
bagging_model = BaggingRegressor(base_model, n_estimators=100, random_state=42)

# Train the bagging model
bagging_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = bagging_model.predict(X_test)

# Evaluate accuracy
print('mse of the bagging model: ' , mean_squared_error(y_test, y_pred))

mse of the bagging model:  1769773666.0348601


In [31]:
from sklearn.ensemble import GradientBoostingRegressor, BaggingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Instantiate individual models
grb = GradientBoostingRegressor(n_estimators=100, max_depth=5)
xg = xgb.XGBRegressor(n_estimators=100, max_depth=5)


# Train individual models
grb.fit(X_train, y_train)
xg.fit(X_train, y_train)


# Combine predictions from individual models
grb_pred = grb.predict(X_test)
xg_pred = xg.predict(X_test)
combined_predictions = (grb_pred + xg_pred ) / 2.0


# Calculate mean squared errors
grb_mse = mean_squared_error(y_test, grb_pred)
xg_mse = mean_squared_error(y_test, xg_pred)
combined_mse = mean_squared_error(y_test, combined_predictions)


#print mean squared error
print('mse of GradientBoostingRegressor: ', grb_mse)
print('mse of XGBRegressor: ', xg_mse)
print('mse of the bagging model: ', combined_mse)

mse of GradientBoostingRegressor:  1391672215.8584366
mse of XGBRegressor:  1472362583.6215823
mse of the bagging model:  1367068146.1896343


In [14]:
bagging = BaggingRegressor()
bagging.fit(X_train, y_train)
y_pred_bagging = bagging.predict(X_test) # Remove y_test from here
print('mean squared error of bagging: ', mean_squared_error(y_test, y_pred_bagging))

mean squared error of bagging:  2217460163.742679


# this also confirms that combination of two models provides better aacuracy than one model

# [Problem 3] Scratch implementation of stacking

In [15]:
from sklearn.base import BaseEstimator, ClassifierMixin

class Stacking(BaseEstimator, ClassifierMixin):
    def __init__(self, base_models):
        self.base_models = base_models
        self.meta_model = LogisticRegression()

    def fit(self, X, y):
        # Split data into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train base models
        base_predictions = []
        for model in self.base_models:
            model.fit(X_train, y_train)
            base_pred = model.predict(X_val)
            base_predictions.append(base_pred)

        # Create meta-features (stacked predictions)
        meta_features = np.column_stack(base_predictions)

        # Train meta-model
        self.meta_model.fit(meta_features, y_val)

    def predict(self, X):
        base_predictions = []
        for model in self.base_models:
            base_pred = model.predict(X)
            base_predictions.append(base_pred)
        meta_features = np.column_stack(base_predictions)
        return self.meta_model.predict(meta_features)




# Initialize base models
base_model1 = DecisionTreeRegressor()
base_model2 = RandomForestRegressor()

# Initialize the stacking model
stacking_model = Stacking(base_models=[base_model1, base_model2])

# Fit the stacking model
stacking_model.fit(X_train, y_train)

# Make predictions using the stacked model
stacked_pred = stacking_model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, stacked_pred)
print('mse of the Stacking model:', mean_squared_error(y_test, stacked_pred))


mse of the Stacking model: 10888600451.496574


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Combining wiht other models

In [19]:
#combining with other models
from sklearn.neighbors import KNeighborsRegressor

estimators = [
    ('lasso', LassoCV()),
    ('ridge', RidgeCV()),
    ('knn', KNeighborsRegressor(n_neighbors=7))
]

final_estimator = GradientBoostingRegressor(n_estimators=25, subsample=0.5,
                                           min_samples_leaf=25, max_features=2,
                                           random_state=42)

# Pass the list of estimators to the StackingRegressor
reg = StackingRegressor(estimators=estimators, final_estimator=final_estimator)

# Train the StackingRegressor
reg.fit(X_train, y_train)

print('mse of Stacking model: ', mean_squared_error(y_test, reg.predict(X_test)))

mse of Stacking model:  2221994390.687851


from the stacking model, it can be seen that cmmbining with other models has a higher accuracy than using the model alone