In [7]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import pickle

In [3]:
# Read the train and test tables from their CSV files.
train_slice_df, test_slice_df = (
    pd.read_csv(csv_path)
    for csv_path in ("slice_std_train.csv", "slice_std_test.csv")
)
# Show both (rows, columns) shapes side by side as a quick sanity check.
print(*(frame.shape for frame in (train_slice_df, test_slice_df)))

(42800, 361) (10700, 361)


In [5]:
# Restore the results object saved by an earlier step (binary read).
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
with open('result.pickle', 'rb') as results_file:
    old_results = pickle.load(results_file)

In [8]:
# Separate the response column 'Y' from the remaining predictor columns.
y_train, y_test = train_slice_df['Y'], test_slice_df['Y']
X_train = train_slice_df.drop(columns='Y')
X_test = test_slice_df.drop(columns='Y')

## Ridge Model

In [9]:
# 1. Candidate ridge penalties: 100 log-spaced values from 10^1 to 10^8.
lambd_values_ridge = np.logspace(start=1, stop=8, num=100)

In [10]:
# 2. Select the ridge penalty with RidgeCV's built-in cross-validation over
#    the candidate grid.
# The per-sample CV values are never read in this notebook, so they are not
# stored: `store_cv_values` is deprecated in scikit-learn 1.5 (renamed to
# `store_cv_results`) and keeping it only costs memory.
ridge_cv = RidgeCV(alphas=lambd_values_ridge)
ridge_cv.fit(X_train, y_train)

In [11]:
# 3. Refit a plain Ridge estimator at the penalty RidgeCV selected.
best_ridge_alpha = ridge_cv.alpha_
ridge_model = Ridge(alpha=best_ridge_alpha)
ridge_model.fit(X_train, y_train)

In [12]:
# 4. Keep a handle on the fitted ridge coefficient vector; the metrics cell
#    below counts its non-zero entries as 'num_variables'.
ridge_coefficients = ridge_model.coef_

In [13]:
# 5. Ridge predictions for the held-out test predictors.
y_pred_ridge = ridge_model.predict(X=X_test)

In [15]:
# Test-set performance summary for the ridge model.
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_nonzero_mask = ridge_coefficients != 0  # True where a coefficient is non-zero

validation_ridge = {
    'AIC': None,  # no AIC is computed for this model
    'R2': r2_score(y_test, y_pred_ridge),
    'RMSE': np.sqrt(ridge_mse),
    'MAE': mean_absolute_error(y_test, y_pred_ridge),
    'num_variables': np.sum(ridge_nonzero_mask),
}
validation_ridge

{'AIC': None,
 'R2': 0.858229094231964,
 'RMSE': 8.420103292960205,
 'MAE': 6.213675468649426,
 'num_variables': 360}

## Lasso Model

In [27]:
# 6. Candidate lasso penalties: 100 log-spaced values from 10^-2 to 10^2.
lambd_values_lasso = np.logspace(start=-2, stop=2, num=100)

In [18]:
# 7. Pick the lasso penalty by 5-fold cross-validation over the candidate
#    grid, using every available core and an iteration cap of 10000.
lasso_cv = LassoCV(
    alphas=lambd_values_lasso,
    cv=5,
    max_iter=10000,
    n_jobs=-1,
)
lasso_cv.fit(X_train, y_train)

In [19]:
# 8. Refit a plain Lasso at the penalty chosen by LassoCV.
# Use the same iteration cap as the LassoCV search (10000). The library
# default is lower, and because warnings are silenced at the top of this
# notebook a non-converged fit would otherwise go unnoticed.
lasso_model = Lasso(alpha=lasso_cv.alpha_, max_iter=10000)
lasso_model.fit(X_train, y_train)

In [20]:
# 9. Keep a handle on the fitted lasso coefficient vector. Whether any
#    coefficients were zeroed out is not checked here; the metrics cell below
#    counts the non-zero entries as 'num_variables'.
lasso_coefficients = lasso_model.coef_

In [21]:
# 10. Lasso predictions for the held-out test predictors.
y_pred_lasso = lasso_model.predict(X=X_test)

In [22]:
# Test-set performance summary for the lasso model.
lasso_mse = mean_squared_error(y_test, y_pred_lasso)
lasso_nonzero_mask = lasso_coefficients != 0  # True where a coefficient is non-zero

validation_lasso = {
    'AIC': None,  # no AIC is computed for this model
    'R2': r2_score(y_test, y_pred_lasso),
    'RMSE': np.sqrt(lasso_mse),
    'MAE': mean_absolute_error(y_test, y_pred_lasso),
    'num_variables': np.sum(lasso_nonzero_mask),
}
validation_lasso

{'AIC': None,
 'R2': 0.8580481710035675,
 'RMSE': 8.425474305254362,
 'MAE': 6.2043910454752424,
 'num_variables': 325}

## Dump Answer

In [24]:
# Add the ridge and lasso summaries to the validation table carried over from
# the previously saved results, then display the combined table.
old_results['Validation'].update({
    'validation_ridge': validation_ridge,
    'validation_lasso': validation_lasso,
})
old_results['Validation']

{'validation_reduced_lm': {'AIC': 302514.48178790975,
  'R2': 0.8638882045172942,
  'RMSE': 8.450981711744108,
  'MAE': 6.249744339096156,
  'num_variables': 245},
 'validation_PCR': {'AIC': 302506.1882273548,
  'R2': 0.8638954982621725,
  'RMSE': 8.447360569858315,
  'MAE': 6.245095210025417,
  'num_variables': 242},
 'validation_ridge': {'AIC': None,
  'R2': 0.858229094231964,
  'RMSE': 8.420103292960205,
  'MAE': 6.213675468649426,
  'num_variables': 360},
 'validation_lasso': {'AIC': None,
  'R2': 0.8580481710035675,
  'RMSE': 8.425474305254362,
  'MAE': 6.2043910454752424,
  'num_variables': 325}}

In [25]:
# Bundle the combined validation table with both fitted estimators.
res = {}
res['Validation'] = old_results['Validation']
res['Ridge_Model'] = ridge_model
res['Lasso_Model'] = lasso_model

In [26]:
# Persist the bundle to a new pickle file (binary write).
with open('result1.pickle', 'wb') as out_file:
    pickle.dump(res, out_file)