# High dimensional data analysis by lasso and boosting

This notebook selects variables from high-dimensional data with Lasso regression, trains a model on the selected subset, and predicts the test set with Stochastic Gradient Boosting (SGB).

# Package

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statistics
import math

from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Data - train test split

In [None]:
X_train_df = # put some data
y_train_df = # put some data
X_test_df = # put some data

# Lasso hyperparameter tuning with grid search cross validation

To choose the alpha hyperparameter of the lasso model, we use grid search with cross-validation; the variables with non-zero coefficients are then selected.

In [None]:
k = 10              # number of cross-validation folds
alpha_lower = 0.01  # smallest alpha in the grid
alpha_upper = 0.1   # largest alpha in the grid (inclusive)
alpha_step = 0.01   # spacing between candidate alphas

# Setup the parameter grid.
# np.arange with a float step may or may not include the end point depending on
# rounding, so the grid is built with linspace, which always includes both ends.
n_alphas = int(round((alpha_upper - alpha_lower) / alpha_step)) + 1
alpha_space = np.linspace(alpha_lower, alpha_upper, n_alphas)
param_grid = {'alpha': alpha_space}

# Instantiate a lasso regression
# NOTE(review): `normalize` was deprecated and later removed from Lasso in newer
# scikit-learn releases; it is kept here because replacing it with a scaling
# pipeline changes the meaning of alpha. Confirm against the installed version.
lasso = Lasso(normalize = True)

# Instantiate the GridSearchCV object (no `scoring` given, so the estimator's
# own score, R-squared, is used) and fit it to the data
lasso_cv = GridSearchCV(lasso, param_grid, cv = k)
lasso_cv.fit(X_train_df.values, y_train_df.values)

# Training-set RMSE of the best model, computed the same way as in later cells
y_pred = lasso_cv.predict(X_train_df.values)
rmse = MSE(y_train_df.values, y_pred)**(1/2)

# Predictors: GridSearchCV refits the best alpha on the whole training data by
# default, so the refit model is reused instead of fitting a second Lasso.
best_alpha = lasso_cv.best_params_['alpha']
lasso = lasso_cv.best_estimator_
lasso_coef = lasso.coef_
p = np.count_nonzero(lasso_coef)  # number of variables lasso kept

# Print the tuned parameters and score
print("Tuned lasso regression hyperparameters: {}".format(lasso_cv.best_params_))
print("Best score: {0:.2f}".format(lasso_cv.best_score_))
print("RMSE: {0:.2f}".format(rmse))
print("Number of predictors: {}".format(p))

# Variable selection

We use the variables whose coefficients have an absolute value greater than zero to train our prediction models.

In [None]:
# Table of the predictors lasso kept, paired with their coefficients
COLUMNS = X_train_df.columns
nonzero_mask = np.abs(lasso_coef) > 0  # True where the coefficient survived
var = pd.Series(COLUMNS[nonzero_mask])
coef = pd.Series(lasso_coef[nonzero_mask])
lasso_result = pd.DataFrame({'Variable': var, 'Lasso_coefficient': coef})
# Show only the first five rows
print(lasso_result.head(5))

In [None]:
# Restrict the training features to the lasso-selected variables.
# From here on COLUMNS holds the selected variable names only.
COLUMNS = np.array(lasso_result.loc[:, 'Variable'])
subset = X_train_df.loc[:, COLUMNS]
print(subset.shape)

# Train validation split

We test several boosting methods with arbitrarily chosen hyperparameters (which are cross-validated later) and decide which one to use.

In [None]:
SEED = 1    # random state reused by the split and by the models below
SIZE = 0.3  # share of the rows held out as the validation set

# Split the lasso-selected features and the target into train / validation parts
X_train, X_vali, y_train, y_vali = train_test_split(
    subset,
    y_train_df,
    test_size = SIZE,
    random_state = SEED,
)

# AdaBoost

In [None]:
# AdaBoost regressor with at most 100 boosting rounds
adb = AdaBoostRegressor(random_state = SEED, n_estimators = 100)

In [None]:
# Fit on the training split, then predict both splits
y_pred_train = adb.fit(X_train, y_train).predict(X_train)
y_pred_vali = adb.predict(X_vali)

In [None]:
# RMSE is the square root of the mean squared error, reported per split
rmse_train = MSE(y_train, y_pred_train) ** 0.5
rmse_vali = MSE(y_vali, y_pred_vali) ** 0.5
print("Training set RMSE: {:.2f}".format(rmse_train))
print("Validation set RMSE: {:.2f}".format(rmse_vali))

# Gradient boosting

In [None]:
# Gradient boosting over 300 depth-1 trees (decision stumps)
gbt = GradientBoostingRegressor(
    random_state = SEED,
    max_depth = 1,
    n_estimators = 300,
)

In [None]:
# Fit on the training split, then predict both splits
y_pred_train = gbt.fit(X_train, y_train).predict(X_train)
y_pred_vali = gbt.predict(X_vali)

In [None]:
# Train vs. validation RMSE for the gradient boosting model
rmse_train = MSE(y_train, y_pred_train) ** 0.5
rmse_vali = MSE(y_vali, y_pred_vali) ** 0.5
print("Training set RMSE: {:.2f}".format(rmse_train))
print("Validation set RMSE: {:.2f}".format(rmse_vali))

# Stochastic gradient boosting (SGB)

In [None]:
# Stochastic gradient boosting: each tree is fitted on a random share of the
# rows and considers a random share of the features
sgbt = GradientBoostingRegressor(
    random_state = SEED,
    n_estimators = 300,  # number of decision stumps
    max_depth = 1,       # depth-1 trees, i.e. decision stumps
    subsample = 0.8,     # proportion of samples used for each tree
    max_features = 0.2,  # maximum proportion of the available features to select from
)

In [None]:
# Fit on the training split, then predict both splits
y_pred_train = sgbt.fit(X_train, y_train).predict(X_train)
y_pred_vali = sgbt.predict(X_vali)

In [None]:
# Train vs. validation RMSE for the stochastic gradient boosting model
rmse_train = MSE(y_train, y_pred_train) ** 0.5
rmse_vali = MSE(y_vali, y_pred_vali) ** 0.5
print("Training set RMSE: {:.2f}".format(rmse_train))
print("Validation set RMSE: {:.2f}".format(rmse_vali))

# SGB hyperparameters tuning with grid search cross validation

We use SGB as the prediction model and tune its hyperparameters.

In [None]:
# Fresh estimator for the grid search; list every parameter it exposes
sgbt = GradientBoostingRegressor(random_state = SEED)
sgbt_all_params = sgbt.get_params()
print(sgbt_all_params)

In [None]:
# Candidate values for the grid search; max_depth is fixed at 1 so every tree
# stays a decision stump
params_sgbt = dict(
    max_depth = [1],
    subsample = [0.2, 0.5, 0.8],
    max_features = [0.2, 0.5, 0.8],
    n_estimators = [100, 300, 500],
)

In [None]:
# 10-fold cross-validated grid search scored by R-squared, using all CPU cores
grid_sgbt = GridSearchCV(sgbt, params_sgbt, scoring = 'r2', cv = 10, n_jobs = -1)

In [None]:
# Search on the training split only, so X_vali / y_vali remain unseen and can be
# used to check the best model afterwards.
grid_sgbt.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score
best_hyperparams = grid_sgbt.best_params_
print("Best hyperparameters:\n", best_hyperparams, sep = ' ')

In [None]:
# Mean cross-validated R-squared of the best parameter combination
best_CV_score = grid_sgbt.best_score_
cv_report = "Best CV R-squared: {:.2f}".format(best_CV_score)
print(cv_report)

In [None]:
# Best estimator found by the grid search
best_model = grid_sgbt.best_estimator_

# Despite its name, vali_acc holds the regressor's .score(), i.e. R-squared,
# measured on the held-out validation split
vali_acc = best_model.score(X_vali, y_vali)
vali_report = "Validation set R-squared of best model: {:.2f}".format(vali_acc)
print(vali_report)

In [None]:
# Train vs. validation RMSE of the tuned model
y_pred_train, y_pred_vali = best_model.predict(X_train), best_model.predict(X_vali)
rmse_train = MSE(y_train, y_pred_train) ** 0.5
rmse_vali = MSE(y_vali, y_pred_vali) ** 0.5
print("Training set RMSE: {:.2f}".format(rmse_train))
print("Validation set RMSE: {:.2f}".format(rmse_vali))

# Prediction of test set

In [None]:
# Re-display the cross-validated best hyperparameters before building the final model
best_hyperparams = grid_sgbt.best_params_
print("Best hyperparameters:\n", best_hyperparams, sep = ' ')

In [None]:
# Build the final model directly from the hyperparameters chosen by the grid
# search (max_depth, subsample, max_features, n_estimators) instead of typing
# them in by hand, so a re-run can never leave stale values here.
sgbt = GradientBoostingRegressor(random_state = SEED, **grid_sgbt.best_params_)

In [None]:
# Train the final model on the lasso-selected predictors (`subset`) using all
# training rows, and predict the test set restricted to the same columns.
# Fitting on the full X_train_df would bypass the variable selection and use a
# different feature set from the one the hyperparameters were tuned on.
sgbt.fit(subset, y_train_df)
X_test_subset = X_test_df[subset.columns]
y_pred_test = sgbt.predict(X_test_subset).round(1) # DREAM allows only 1 decimal point

# NOTE(review): ID_test_df is not defined in any visible cell; presumably it
# holds the test sample IDs and must be loaded together with the data.
result = pd.concat([pd.Series(ID_test_df.values), pd.Series(y_pred_test)],
                   axis = 1,
                   keys = ['SampleID', 'GA'])
result.head()