In [53]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from joblib import dump
genes_train = pd.read_csv("https://cs307.org/lab-05/data/genes-train.csv")
X_train = genes_train.drop("y", axis=1)
y_train = genes_train["y"]

In [30]:
row = genes_train.shape[0]
col = genes_train.shape[1]
row, col

(900, 751)

In [52]:
ridge = Ridge()
param_grid = {
    'alpha': [0.1, 1.0, 10.0]
}
grid_search = GridSearchCV(ridge,param_grid,cv=5, n_jobs=-1,scoring='neg_mean_squared_error', verbose = 0)
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_train)
best_param = grid_search.best_params_
rmse = np.sqrt(-grid_search.best_score_)
rmse, best_param, best_param

(0.7487071364994092, {'alpha': 10.0}, {'alpha': 10.0})

In [87]:
lasso = Lasso()
param_grid = {
    'alpha': [0.04,0.045,0.05,0.055,],
    'max_iter':[100,150, 180, 200,220,250, 300,400],
    'selection': ['random','cyclic'], ## these don't matter
    'tol': [0.1, 0.05, 0.01,0.1,0.001,0.0001],
}
grid_search0 = GridSearchCV(lasso,param_grid,cv=10, n_jobs=-1,scoring='neg_mean_squared_error', verbose = 0)
grid_search0.fit(X_train, y_train)
y_pred = grid_search0.predict(X_train)
best_param = grid_search0.best_params_
rmse = np.sqrt(-grid_search0.best_score_)
rmse, best_param, best_param

(0.36526801942259074,
 {'alpha': 0.045, 'max_iter': 150, 'selection': 'random', 'tol': 0.001},
 {'alpha': 0.045, 'max_iter': 150, 'selection': 'random', 'tol': 0.001})

In [28]:

boost = HistGradientBoostingRegressor()

param_grid = {
    "max_iter": [100,300,500,800],
    "learning_rate": [0.1,0.01,0.05],
    "max_leaf_nodes": [2],
    "max_depth": [2]
}
grid_search1 = GridSearchCV(boost, param_grid, n_jobs=-1, cv = 5,scoring='neg_root_mean_squared_error', verbose=0)
grid_search1.fit(X_train,y_train)
best_params = grid_search1.best_params_
best_score = np.sqrt(-grid_search1.best_score_)
y_pred = grid_search1.predict(X_train)
best_params, best_score

({'learning_rate': 0.1, 'max_depth': 2, 'max_iter': 500, 'max_leaf_nodes': 2},
 1.426389262614254)

In [23]:
dt = DecisionTreeRegressor(max_depth=10)
dt.fit(X_train, y_train)
preds = dt.predict(X_train)
np.sqrt(np.mean((y_train - preds) ** 2)) # Looks like the very deep single decision tree get the job done!

0.8666835020782406

In [29]:
rf = RandomForestRegressor(max_features=20)

param_grid = {
    "n_estimators": [100, 1000],
    "max_leaf_nodes": [3],
    "max_depth": [3],
    "min_samples_split" : [3]
}
grid_search2 = GridSearchCV(rf, param_grid,n_jobs=-1,cv = 5,scoring='neg_root_mean_squared_error', verbose=0)
grid_search2.fit(X_train,y_train)
best_params = grid_search2.best_params_
best_score = np.sqrt(-grid_search2.best_score_)
best_params,best_score

({'max_depth': 3,
  'max_leaf_nodes': 3,
  'min_samples_split': 3,
  'n_estimators': 1000},
 2.846460043596208)

In [88]:
dump(grid_search0,'gene-expression.joblib')

['gene-expression.joblib']