In [None]:
### Ridge, Lasso

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV

In [15]:
SEED = 2021 # random seed #
num_CV = 5  # number of crossvalidation #
test_size = 0.3 # test set ratio #

df1 = pd.read_csv("/home/youho/data/cerebral_palsy/cleaned_data/cp_data_final.csv")
X = df1.iloc[:, 0:-1]
y = df1.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify = y, random_state=SEED)

alphas = np.logspace(-3, 3, 13)

In [20]:
### RIDGE ###
   
ridge = RidgeCV(alphas = alphas, cv = num_CV)
ridge.fit(X_train, y_train)

ridge_preds_train = ridge.predict(X_train)
ridge_preds_test = ridge.predict(X_test)

coeff_used = np.sum(ridge.coef_!=0)
print ("########################### RIDGE CV ")
print ("Number of crossvalidation: ", num_CV)
print ("Alpha", ridge.alpha_)
print ("Number of features used: ", coeff_used)
print ("\nCoeffs:", ridge.coef_)

ridge_performance_train = roc_auc_score(y_train, ridge_preds_train)
print ('\nTraining data: Area under the ROC curve = {}'.format(ridge_performance_train))

ridge_performance_test = roc_auc_score(y_test, ridge_preds_test)
print ('\nTest data: Area under the ROC curve = {}'.format(ridge_performance_test))

########################### RIDGE CV 
Number of crossvalidation:  5
Alpha 1000.0
Number of features used:  25

Coeffs: [ 2.18525876e-03  2.50566449e-04 -6.43046623e-04  1.01414436e-03
  8.05760118e-04  1.87290192e-03  7.18441473e-04  3.58691354e-04
  2.96435845e-03 -1.65721692e-04 -8.68267021e-04 -1.25212300e-03
 -3.82169270e-05  1.10607187e-03 -2.57546917e-03 -5.81884040e-04
 -2.90920464e-04  4.31314279e-03  6.64081448e-04  1.79288501e-03
  1.96302193e-03 -5.37565686e-05  1.17047302e-03  5.61829799e-05
 -1.56825108e-02]

Training data: Area under the ROC curve = 0.9196428571428572

Test data: Area under the ROC curve = 0.71


In [21]:
### LASSO ###
   
lasso = LassoCV(alphas = alphas, cv = num_CV, random_state=SEED)
lasso.fit(X_train, y_train)

lasso_preds_train = lasso.predict(X_train)
lasso_preds_test = lasso.predict(X_test)

coeff_used = np.sum(lasso.coef_!=0)
print ("###################### LASSO CV ")
print ("Number of crossvalidation: ", num_CV)
print ("Alpha", lasso.alpha_)
print ("Number of features used: ", coeff_used)
print ("\nCoeffs:", lasso.coef_)

lasso_performance_train = roc_auc_score(y_train, lasso_preds_train)
print ('\nTraining data: Area under the ROC curve = {}'.format(lasso_performance_train))

lasso_performance_test = roc_auc_score(y_test, lasso_preds_test)
print ('\nTest data: Area under the ROC curve = {}'.format(lasso_performance_test))

###################### LASSO CV 
Number of crossvalidation:  5
Alpha 10.0
Number of features used:  1

Coeffs: [ 0.00000000e+00  0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -6.29216775e-05 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00]

Training data: Area under the ROC curve = 0.7982142857142857

Test data: Area under the ROC curve = 0.49


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
