In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score

In [65]:
diabetes = load_diabetes()
data = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
target = pd.DataFrame(diabetes.target, columns=['target'])
df = pd.concat([data, target], axis=1)

In [66]:
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [67]:
df.to_csv('diabetes.csv')

In [68]:
# Splitting data into features and target
x = df.drop('target', axis=1)
y = df['target']

In [69]:
# Splitting data into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [70]:
len(x_train)

353

In [71]:
len(x_test)

89

# Linear Regression

In [72]:
#applying linear regression
model = LinearRegression()
model.fit(x_train, y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [73]:
# calculating r2 score
y_prediction = model.predict(x_test)
r2_model = r2_score(y_test, y_prediction)
print('R2 score of base model: ', r2_model)

R2 score of base model:  0.3322332173106183


# Ridge & Lasso Regression

In [74]:
from sklearn.linear_model import Ridge, Lasso

In [75]:
# applying ridge regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(x_train, y_train)
y_prediction_ridge = ridge_model.predict(x_test)
r2_model_ridge = r2_score(y_test, y_prediction_ridge)
print('R2 score of ridge model with alpha 1.0: ', r2_model_ridge)

R2 score of ridge model with alpha 1.0:  0.3409800318493462


In [76]:
# applying ridge regression
ridge_model = Ridge(alpha=0.5)
ridge_model.fit(x_train, y_train)
y_prediction_ridge = ridge_model.predict(x_test)
r2_model_ridge = r2_score(y_test, y_prediction_ridge)
print('R2 score of ridge model with alpha 0.5: ', r2_model_ridge)

R2 score of ridge model with alpha 0.5:  0.3566824234353274


In [77]:
# applying ridge regression
ridge_model = Ridge(alpha=0.1)
ridge_model.fit(x_train, y_train)
y_prediction_ridge = ridge_model.predict(x_test)
r2_model_ridge = r2_score(y_test, y_prediction_ridge)
print('R2 score of ridge model with alpha 0.1: ', r2_model_ridge)

R2 score of ridge model with alpha 0.1:  0.34230494489707397


In [78]:
# applying ridge regression
lasso_model = Lasso(alpha=1.0)
lasso_model.fit(x_train, y_train)
y_prediction_lasso = lasso_model.predict(x_test)
r2_model_lasso = r2_score(y_test, y_prediction_lasso)
print('R2 score of lasso model with alpha 1.0: ', r2_model_lasso)

R2 score of lasso model with alpha 1.0:  0.26132445466089016


In [79]:
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [80]:
# Identifying the coefficients with bad slope results
bad_features = np.where(lasso_model.coef_==0)[0]
print('Features with bad slope results:', list(x.columns[bad_features]))

Features with bad slope results: ['age', 'sex', 'bp', 's1', 's2', 's3', 's4', 's6']


In [81]:
# Dropping the coefficients with bad slope results
x_train_filtered = x_train.drop(x_train.columns[bad_features], axis=1)
x_test_filtered = x_test.drop(x_test.columns[bad_features], axis=1)

In [82]:
# Applying linear & lasso regression on FILTERED data
model_filtered_linear = LinearRegression()
model_filtered_linear.fit(x_train_filtered, y_train)

model_filtered_lasso = Lasso(alpha=0.1)
model_filtered_lasso.fit(x_train_filtered, y_train)

y_prediction_linear_filtered = model_filtered_linear.predict(x_test_filtered) 
r2_model_linear = r2_score(y_test, y_prediction_linear_filtered)
print('R2 score of new linear regression model: ', r2_model_linear)

y_prediction_lasso_filtered = model_filtered_lasso.predict(x_test_filtered)
r2_model_lasso = r2_score(y_test, y_prediction_lasso_filtered)
print('R2 score of lasso model with alpha 1.0: ', r2_model_lasso)

R2 score of new linear regression model:  0.2835404995530828
R2 score of lasso model with alpha 1.0:  0.28836135763003234


In [86]:
# Identifying the coefficients with values closer to 0
small_coefficients = np.where((lasso_model.coef_ < 0.05) & (lasso_model.coef_ >- 0.05))[0]
print('Features with very low coefficients: ', list(x.columns[small_coefficients]))

Features with very low coefficients:  ['age', 'sex', 'bp', 's1', 's2', 's3', 's4', 's6']


In [103]:
small_coefficients

array([0, 1, 3, 4, 5, 6, 7, 9])

In [104]:
# Dropping the coefficients with bad slope results
x_train_new = x_train.drop(x_train.columns[small_coefficients], axis=1)
x_test_new = x_test.drop(x_test.columns[small_coefficients], axis=1)
# Applying Lasso Regression
model_filtered_small_coefficients = Lasso(alpha=0.1)
model_filtered_small_coefficients.fit(x_train_new, y_train)

y_prediction_new = model_filtered_linear.predict(x_test_new)
r2_model_new = r2_score(y_test, y_prediction_new)
print('R2 Score of the new Lasso regression model is: ', r2_model_new)

R2 Score of the new Lasso regression model is:  0.2835404995530828


In [109]:
import itertools

best_r2_score = 0
best_combination = []

for r in range(1, len(small_coefficients) + 1):
    for combination in itertools.combinations(small_coefficients, r):
        cols_to_drop = x_train.columns[list(combination)]
        x_train_temp = x_train.drop(cols_to_drop, axis=1)
        x_test_temp = x_test.drop(cols_to_drop, axis=1)

        model = Lasso(alpha=0.1).fit(x_train_temp, y_train)
        r2_temp = r2_score(y_test, model.predict(x_test_temp))

        if r2_temp > best_r2_score:
            best_r2_score = r2_temp
            best_combination = combination

print("Best R2 score:", best_r2_score)
print("Best combination of coefficients:", list(best_combination))

Best R2 score: 0.3475341632439666
Best combination of coefficients: [0, 4, 5]
