<a href="https://colab.research.google.com/github/vankiee/ML_for_data_analysis/blob/main/LassoRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd 
from sklearn import metrics, datasets
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pylab as plt
from matplotlib import pyplot

# Clean data

In [None]:
# Load the raw Gapminder table.
df = pd.read_csv('/content/gapminder.csv')

# Every column after the first is parsed as numeric; any value that cannot be
# parsed becomes NaN (errors='coerce') instead of raising.
numeric_columns = df.columns[1:]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Rows without a target value cannot be used for fitting, so drop them and
# renumber the index from 0.
dataset = df.dropna(subset=['lifeexpectancy']).reset_index(drop=True)
dataset.shape

(191, 16)

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

country                   0
incomeperperson          15
alcconsumption           15
armedforcesrate          27
breastcancerper100th     19
co2emissions              7
femaleemployrate         15
hivrate                  44
internetuserate          12
lifeexpectancy            0
oilperperson            129
polityscore              31
relectricperperson       57
suicideper100th          13
employrate               15
urbanrate                 3
dtype: int64

In [None]:
# Keep only the columns that are at least 80% populated: `thresh` is the
# minimum number of non-null values a column needs in order to survive, so any
# column with more than 20% missing values is dropped.
thresh = .8 * len(dataset)
dataset = dataset.dropna(axis='columns', thresh=thresh)
dataset.shape

(191, 13)

In [None]:
# Binarise the target: 1 when life expectancy is at least 76.6, otherwise 0.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time compares the 0/1 labels against 76.6 and turns every label into 0.
# Re-run the notebook from the top instead of re-running this cell on its own.
dataset['lifeexpectancy'] = (dataset['lifeexpectancy'] >= 76.6).astype('int64')

# Fit model

In [None]:
# Features are every remaining column except the country label and the target.
x = dataset.drop(columns=['country', 'lifeexpectancy'])
y = dataset['lifeexpectancy']

train_ratio = .75
validation_ratio = .15
test_ratio = .10

# The three shares must describe the whole dataset.
assert abs(train_ratio + validation_ratio + test_ratio - 1) < 1e-9

# Step 1: train is 75% of the dataset; the remaining 25% is held out.
x_train, x_holdout, y_train, y_holdout = train_test_split(x, y,
                                                          test_size=1-train_ratio,
                                                          random_state=123)

# Step 2: divide the held-out 25% into validation (15% of the initial dataset)
# and test (10% of the initial dataset).
# The same seed is passed here as in step 1: without a random_state the
# validation/test membership would change on every run.
x_val, x_test, y_val, y_test = train_test_split(x_holdout, y_holdout,
                                                test_size=test_ratio/(test_ratio + validation_ratio),
                                                random_state=123)

In [None]:
# Pre-processing and model, chained so that a single fit/predict call runs
# median imputation -> standardisation -> Lasso in that order.
# NOTE(review): the target was converted to 0/1 labels earlier in the notebook,
# so this fits a linear regression to class labels; confirm that is intended
# (an L1-penalised classifier may be a better match).
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
model = Lasso()

steps = [('imputer', imputer),
         ('scaler', scaler),
         ('model', model)]
pipeline = Pipeline(steps=steps)

# Candidate penalties 0.1, 0.2, ..., 9.9 (99 values).
# Built from an integer range and divided once, because np.arange with a
# fractional step accumulates rounding error and yields candidates such as
# 0.7000000000000001 instead of 0.7.
# NOTE(review): the recorded run selects alpha=0.1, the smallest candidate;
# consider extending the grid below 0.1.
alpha_grid = np.arange(1, 100) / 10

# 5-fold cross-validated search over alpha, scored by negative MSE
# (higher is better, i.e. closer to zero).
search = GridSearchCV(pipeline,
                      {'model__alpha': alpha_grid},
                      cv=5,
                      scoring='neg_mean_squared_error',
                      verbose=3)

In [None]:
# Run the grid search on the training split only; the validation and test
# splits are not passed in here. The fitted search object is the cell output.
search.fit(x_train, y_train)

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-0.075, total=   0.0s
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-0.064, total=   0.0s
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-0.077, total=   0.0s
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-0.058, total=   0.0s
[CV] model__alpha=0.1 ................................................
[CV] ................... model__alpha=0.1, score=-0.154, total=   0.0s
[CV] model__alpha=0.2 ................................................
[CV] ................... model__alpha=0.2, score=-0.133, total=   0.0s
[CV] model__alpha=0.2 ................................................
[CV] ..........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ................... model__alpha=0.6, score=-0.167, total=   0.0s
[CV] model__alpha=0.6 ................................................
[CV] ................... model__alpha=0.6, score=-0.183, total=   0.0s
[CV] model__alpha=0.6 ................................................
[CV] ................... model__alpha=0.6, score=-0.141, total=   0.0s
[CV] model__alpha=0.6 ................................................
[CV] ................... model__alpha=0.6, score=-0.247, total=   0.0s
[CV] model__alpha=0.7000000000000001 .................................
[CV] .... model__alpha=0.7000000000000001, score=-0.219, total=   0.0s
[CV] model__alpha=0.7000000000000001 .................................
[CV] .... model__alpha=0.7000000000000001, score=-0.167, total=   0.0s
[CV] model__alpha=0.7000000000000001 .................................
[CV] .... model__alpha=0.7000000000000001, score=-0.183, total=   0.0s
[CV] model__alpha=0.7000000000000001 .................................
[CV] .

[Parallel(n_jobs=1)]: Done 495 out of 495 | elapsed:    3.6s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('imputer',
                                        SimpleImputer(add_indicator=False,
                                                      copy=True,
                                                      fill_value=None,
                                                      missing_values=nan,
                                                      strategy='median',
                                                      verbose=0)),
                                       ('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('model',
                                        Lasso(alpha=1.0, copy_X=True,
                                              fit_intercept=True,

In [None]:
# Parameter setting that the grid search selected as best.
search.best_params_

{'model__alpha': 0.1}

In [None]:
# Pull the Lasso step out of the best pipeline found by the search and read
# its fitted coefficients (one per feature column of `x`).
best_lasso = search.best_estimator_.named_steps['model']
coefficients = best_lasso.coef_

# Magnitude only: the sign of each coefficient is discarded here. A value of
# 0.0 means the coefficient for that feature is exactly zero.
importance = np.absolute(coefficients)
{feature: weight for feature, weight in zip(x.columns, importance)}

{'alcconsumption': 0.0,
 'armedforcesrate': 0.0,
 'breastcancerper100th': 0.042712897712305645,
 'co2emissions': 0.0,
 'employrate': 0.0,
 'femaleemployrate': 0.0,
 'incomeperperson': 0.20900114197951875,
 'internetuserate': 0.0,
 'polityscore': 0.0,
 'suicideper100th': 0.0,
 'urbanrate': 0.0}