In [3]:
import os
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression, Ridge, Lasso 
from sklearn.model_selection import train_test_split, cross_val_score 
from statistics import mean

In [4]:
# Loading data
path = '.'
def load_dataset():
    csv_path= os.path.join(path,'diabetes.csv')
    return pd.read_csv(csv_path)
dataset = load_dataset()

# Imputation with median strategy
from sklearn.impute import SimpleImputer
imputer_median = SimpleImputer(missing_values = 0, strategy = 'median')
dataset.iloc[:,1:6] = pd.DataFrame(imputer_median.fit_transform(dataset.values[:, 1:6]), # ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
                               columns=dataset.columns.values[1:6])

# Create features and labels 
features = dataset.drop(['Outcome'], axis=1)
labels = dataset['Outcome']

# Create training and test set 
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.25, random_state=123)

# Data up-sampling
from sklearn.utils import resample

df = pd.concat([features_train,labels_train],axis=1)
df.rename(columns={'Outcome':'balance'}, inplace=True)

# Separate majority (n=367) and minority (n=209) classes
df_majority = df[df['balance']==0] 
df_minority = df[df['balance']==1]

# Upsample samples with replacement for minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True, # sample with replacement
                                 n_samples=381, # to match majority class
                                 random_state=123) # reproducible results
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.balance.value_counts()

# re-preparing the training sets
features_train = df_upsampled.drop(['balance'], axis=1)
labels_train = df_upsampled[['balance']]
labels_train.rename(columns={'balance':'Outcome'}, inplace=True)

# Implement scaling using standardiation
from sklearn.preprocessing import StandardScaler
standardScaler = StandardScaler()
features_train_std = standardScaler.fit_transform(features_train)
features_test_std = standardScaler.transform(features_test)

#### Linear Regression model

In [5]:
# Bulding and fitting the Linear Regression model 
linearModel = LinearRegression() 
linearModel.fit(features_train_std, labels_train)
# Evaluating the Linear Regression model 
print(linearModel.score(features_test_std, labels_test))

0.3725020361537602


#### Logistic regression model

In [79]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV

C=[0.01,0.01,0.1,1,10,100,1000]
penalty=['None','l1','l2']

grid_params = {'penalty' : penalty, 'C' : C}
gs=GridSearchCV(LogisticRegression(),grid_params,verbose=1,cv=5,n_jobs=-1)
gs_results=gs.fit(features_train_std, labels_train)

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:    6.5s finished


In [80]:
gs_results.best_score_

0.7638630890952873

In [81]:
gs_results.best_params_

{'C': 0.1, 'penalty': 'l2'}

#### Ridge regression model

In [64]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
alpha=[i*0.25 for i in range(1,9)]
# C=[0.001,0.01,0.1,1,10,100,1000]
# penalty=['None','l1','l2']
grid_params = {'alpha' : alpha}
gs=GridSearchCV(Ridge(),grid_params,verbose=1,cv=5,n_jobs=-1)
gs_results=gs.fit(features_train_std, labels_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    9.4s finished


In [63]:
Ridge().get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'random_state', 'solver', 'tol'])

In [46]:
gs_results.best_score_

0.0700238653386351

In [48]:
gs_results.best_params_

{'alpha': 0.25}

#### Lasso Regression Model

In [50]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
alpha=[i*0.25 for i in range(1,9)]
grid_params = {'alpha' : alpha}
gs=GridSearchCV(Lasso(),grid_params,verbose=1,cv=5,n_jobs=-1)
gs_results=gs.fit(features_train_std, labels_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.0s finished


In [65]:
Lasso().get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'positive', 'precompute', 'random_state', 'selection', 'tol', 'warm_start'])

In [51]:
gs_results.best_score_

-5.404161009647801e-05

In [52]:
gs_results.best_estimator_

Lasso(alpha=0.25)

In [53]:
gs_results.best_params_

{'alpha': 0.25}

In [54]:
GridSearchCV.get_params(gs).keys()

dict_keys(['cv', 'error_score', 'estimator__alpha', 'estimator__copy_X', 'estimator__fit_intercept', 'estimator__max_iter', 'estimator__normalize', 'estimator__positive', 'estimator__precompute', 'estimator__random_state', 'estimator__selection', 'estimator__tol', 'estimator__warm_start', 'estimator', 'iid', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])

#### Elastic Net Model

In [56]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
alpha=[i*0.25 for i in range(1,9)]
grid_params = {'alpha' : alpha}
gs=GridSearchCV(ElasticNet(),grid_params,verbose=1,cv=5,n_jobs=-1)
gs_results=gs.fit(features_train_std, labels_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.0s finished


In [57]:
gs_results.best_score_

0.03347020938888649

In [58]:
gs_results.best_estimator_

ElasticNet(alpha=0.25)

In [59]:
gs_results.best_params_

{'alpha': 0.25}

In [60]:
GridSearchCV.get_params(gs).keys()

dict_keys(['cv', 'error_score', 'estimator__alpha', 'estimator__copy_X', 'estimator__fit_intercept', 'estimator__l1_ratio', 'estimator__max_iter', 'estimator__normalize', 'estimator__positive', 'estimator__precompute', 'estimator__random_state', 'estimator__selection', 'estimator__tol', 'estimator__warm_start', 'estimator', 'iid', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])