In [17]:
# Question 3a

# Fetch the diabetes dataset from OpenML (data_id 37)
from sklearn.datasets import fetch_openml
diabetes = fetch_openml(data_id=37)

# List the attributes exposed by the returned dataset object
print(dir(diabetes))

# Shapes of the feature data / target, followed by the feature / target names.
# Each (label, value) pair is printed exactly as print(label, value).
for label, value in (
    ("Data's shape: ", diabetes.data.shape),
    ("Target's shape: ", diabetes.target.shape),
    ("Feature names: ", diabetes.feature_names),
    ("Target names: ", diabetes.target_names),
):
    print(label, value)

# Textual description shipped with the dataset (heading and body on separate lines)
print("Textual description: ", diabetes.DESCR, sep="\n")

['DESCR', 'categories', 'data', 'details', 'feature_names', 'frame', 'target', 'target_names', 'url']
Data's shape:  (768, 8)
Target's shape:  (768,)
Feature names:  ['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age']
Target names:  ['class']
Textual description: 
**Author**: [Vincent Sigillito](vgs@aplcen.apl.jhu.edu)  

**Source**: [Obtained from UCI](https://archive.ics.uci.edu/ml/datasets/pima+indians+diabetes) 

**Please cite**: [UCI citation policy](https://archive.ics.uci.edu/ml/citation_policy.html)  

1. Title: Pima Indians Diabetes Database
 
 2. Sources:
    (a) Original owners: National Institute of Diabetes and Digestive and
                         Kidney Diseases
    (b) Donor of database: Vincent Sigillito (vgs@aplcen.apl.jhu.edu)
                           Research Center, RMI Group Leader
                           Applied Physics Laboratory
                           The Johns Hopkins University
                           Johns Hopkins Road
              

In [18]:
# Question 3b

import pandas as pd
import numpy as np

# target_names is a list; its first entry is the name of the target column
name = diabetes.target_names[0]

# Build a DataFrame from the feature data, then append the target values as an
# extra column carrying the target name (class)
diabetes_df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    **{name: diabetes.target}
)

# Show the DataFrame and its descriptive statistics
display(diabetes_df)
display(diabetes_df.describe())

# Note: in the OpenML dataset, diabetes.data is already stored as a DataFrame
# (this can be checked with type(diabetes.data)), so building a new DataFrame
# with pandas is not strictly necessary; it is done here only to follow the
# question requirement.

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,tested_positive
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,tested_negative
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,tested_positive
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,tested_negative
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,tested_positive
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,tested_negative
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,tested_negative
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,tested_negative
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,tested_positive


Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


#### As many libraries are used repeatedly from Question 3c onwards, I will treat each question as a separate one from now on and import the relevant libraries each time for completeness.

In [28]:
# Question 3c
# Train a logistic regression classifier on a 70/30 train/test split of the
# diabetes dataset and report its score on the held-out 30 %.
from sklearn.datasets import fetch_openml
diabetes = fetch_openml(data_id = 37)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fixed seed: train_test_split shuffles the rows, so without a random_state the
# split (and therefore the printed score) changes on every run.
SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    diabetes.data, diabetes.target, test_size=0.3, random_state=SEED
)

# The fit usually cannot converge with the default max_iter = 100, so allow 200.
# A StandardScaler + SGDClassifier(loss="log_loss") pipeline was also tried, but
# the score is more stable with LogisticRegression(max_iter=200).
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Score of the fitted classifier on the test split
score = clf.score(X_test, y_test)
print(score)

0.7662337662337663


In [20]:
# Question 3d
# 10-fold cross-validation of a Gaussian naive Bayes classifier on the
# diabetes dataset; prints the per-fold scores and their mean.
from sklearn.datasets import fetch_openml
diabetes = fetch_openml(data_id = 37)

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import shuffle

# Fixed seed so the shuffled row order, and hence the fold scores, are
# reproducible from run to run.
SEED = 42

# Shuffle features and target together before cross-validating
X, y = shuffle(diabetes.data, diabetes.target, random_state=SEED)
nb = GaussianNB()
scores = cross_val_score(nb, X, y, cv=10)
print(scores, scores.mean())

[0.80519481 0.74025974 0.81818182 0.81818182 0.68831169 0.74025974
 0.76623377 0.81818182 0.65789474 0.72368421] 0.7576384142173616


In [21]:
# Question 3e
# Reuse the combined DataFrame from Question 3b (diabetes_df, with the target
# column called `name`): group the rows by target value and count each category.
class_counts = diabetes_df.groupby(by=name)[name].count()
class_counts

class
tested_negative    500
tested_positive    268
Name: class, dtype: int64

Question 3e (Continue)

There are 268 tested_positive examples and 500 tested_negative examples.

The majority class is tested_negative, so the dummy model will always predict tested_negative.

Average accuracy = $\frac{500}{(500 + 268)}$ = 0.6510

Both the logistic regression model in part c and the naive Bayes model in part d give a score of around 0.75 (on average), which is better than this dummy model's score of 0.6510 by $\frac{0.75-0.6510}{0.6510} \times 100\% \approx 15\%$ in relative terms.

In [22]:
# Question 4a
# Load the Boston dataset; any warning raised while loading is silenced, but
# only inside the `with` block so later cells still show their warnings.
import warnings
from sklearn.datasets import load_boston

with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    boston = load_boston()

In [23]:
# Question 4a (Continue)
# The label is numerical, so use a Regressor instead of a Classifier.
# Expects `boston` to have been loaded by the previous Question 4a cell.
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

# Fixed seed: both the shuffle and AdaBoost are randomized, so without it the
# fold scores differ on every run.
SEED = 42

# As the study guide says, train_test_split will shuffle data but
# cross_val_score will not, so shuffle explicitly with shuffle()
X, y = shuffle(boston.data, boston.target, random_state=SEED)
boost = AdaBoostRegressor(random_state=SEED)

# cv is left at its default (5 folds); n_jobs=-1 runs the folds in parallel
scores = cross_val_score(boost, X, y, n_jobs=-1)
# Print the 5 scores and the mean
print(scores, scores.mean())

[0.86897933 0.84157567 0.83844111 0.82438257 0.4536    ] 0.7653957375113629


In [30]:
# Question 4b
# AdaBoost regression with SVR(C=100) as the base estimator, applied to the
# Boston features without any scaling, evaluated by cross-validation.
# Load the boston dataset and neglect warnings
import warnings
from sklearn.datasets import load_boston

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    boston = load_boston()

from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

from sklearn.svm import SVR

# Fixed seed: both the shuffle and AdaBoost are randomized, so without it the
# fold scores differ on every run.
SEED = 42

# cross_val_score does not shuffle, so shuffle features and target first
X, y = shuffle(boston.data, boston.target, random_state=SEED)
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; the keyword is kept to match the version this notebook ran on.
boost = AdaBoostRegressor(base_estimator=SVR(C=100), random_state=SEED)

# Default 5-fold cross-validation; print the fold scores and their mean
scores = cross_val_score(boost, X, y, n_jobs=-1)
print(scores, scores.mean())

[0.59374811 0.48948145 0.46494085 0.61145902 0.62458709] 0.5568433034297416


In [29]:
# Question 4c
# Same experiment as Question 4b, except that the base estimator is a pipeline
# which standardizes the features with StandardScaler before SVR(C=100).
import warnings
from sklearn.datasets import load_boston

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    boston = load_boston()

from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

from sklearn.svm import SVR

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Fixed seed: both the shuffle and AdaBoost are randomized, so without it the
# fold scores differ on every run.
SEED = 42

# cross_val_score does not shuffle, so shuffle features and target first
X, y = shuffle(boston.data, boston.target, random_state=SEED)
# The scaler lives inside the base estimator, so it is re-fitted on whatever
# data each boosting round trains on.
pipe = make_pipeline(StandardScaler(), SVR(C=100))
boost = AdaBoostRegressor(base_estimator=pipe, random_state=SEED)

# Default 5-fold cross-validation; print the fold scores and their mean
scores = cross_val_score(boost, X, y, n_jobs=-1)
print(scores, scores.mean())

[0.83286318 0.778907   0.87695733 0.84771706 0.89781902] 0.8468527173955602


Question 4c (Continue)

The mean score of part b is around 0.56, while the mean score of part c is around 0.85. The performance in part b is therefore much worse than that in part c.

The difference is that part b does not apply standardization using StandardScaler() but part c does.
The reason for the difference is that many machine learning algorithms, including SVR, are sensitive to the range of the data. These algorithms tend to work better (i.e. give a higher prediction score) when the features have small and similar data ranges.

Part c uses StandardScaler() to rescale the features to small and similar data ranges before applying SVR, which leads to a better performance of SVR and therefore of the AdaBoost model built on it.

In [26]:
# Question 4d
# Show the parameters of an AdaBoostRegressor created with no arguments
from sklearn.ensemble import AdaBoostRegressor

boost = AdaBoostRegressor()
default_params = boost.get_params()
display(default_params)

{'base_estimator': None,
 'learning_rate': 1.0,
 'loss': 'linear',
 'n_estimators': 50,
 'random_state': None}

In [45]:
# Question 4e
# Randomized hyperparameter search over AdaBoostRegressor on the Boston
# dataset: base estimator, learning rate and number of estimators.
import warnings
from sklearn.datasets import load_boston

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    boston = load_boston()

from sklearn.ensemble import AdaBoostRegressor
from sklearn.utils import shuffle

from sklearn.svm import SVR

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Fixed seed: the shuffle, AdaBoost and the random sampling of candidates are
# all randomized, so without it the best score/params differ on every run.
SEED = 42

X, y = shuffle(boston.data, boston.target, random_state=SEED)
boost = AdaBoostRegressor(random_state=SEED)

# The question did not specify whether to use the default SVR() or SVR(C=100).
# SVR() has a shorter calculation time, but its score is not as good as that of
# SVR(C=100) and it is not comparable with Question 4c, so SVR(C=100) is used.
pipe = make_pipeline(StandardScaler(), SVR(C=100))

# Candidate values: learning rates 0.1, 0.2, ..., 1.9 and 10, 20, ..., 100
# estimators. linspace + round is used instead of np.arange(0.1, 2, 0.1)
# because a float step in arange yields values such as 0.30000000000000004.
param_dist = {"base_estimator": (None, pipe),
              "learning_rate": np.linspace(0.1, 1.9, 19).round(1),
              "n_estimators": range(10, 101, 10)
             }

# Try 30 randomly drawn parameter combinations
grid = RandomizedSearchCV(boost, param_dist, n_iter = 30, n_jobs=-1,
                          random_state=SEED)
grid.fit(X, y)
display(grid.best_estimator_, grid.best_score_, grid.best_params_)

0.8772066108339814

{'n_estimators': 20,
 'learning_rate': 0.4,
 'base_estimator': Pipeline(steps=[('standardscaler', StandardScaler()), ('svr', SVR(C=100))])}