In [1]:
import os
import sys

import IPython
import ipywidgets as widgets
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Allow up to 200 characters per column when pandas displays a frame.
pd.set_option("display.max_colwidth", 200)

In [10]:
from IPython.display import HTML, display
from ipywidgets import interact, interactive
# from plotting_functions import *
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import (
    ColumnTransformer,
    make_column_transformer
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.linear_model import LogisticRegression
# from utils import *

In [16]:
# Load the abalone data set and label its columns explicitly.
# NOTE(review): `header=0` together with `names` tells pandas to treat the
# first line of the file as a header row and replace it with `names`. If the
# raw file has no header line, its first record is silently discarded and
# `header=None` should be used instead -- confirm against the file on disk.
df = pd.read_csv(
    "../../data/raw/abalone.data",
    names=[
        "Sex",
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
        "Rings",
    ],
    header=0,
)

# Add a binary classification target column: more than 11 rings is labelled
# "old", everything else "young".
df["Is old"] = np.where(df["Rings"] > 11, "old", "young")

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)

# Separate the target from the remaining columns. "Rings" is still present in
# X_train / X_test at this point.
X_train = train_df.drop(columns=["Is old"])
X_test = test_df.drop(columns=["Is old"])
y_train = train_df["Is old"]
y_test = test_df["Is old"]

In [17]:
# Column roles used by the analysis pipeline.
target = "Is old"
drop_feature = ["Rings"]
categorical_feature = ["Sex"]
numerical_features = [
    "Length",
    "Diameter",
    "Height",
    "Whole weight",
    "Shucked weight",
    "Viscera weight",
    "Shell weight",
]

# Column transformer: standardise the numeric columns, one-hot encode the
# categorical column (categories unseen during fit are ignored), and drop the
# raw ring count.
# NOTE(review): newer scikit-learn releases rename OneHotEncoder's `sparse`
# argument to `sparse_output` -- confirm against the pinned version before
# upgrading.
preprocessor = make_column_transformer(
    (StandardScaler(), numerical_features),
    (
        OneHotEncoder(handle_unknown="ignore", sparse=False),
        categorical_feature,
    ),
    ("drop", drop_feature),
)

In [18]:
# Tune the regularisation strength C of a logistic regression with a
# 5-fold cross-validated randomized search over the preprocessing + model
# pipeline.

lr = LogisticRegression(max_iter=2000)
pipe = make_pipeline(preprocessor, lr)

# Candidate values for C: 10^-3, 10^-2, ..., 10^3.
param_grid = {"logisticregression__C": 10.0 ** np.arange(-3, 4)}

# The search space is a finite list, so candidates are drawn without
# replacement and there can never be more of them than grid values. Tying
# n_iter to the grid size evaluates every value exactly once and avoids the
# "total space of parameters is smaller than n_iter" warning that a larger
# hard-coded n_iter triggers.
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_grid,
    n_jobs=-1,
    n_iter=len(param_grid["logisticregression__C"]),
    cv=5,
    random_state=123,
)

# X_train is already a DataFrame, so it is passed through as-is.
random_search.fit(X_train, y_train)



RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('standardscaler',
                                                                               StandardScaler(),
                                                                               ['Length',
                                                                                'Diameter',
                                                                                'Height',
                                                                                'Whole '
                                                                                'weight',
                                                                                'Shucked '
                                                                                'weight',
                                                                                'V

In [24]:
# View the cross-validation results as a data frame: one column per candidate,
# ordered by rank (best first), with the score, the C value and the mean fit
# time as rows.

train_results = (
    pd.DataFrame(random_search.cv_results_)[
        [
            "mean_test_score",
            "param_logisticregression__C",
            "mean_fit_time",
            "rank_test_score",
        ]
    ]
    .set_index("rank_test_score")
    .sort_index()
    .T
)

# A bare trailing expression is what the notebook actually renders; the
# assignment alone displays nothing.
train_results

In [30]:
# Pick the final model and evaluate it on the held-out test data.

# Read the winning C straight from the search object rather than by position
# from the display table, so reordering that table cannot silently select a
# different value.
best_c = random_search.best_params_["logisticregression__C"]

# Refit the preprocessing + logistic regression pipeline on the full training
# set with the selected C.
pipe_best = make_pipeline(preprocessor, LogisticRegression(C=best_c, max_iter=2000))
pipe_best.fit(X_train, y_train)

# Mean accuracy on the test set.
pipe_best.score(X_test, y_test)

0.8421052631578947