# Sklearn Datasets

In [1]:
# Miscellaneous operating system interfaces
import os

# A compendium of commonly-used regular expressions.
import re

# JSON encoder and decoder
import json

# Unix style pathname pattern expansion
from glob import glob

# Basic date and time types
from datetime import date, datetime, timedelta

# Set the absolute directory path.
BASE_PATH = os.path.abspath('')
dirs = os.path.dirname(BASE_PATH)
CONFIG_PATH = os.path.join(dirs, 'configs')
DATASET_PATH = os.path.join(dirs, 'datasets')
MIDDLEWARE_PATH = os.path.join(dirs, 'middlewares')
MODEL_PATH = os.path.join(dirs, 'models')
ROUTE_PATH = os.path.join(dirs, 'routes')
TEMPLATE_PATH = os.path.join(dirs, 'templates')

In [2]:
# The implementation of import
import importlib

# A Fast, Extensible Progress Bar for Python and CLI
from tqdm import tqdm

In [3]:
# The fundamental package for scientific computing with Python.
import numpy as np

# Flexible and powerful data analysis / manipulation library for Python, providing labeled data structures similar to R data.frame objects, statistical functions, and much more
import pandas as pd

In [4]:
# Statistical data visualization using matplotlib.
import seaborn as sns

# matplotlib: plotting with Python.
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager

# Set the matplotlib color cycle using a seaborn palette.
sns.set_palette('pastel')

# plot outputs appear and be stored within the notebook.
%matplotlib inline

# A module for finding, managing, and using fonts across platforms.
if sys.platform.startswith('darwin'):
    mpl.rc('font', family='AppleGothic')
elif sys.platform.startswith('win32'):
    mpl.rc('font', family='Malgun Gothic')

mpl.rc('axes', unicode_minus=False)

In [30]:
# Machine learning in Python
from sklearn import datasets
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Computing with Python functions.
import joblib

## Datasets

In [6]:
filename = 'breast_cancer'
train_df = pd.read_csv(os.path.join(DATASET_PATH, 'sklearn', '{}_train.csv'.format(filename)))
test_df = pd.read_csv(os.path.join(DATASET_PATH, 'sklearn', '{}_test.csv'.format(filename)))
train_df.shape, test_df.shape

((426, 31), (143, 31))

## Preprocessing

In [7]:
train_df.head(1)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,25.73,17.46,174.2,2010.0,0.1149,0.2363,0.3368,0.1913,0.1956,0.06121,...,23.58,229.3,3234.0,0.153,0.5937,0.6451,0.2756,0.369,0.08815,0


In [8]:
y_train = train_df['target'].values
scaler = MinMaxScaler()
X_train = scaler.fit_transform(train_df.drop(columns='target', axis=1))
X_train.shape, y_train.shape

((426, 30), (426,))

In [9]:
test_df.head(1)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,20.26,23.03,132.4,1264.0,0.09078,0.1313,0.1465,0.08683,0.2095,0.05649,...,31.59,156.1,1750.0,0.119,0.3539,0.4098,0.1573,0.3689,0.08368,0


In [10]:
y_test = test_df['target'].values
X_test = scaler.fit_transform(test_df.drop(columns='target', axis=1))
X_test.shape, y_test.shape

((143, 30), (143,))

## LogisticRegression

In [11]:
lr = LogisticRegression()
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

## GridSearchCV

In [28]:
%%time
param_grid = {
    'C': np.arange(0, 100, 2)
}
gs = GridSearchCV(lr, param_grid=param_grid, scoring='accuracy', cv=5)
gs.fit(X_train, y_train)
print('Best Score: {:.4f}'.format(gs.best_score_))
print('Best Params: ', gs.best_params_)

Best Score: 0.9718
Best Params:  {'C': 4}
CPU times: user 14.7 s, sys: 794 ms, total: 15.5 s
Wall time: 34 s


In [29]:
%%time
minimum = gs.best_params_.get('C') - 1
maximum = gs.best_params_.get('C') + 2
param_grid = {
    'C': np.arange(minimum, maximum, 1)
}
gs = GridSearchCV(lr, param_grid=param_grid, scoring='accuracy', cv=5)
gs.fit(X_train, y_train)
print('Best Score: {:.4f}'.format(gs.best_score_))
print('Best Params: ', gs.best_params_)

Best Score: 0.9741
Best Params:  {'C': 5}
CPU times: user 520 ms, sys: 27.7 ms, total: 548 ms
Wall time: 550 ms


In [24]:
best_estimator = gs.best_estimator_
pred = best_estimator.predict(X_test)

accuracy_score(y_test, pred)

0.972027972027972

## Pipeline

## Export

In [27]:
# joblib.dump(best_lr, '../static/model/cancer_lr.pkl')