## Clone Estimator 

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

# create a synthetic binary-classification dataset (5 features, fixed seed)
X1, y1 = make_classification(n_classes=2, n_features=5, random_state=1)

# create the estimator that is cloned further down
logistic_classifier_1 = LogisticRegression()

In [2]:
# display the estimator; its repr lists the hyperparameters it was built with
logistic_classifier_1

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [3]:
from sklearn.base import clone, is_classifier, is_regressor 

# duplicate the first classifier with the clone function
logistic_classifier_2 = clone(logistic_classifier_1)

# display the copy so its hyperparameters can be compared with the original's
logistic_classifier_2

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Identify whether a model is a classifier or a regressor

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

# create one regressor and one classifier to test the helper functions on
model_1 = LinearRegression()
model_2 = RandomForestClassifier()

In [5]:
# check whether model_2 (the random forest) is a classifier
from sklearn.base import is_classifier

is_classifier(model_2)

True

In [6]:
# check whether model_1 (the linear regression) is a regressor
from sklearn.base import is_regressor

is_regressor(model_1)

True

## make column selector

In [7]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Build a small frame with one string column and one numeric column.
data = pd.DataFrame(
    {
        "gender": ["male", "female", "female", "male"],
        "age": [23, 5, 11, 8],
    }
)

# Each selector picks its columns by dtype, so no column name is hard-coded.
ct = make_column_transformer(
    # numeric columns (here: age) are standardised
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    # object columns (here: gender) are one-hot encoded
    (OneHotEncoder(), make_column_selector(dtype_include=object)),
)

# Fit the transformer and apply it in one step, then display the result.
transformed_data = ct.fit_transform(data)

transformed_data

array([[ 1.6464639 ,  0.        ,  1.        ],
       [-0.98787834,  1.        ,  0.        ],
       [-0.10976426,  1.        ,  0.        ],
       [-0.5488213 ,  0.        ,  1.        ]])

## Plot Decision Tree

In [8]:
# import libraries
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text 
from sklearn.datasets import load_iris

# load the iris dataset
iris = load_iris()

# Fix the seed: a decision tree breaks ties between equally good splits at
# random, so without it the fitted (and plotted) tree can change between runs.
model = DecisionTreeClassifier(random_state=0)

# split into train and test sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0
)

# fit on the training split only (no prediction is made in this cell)
model.fit(X_train, y_train)

# draw the fitted tree on an explicit axes, with filled (coloured) nodes
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(model, feature_names=iris.feature_names, filled=True, ax=ax)
plt.show()

<Figure size 2000x1000 with 1 Axes>

## Fetch a dataset from OpenML

In [9]:
from sklearn.datasets import fetch_openml 

# Fetch by dataset name. The version is pinned because several active versions
# share the name "bank-marketing"; leaving it out makes fetch_openml warn and
# choose one for you. Version 1 is the dataset that data_id=1461 refers to.
bank_marketing = fetch_openml(name="bank-marketing", version=1)

# separate independent variables and target variable
x = bank_marketing.data
y = bank_marketing.target


In [10]:
# first two rows of the feature matrix
x[:2]

array([[ 5.800e+01,  4.000e+00,  1.000e+00,  2.000e+00,  0.000e+00,
         2.143e+03,  1.000e+00,  0.000e+00,  2.000e+00,  5.000e+00,
         8.000e+00,  2.610e+02,  1.000e+00, -1.000e+00,  0.000e+00,
         3.000e+00],
       [ 4.400e+01,  9.000e+00,  2.000e+00,  1.000e+00,  0.000e+00,
         2.900e+01,  1.000e+00,  0.000e+00,  2.000e+00,  5.000e+00,
         8.000e+00,  1.510e+02,  1.000e+00, -1.000e+00,  0.000e+00,
         3.000e+00]])

In [11]:
# first two target labels
y[:2]

array(['1', '1'], dtype=object)

In [12]:
# fetch the dataset by its OpenML id instead of its name (https://www.openml.org/d/1461)
bank_marketing = fetch_openml(data_id=1461)

# separate independent variables and target variable
x = bank_marketing.data
y  = bank_marketing.target

In [13]:
# first row of the feature matrix
x[:1]

array([[ 5.800e+01,  4.000e+00,  1.000e+00,  2.000e+00,  0.000e+00,
         2.143e+03,  1.000e+00,  0.000e+00,  2.000e+00,  5.000e+00,
         8.000e+00,  2.610e+02,  1.000e+00, -1.000e+00,  0.000e+00,
         3.000e+00]])

In [14]:
# target label at index 1
# NOTE(review): this is the second label, while the cell above shows the first
# row (x[:1]); y[:1] or y[0] may have been intended — confirm.
y[1]

'1'

## Learning Curve

In [15]:
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import learning_curve

# create a synthetic binary-classification dataset
X, y = make_classification(
    n_classes=2, n_features=10, n_samples=5000, random_state=1
)

# create estimator
KNN_classifier = KNeighborsClassifier()

# Score the estimator at 5 training-set sizes (fractions 0.1 .. 1.0) with
# 5-fold cross-validation. shuffle=True shuffles the training data before the
# subsets are taken, so random_state is fixed to make the scores repeatable.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=KNN_classifier,
    X=X,
    y=y,
    train_sizes=np.linspace(0.1, 1.0, 5),
    shuffle=True,
    random_state=1,
    cv=5,
)

In [16]:
# number of training samples used for each point of the learning curve
train_sizes

array([ 400, 1300, 2200, 3100, 4000])

In [17]:
# scores on the training sets: one row per training size, one column per CV fold
train_scores

array([[0.8575    , 0.8975    , 0.9       , 0.8775    , 0.8475    ],
       [0.90153846, 0.88846154, 0.88230769, 0.89153846, 0.88692308],
       [0.88863636, 0.89409091, 0.89181818, 0.88818182, 0.88636364],
       [0.8983871 , 0.89387097, 0.89032258, 0.88903226, 0.88580645],
       [0.893     , 0.894     , 0.88825   , 0.893     , 0.88625   ]])

In [18]:
# scores on the validation folds: one row per training size, one column per CV fold
test_scores

array([[0.828, 0.815, 0.835, 0.839, 0.814],
       [0.823, 0.823, 0.843, 0.848, 0.837],
       [0.833, 0.821, 0.854, 0.851, 0.843],
       [0.837, 0.825, 0.854, 0.857, 0.834],
       [0.834, 0.828, 0.849, 0.86 , 0.835]])

In [19]:
# Average over the CV folds (axis 1) to get one score per training size.
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

# Report the training means first, then the validation means.
print("Training Scores mean:{}".format(train_scores_mean))
print("Test Scores mean:{}".format(test_scores_mean))

Training Scores mean:[0.876      0.89015385 0.88981818 0.89148387 0.8909    ]
Test Scores mean:[0.8262 0.8348 0.8404 0.8414 0.8412]


## Cross Validation and Prediction

In [20]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestRegressor
# load dataset
diabetes = datasets.load_diabetes()

X = diabetes.data
y = diabetes.target

# Seed the forest: its bootstrap samples are drawn at random, so an unseeded
# model gives different cross-validated predictions on every run.
RF_regressor = RandomForestRegressor(random_state=0)

# perform 5-fold cross-validation and collect the prediction made for each
# sample while it was in the held-out fold
y_pred = cross_val_predict(estimator=RF_regressor, X=X, y=y, cv=5)

In [21]:
# show the first ten cross-validated predictions
y_pred[:10]

array([229.04,  93.12, 195.18, 171.14,  86.27, 115.27,  75.69, 155.59,
       153.06, 168.94])

## Select From Model

In [36]:
from sklearn import datasets, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# load the diabetes dataset
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# NOTE(review): LogisticRegression is a classifier, so every distinct target
# value is fitted as its own class (hence the 2-D coef_ displayed by this
# cell). The variable name suggests a regressor was intended — confirm before
# relying on the selected features.
lg_regressor = LogisticRegression()

# wrap the estimator in SelectFromModel, then fit it to find the important features
selector = SelectFromModel(estimator=lg_regressor)
selector.fit(X, y)

# show the fitted estimator's coefficients
selector.estimator_.coef_


array([[-0.01631211, -0.04448689, -0.01041713, ..., -0.03925967,
        -0.02122777, -0.03405436],
       [ 0.00188878, -0.04444519, -0.00816801, ..., -0.03918144,
        -0.06436135, -0.05463903],
       [-0.02699287, -0.04433151, -0.06285579, ..., -0.0756844 ,
        -0.05557734, -0.06683906],
       ...,
       [ 0.03415162,  0.05040128,  0.11077166, ..., -0.00292399,
         0.027618  ,  0.07302442],
       [ 0.03416799,  0.05030017,  0.12469165, ...,  0.10747183,
        -0.00019805,  0.02747969],
       [-0.04907612, -0.04462806,  0.16038187, ...,  0.0340123 ,
         0.02773604,  0.01114488]])

In [37]:
# shape of the feature matrix that was passed to the selector (uppercase X;
# lowercase x is the bank-marketing matrix from the OpenML section)
X.shape

(442, 10)

In [23]:
# show the threshold value used by the fitted selector
selector.threshold_

12.197550946960686

In [24]:
# reduce X to the selected features and show the first three rows
transformed = selector.transform(X)
transformed[:3]

array([[ 0.05068012,  0.06169621,  0.02187235, -0.04340085, -0.00259226,
         0.01990842],
       [-0.04464164, -0.05147406, -0.02632783,  0.07441156, -0.03949338,
        -0.06832974],
       [ 0.05068012,  0.04445121, -0.00567061, -0.03235593, -0.00259226,
         0.00286377]])

In [38]:
# shape after feature selection (rows, kept features)
transformed.shape

(442, 6)

## FunctionTransformer

In [25]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

# two sample rows of strictly positive integers
X = np.array(
    [
        [89, 34, 9, 1, 5, 87, 54, 22, 67, 44],
        [12, 63, 67, 2, 9, 45, 81, 54, 22, 73],
    ]
)

# wrap np.log in a FunctionTransformer so it exposes the transformer API
log_transformer = FunctionTransformer(np.log)

# apply the natural log to every element and display the result
log_transformer.transform(X)

array([[4.48863637, 3.52636052, 2.19722458, 0.        , 1.60943791,
        4.46590812, 3.98898405, 3.09104245, 4.20469262, 3.78418963],
       [2.48490665, 4.14313473, 4.20469262, 0.69314718, 2.19722458,
        3.80666249, 4.39444915, 3.98898405, 3.09104245, 4.29045944]])

## Determine the target data type

In [26]:
from sklearn.utils.multiclass import type_of_target
from sklearn.linear_model import LogisticRegression
# load the diabetes dataset
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# ask scikit-learn which kind of target this vector is
# NOTE(review): the result is 'multiclass' although this target is used for
# regression elsewhere in the notebook — presumably because its values are all
# whole numbers; confirm.
type_of_target(y)

'multiclass'

## add dummy feature

In [27]:
import numpy as np 
from sklearn.preprocessing import add_dummy_feature

# small 2x2 example matrix
p = np.array([[89,34], [12, 63]])

# prepend a constant column filled with the given value (5) to every row
add_dummy_feature(p, value=5)
 

array([[ 5., 89., 34.],
       [ 5., 12., 63.]])

## Impute Missing Values with Iterative Imputer

In [28]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer

In [29]:
# 5x5 toy dataset; np.nan marks the entries the imputer has to fill in
data = [
    [61, 22, 43, np.nan, 67],
    [np.nan, 6, 27, 8, 11],
    [83, 51, np.nan, 32, 9],
    [74, np.nan, 35, 26, 97],
    [np.nan, 4, 13, 45, 33],
]

In [30]:
# Impute missing values using the iterative imputer (seeded so the result is repeatable)
iter_imp = IterativeImputer(random_state= 42)
# fit on the data and return it with the NaN entries replaced by estimates
iter_imp.fit_transform(data)



array([[61.        , 22.        , 43.        , 27.74898065, 67.        ],
       [50.92363568,  6.        , 27.        ,  8.        , 11.        ],
       [83.        , 51.        , 28.62176528, 32.        ,  9.        ],
       [74.        , 20.72107515, 35.        , 26.        , 97.        ],
       [67.54222006,  4.        , 13.        , 45.        , 33.        ]])

## Hyperparameter Tuning Using Random Search

In [31]:
from sklearn import linear_model, datasets
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import randint

# load the iris features and labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# base model whose hyperparameters are tuned
classifier = XGBClassifier()

# Hyperparameter search space: a distribution is sampled from, a list is
# picked from.
param_dist = {
    "n_estimators": randint(50, 400),  # random integer, 50 inclusive to 400 exclusive
    "learning_rate": [0.01, 0.03, 0.05],
    "subsample": [0.5, 0.7],
    "max_depth": [3, 4, 5],
    "min_child_weight": [1, 2, 3],
}

# Randomized search: 100 sampled configurations, each scored with 5-fold
# cross-validation, run on all available cores with a fixed seed.
clf = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    random_state=1,
    n_jobs=-1,
    verbose=0,
)

# run the search; the fitted search object is kept as best_model
best_model = clf.fit(X, y)

In [32]:
# View best hyperparameters: look the parameters up once, then print each
# tuned value with its label.
best_params = best_model.best_estimator_.get_params()
for label, key in (
    ('Best n_estimator:', 'n_estimators'),
    ('Best learning_rate:', 'learning_rate'),
    ('Best subsample:', 'subsample'),
    ('Best max_depth:', 'max_depth'),
    ('Best min_child_weight:', 'min_child_weight'),
):
    print(label, best_params[key])

Best n_estimator: 259
Best learning_rate: 0.03
Best subsample: 0.5
Best max_depth: 3
Best min_child_weight: 1


## Load Text files

In [33]:
from sklearn.datasets import load_files

# Load the text files stored under the relative folder news_report/.
# presumably each sub-folder is one category (see target_names) — confirm
# against the folder layout.
news_reports = load_files(
    container_path="news_report/",
    description="News reports in 2020",
    load_content=True,  # read the file contents, not just the file names
)

In [34]:
# show the category names found by load_files
news_reports.target_names

['business', 'healthy', 'international', 'sport']

In [35]:
# specify the independent variable (loaded documents) and the target variable (category index)
X = news_reports.data
y  = news_reports.target