# Занятие 12. Ансамблевые методы и настройка алгоритмов для повышения качества моделирования, отложенное использование модели

In [1]:
import os
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
import numpy
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from pickle import dump
from pickle import load
from joblib import dump
from joblib import load

In [2]:
# Switch the working directory to the folder holding the course data files,
# so that later cells can refer to them by bare file name.
data_dir = "C:/Users/HP/Documents/analysis/Marketing/data/"
os.chdir(data_dir)

## Ансамблевые методы

### Bagging

#### Bagged Decision Trees

In [3]:
# Load the Pima Indians diabetes data set; column names are supplied
# explicitly through `names`.
filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = read_csv(filepath_or_buffer=filename, names=names)

In [4]:
# Split the raw value matrix: the first eight columns are the input
# features, the ninth column ('class') is the target.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

In [5]:
# Cross-validation splitter shared by the evaluation cells below:
# ten shuffled folds with a fixed seed.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=7,
)

In [9]:
# Bagged Decision Trees for Classification
# Bag `num_trees` decision trees and print the mean cross-validated score
# over the folds defined by `kfold`.
cart = DecisionTreeClassifier()
num_trees = 100
seed = 7
# The base tree is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2; the positional form
# works on both old and new versions. The ensemble size keyword is
# `n_estimators`.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.7578263841421736


#### Random Forest

In [10]:
# Random Forest Classification
# Grow `num_trees` trees, each split considering `max_features` features,
# and print the mean cross-validated score over the folds in `kfold`.
num_trees = 100
max_features = 3
seed = 7
# Seed the forest so repeated runs give the same score, consistent with the
# other ensemble cells that pass random_state=seed.
model = RandomForestClassifier(n_estimators=num_trees,
                               max_features=max_features,
                               random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.7695317840054683


#### Extra Trees

In [11]:
# Extra Trees Classification
# Grow `num_trees` extremely randomized trees, each split considering
# `max_features` features, and print the mean cross-validated score.
num_trees = 100
max_features = 7
seed = 7
# Seed the ensemble so repeated runs give the same score, consistent with the
# other ensemble cells that pass random_state=seed.
model = ExtraTreesClassifier(n_estimators=num_trees,
                             max_features=max_features,
                             random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.7643198906356801


### Boosting

#### AdaBoost

In [12]:
# AdaBoost Classification
# Boost 30 estimators with a fixed seed and print the mean score over the
# shared cross-validation folds.
num_trees, seed = 30, 7
model = AdaBoostClassifier(random_state=seed, n_estimators=num_trees)
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())

0.7552802460697198


#### Stochastic Gradient Boosting (Gradient Boosting Machines)

In [13]:
# Stochastic Gradient Boosting Classification
# Fit a 100-stage gradient boosting model with a fixed seed and print the
# mean score over the shared cross-validation folds.
seed, num_trees = 7, 100
model = GradientBoostingClassifier(random_state=seed, n_estimators=num_trees)
results = cross_val_score(estimator=model, X=X, y=Y, cv=kfold)
print(results.mean())

0.7591934381408066


#### XGBoost

In [14]:
# Extreme Gradient Boosting Classification
# Fit an XGBoost classifier with explicit tree count, depth and learning
# rate, and print the mean score over the shared cross-validation folds.
seed = 7
num_trees = 100
# `random_state` is the seed parameter of XGBoost's scikit-learn wrapper
# (the older `seed` keyword is legacy); it also matches how the other
# ensemble cells are seeded.
model = XGBClassifier(random_state=seed,
                      n_estimators=num_trees,
                      max_depth=6,
                      learning_rate=0.3)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.7252563226247437


### Voting

In [16]:
# Voting Ensemble for Classification
# Build the three sub-models first, then collect them as (name, model)
# pairs in the order logistic regression, decision tree, SVM.
model1 = LogisticRegression(solver='liblinear')
model2 = DecisionTreeClassifier()
model3 = SVC(gamma='auto')
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]

In [17]:
# Combine the sub-models into a voting ensemble and print its mean score
# over the shared cross-validation folds.
ensemble = VotingClassifier(estimators=estimators)
results = cross_val_score(estimator=ensemble, X=X, y=Y, cv=kfold)
print(results.mean())

0.7526828434723172


## Настройка алгоритмов

### Grid Search Parameter Tuning

In [18]:
# Grid Search for Algorithm Tuning
# Try every candidate value of the ridge penalty `alpha` with 3-fold
# cross-validation, then report the best score and the alpha that gave it.
alphas = numpy.array([1, 0.1, 0.01, 0.001, 0.0001, 0])
param_grid = {'alpha': alphas}
model = RidgeClassifier()
grid = GridSearchCV(model, param_grid, cv=3)
grid.fit(X, Y)
print(grid.best_score_)
print(grid.best_estimator_.alpha)

0.7708333333333334
1.0


### Random Search Parameter Tuning

In [19]:
# Randomized Search for Algorithm Tuning
# Draw 100 candidate `alpha` values from scipy's default uniform
# distribution, score each with 3-fold cross-validation (seeded), then
# report the best score and the alpha that gave it.
param_grid = dict(alpha=uniform())
model = RidgeClassifier()
rsearch = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=100,
    cv=3,
    random_state=7,
)
rsearch.fit(X, Y)
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

0.7708333333333334
0.07630828937395717


## Отложенное использование модели

### Finalize Your Model with pickle

In [20]:
# Save Model Using Pickle
# Hold out 33% of the rows as a test set; the split is seeded so it can be
# reproduced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    random_state=7,
    test_size=0.33,
)

In [21]:
# Fit the model on the training split (the 67% not held out for testing)
model = LogisticRegression(solver='liblinear')
model.fit(X=X_train, y=Y_train)

LogisticRegression(solver='liblinear')

In [22]:
# save the model to disk
filename = 'finalized_model.sav'
dump(model, open(filename, 'wb'))

In [23]:
# some time later...
# load the model from disk
loaded_model = load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

0.7559055118110236


### Finalize Your Model with Joblib

In [6]:
# Save Model Using joblib
# Hold out 33% of the rows as a test set; the split is seeded so it can be
# reproduced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    random_state=7,
    test_size=0.33,
)

In [7]:
# Fit the model on the training split (the 67% not held out for testing)
model = LogisticRegression(solver='liblinear')
model.fit(X=X_train, y=Y_train)

LogisticRegression(solver='liblinear')

In [8]:
# save the model to disk
# Reference joblib explicitly: a bare `dump` is imported from both pickle and
# joblib in this file, so which one is called would depend on import order.
import joblib

filename = 'finalized_model.sav'
joblib.dump(model, filename)

['finalized_model.sav']

In [9]:
# some time later...
# load the model from disk
# Reference joblib explicitly: a bare `load` is imported from both pickle and
# joblib in this file, so which one is called would depend on import order.
import joblib

loaded_model = joblib.load(filename)
# Score the restored model on the held-out test split.
result = loaded_model.score(X_test, Y_test)
print(result)

0.7559055118110236
