In [35]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

#from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

from sklearn.preprocessing import StandardScaler

dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

def compute_meta_feature_mean(clf, X_train, X_test, y_train, cv):
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    X_meta_test = np.zeros((len(X_test), n_classes), dtype=np.float32)
    test_index = 0
    #X_meta_test = np.empty([len(X_test), n_classes], dtype=np.float32)
    for train_fold_index, predict_fold_index in cv.split(X_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]

        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
        
        folded_test = folded_clf.predict_proba(X_test)
        #print(X_meta_test.shape)
        #print(folded_test.shape)
        X_meta_test = np.sum([folded_test, X_meta_test], axis = 0, dtype=float) 
        test_index += 1

    X_meta_test = np.divide(X_meta_test, test_index)
    return X_meta_train, X_meta_test

def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in classifiers
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

def compute_metric(clf, X_train=X_train, X_test=X_test, y_train=y_train):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

In [36]:
scaler = StandardScaler()
SX_train = scaler.fit_transform(X_train)
SX_test = scaler.transform(X_test)

In [5]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
#problem 6.6.2 meta features generation
p662_l1 = LogisticRegression(random_state = 42, penalty = 'l1', C = 0.001, solver = 'saga', multi_class = 'ovr', max_iter=2000)
p662_l2 = LogisticRegression(random_state = 42, penalty = 'l2', C = 0.001, solver = 'saga', multi_class = 'multinomial', max_iter=2000)
p662_rf = RandomForestClassifier(random_state = 42, n_estimators = 300)
p662_gb = GradientBoostingClassifier(random_state = 42, n_estimators = 200)

p662_meta_X_train, p662_meta_X_test = generate_meta_features([p662_l1, p662_l2, p662_rf, p662_gb], SX_train, SX_test, y_train, cv)

In [None]:
#problem 6.6.2 classification 
p662_meta_lr = LogisticRegression(random_state = 42, penalty = None, solver = 'lbfgs', multi_class = 'auto')
compute_metric(p662_meta_lr, p662_meta_X_train, p662_meta_X_test)

In [40]:
#problem 6.6.3 meta features generation
p663_rf = RandomForestClassifier(random_state = 42, n_estimators = 300)
p663_et = ExtraTreesClassifier(random_state = 42, n_estimators = 200)

p663_meta_X_train, p663_meta_X_test = generate_meta_features([p663_rf, p663_et], SX_train, SX_test, y_train, cv)

In [41]:
#problem 6.6.3 classification 
p663_meta_lr = LogisticRegression(random_state = 42, penalty = None, solver = 'lbfgs', multi_class = 'auto')
compute_metric(p663_meta_lr, p663_meta_X_train, p663_meta_X_test)

0.98449

In [10]:
#problem 6.6.4 meta features generation
p664_et = ExtraTreesClassifier(random_state = 42, n_estimators = 300)
p664_kn = KNeighborsClassifier()

p664_meta_X_train, p664_meta_X_test = generate_meta_features([p664_kn, p664_et], X_train, X_test, y_train, cv)

In [11]:
#problem 6.6.4 classification 
p664_meta_lr = LogisticRegression(random_state = 42, penalty = None, solver = 'lbfgs', multi_class = 'auto')
compute_metric(p664_meta_lr, p664_meta_X_train, p664_meta_X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.989904

In [12]:
#problem 6.6.5 meta features generation
p665_l1 = LogisticRegression(random_state = 42, penalty = 'l1', C = 0.001, solver = 'saga', multi_class = 'ovr', max_iter=2000)
p665_et = ExtraTreesClassifier(random_state = 42, n_estimators = 300)
p665_ab = AdaBoostClassifier(random_state = 42)
p664_kn = KNeighborsClassifier()

p665_meta_X_train, p665_meta_X_test = generate_meta_features([p665_l1, p665_et, p665_ab, p664_kn], SX_train, SX_test, y_train, cv)

In [13]:
#problem 6.6.5 classification 
p665_meta_lr = LogisticRegression(random_state = 42, penalty = None, solver = 'lbfgs', multi_class = 'auto')
compute_metric(p665_meta_lr, p665_meta_X_train, p665_meta_X_test)

0.984567

In [19]:
#problem 6.6.6 meta features generation
p666_rf = RandomForestClassifier(random_state = 42, n_estimators = 300)
p666_et = ExtraTreesClassifier(random_state = 42, n_estimators = 300)

p666_skf = StratifiedKFold(n_splits=20)
p666_meta_X_train, p666_meta_X_test = generate_meta_features([p666_rf, p666_et], SX_train, SX_test, y_train, p666_skf)

In [20]:
#problem 6.6.6 classification 
p666_meta_lr = LogisticRegression(random_state = 42, penalty = None, solver = 'lbfgs', multi_class = 'auto')
compute_metric(p666_meta_lr, p666_meta_X_train, p666_meta_X_test)

0.981069

In [22]:
#problem 6.6.8 meta features generation
p668_rf = RandomForestClassifier(random_state = 42, n_estimators = 300)
p668_et = ExtraTreesClassifier(random_state = 42, n_estimators = 300)

p668_skf = StratifiedKFold(n_splits=5)
p668_meta_X_train, p668_meta_X_test = generate_meta_features([p668_rf, p668_et], SX_train, SX_test, y_train, p668_skf)

In [23]:
#problem 6.6.8 classification 
p668_meta_lr = RandomForestClassifier(random_state = 42)
compute_metric(p668_meta_lr, p668_meta_X_train, p668_meta_X_test)

0.985118

In [24]:
#problem 6.6.9 classification 
p669_meta_lr = KNeighborsClassifier()
compute_metric(p669_meta_lr, p668_meta_X_train, p668_meta_X_test)

0.984162

In [25]:
#problem 6.6.10 classification 
p6610_meta_lr = GradientBoostingClassifier(random_state = 42)
compute_metric(p6610_meta_lr, p668_meta_X_train, p668_meta_X_test)

0.987404

In [26]:
#problem 6.6.11 meta features generation
p6611_rf = RandomForestClassifier(random_state = 42, n_estimators = 300, criterion='gini', max_depth=24)
p6611_et = ExtraTreesClassifier(random_state = 42, n_estimators = 300)

p6611_skf = StratifiedKFold(n_splits=3)
p6611_meta_X_train, p6611_meta_X_test = generate_meta_features([p6611_rf, p6611_et], SX_train, SX_test, y_train, p6611_skf)

In [27]:
#problem 6.6.11 classification 
p6611_meta_lr = ExtraTreesClassifier(random_state = 42, n_estimators = 100)
compute_metric(p6611_meta_lr, p6611_meta_X_train, p6611_meta_X_test)

0.984309

In [86]:
#problem 6.6.12
p6612_rf = RandomForestClassifier(random_state = 42, n_estimators = 300, criterion='gini', max_depth=24)
p6612_et = ExtraTreesClassifier(random_state = 42, n_estimators = 300)
p6612_lr = LogisticRegression(random_state = 42)

p6612_rf.fit(X_train, y_train)
p6612_et.fit(X_train, y_train)
p6612_lr.fit(X_train, y_train)

p6612_y_test_pred = np.zeros_like(y_test)
p6612_y_test_pred = np.sum([p6612_rf.predict(X_test), p6612_y_test_pred], axis = 0)
p6612_y_test_pred = np.sum([p6612_et.predict(X_test), p6612_y_test_pred], axis = 0)
p6612_y_test_pred = np.sum([p6612_lr.predict(X_test), p6612_y_test_pred], axis = 0)
#print(p6612_y_test_pred)
p6612_y_test_pred = np.floor_divide(p6612_y_test_pred, 3)
#print(p6612_y_test_pred)
np.round(f1_score(y_test, p6612_y_test_pred, average='macro'), 6)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.965569