In [193]:
from sklearn.ensemble import VotingClassifier, VotingRegressor, BaggingClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold, RandomizedSearchCV 
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, roc_auc_score, r2_score , log_loss
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVC
from ISLP import load_data
from sklearn.ensemble import RandomForestClassifier
import warnings

In [28]:
import pandas as pd
import numpy as np

from sklearn.ensemble import StackingClassifier

# Stack ensembling
- final estimator should be the best performer because other estimators are like feature extractors.

In [42]:
cancer = pd.read_csv("Cases/Wisconsin/BreastCancer.csv", index_col = 0)
cancer

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant
95719,6,10,10,10,8,10,7,10,7,Malignant
128059,1,1,1,1,2,5,5,1,1,Benign
...,...,...,...,...,...,...,...,...,...,...
1369821,10,10,10,10,5,10,10,10,7,Malignant
1371026,5,10,10,10,4,10,5,6,3,Malignant
1371920,5,1,1,1,2,1,3,2,1,Benign
8233704,4,1,1,1,1,1,2,1,1,Benign


In [44]:
X = cancer.drop('Class', axis =1)
y = cancer.Class

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 24, test_size = .3, stratify = y)

In [48]:
knn = KNeighborsClassifier()
nb = GaussianNB()
dtc = DecisionTreeClassifier(random_state = 24)
svm = SVC(probability = True, random_state = 24)
lr = LogisticRegression(random_state = 24)
stack = StackingClassifier(estimators = [('KNN', knn), ('NB', nb), ('TREE', dtc), ('SVM', svm)], final_estimator = lr)

In [50]:
stack.fit(X_train, y_train)

In [51]:
y_pred = stack.predict(X_test)
accuracy_score(y_test, y_pred)

0.9714285714285714

In [61]:
y_pred_proba = stack.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_proba)

0.990841384863124

### doin it manually

In [133]:
knn.fit(X_train, y_train)
y0 = knn.predict(X_train)
x0 = knn.predict(X_test)
nb.fit(X_train, y_train)
y1 = nb.predict(X_train)
x1 = nb.predict(X_test)
dtc.fit(X_train, y_train)
y2 = dtc.predict(X_train)
x2 = dtc.predict(X_test)
svm.fit(X_train, y_train)
y3 = svm.predict(X_train)
x3 = svm.predict(X_test)

In [135]:
x0 = np.concatenate([x0[:, np.newaxis], x1[:, np.newaxis], x2[:, np.newaxis], x3[:, np.newaxis]], axis=1)

In [137]:
y0 = np.concatenate([y0[:, np.newaxis], y1[:, np.newaxis], y2[:, np.newaxis], y3[:, np.newaxis]], axis=1)

In [145]:
y0.shape

(489, 4)

In [141]:
x0.shape

(210, 4)

In [143]:
y0 = np.where(y0 == 'Malignant', 1, 0)

In [147]:
y0


array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       ...,
       [0, 0, 0, 0],
       [1, 1, 1, 1],
       [0, 0, 0, 0]])

In [149]:
lr.fit(y0, y_train)

In [151]:
x0 = np.where(x0 == 'Malignant', 1, 0)

In [153]:
y0_pred = lr.predict(x0)

In [157]:
accuracy_score(y_test, y0_pred)

0.9571428571428572

# using stacking classifier `passthrough`

In [162]:
knn = KNeighborsClassifier()
nb = GaussianNB()
dtc = DecisionTreeClassifier(random_state = 24)
svm = SVC(probability = True, random_state = 24)
lr = LogisticRegression(random_state = 24)
stack = StackingClassifier(estimators = [('KNN', knn), ('NB', nb), ('TREE', dtc), ('SVM', svm)], final_estimator = lr, passthrough = True)

In [164]:
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
accuracy_score(y_test, y_pred)

0.9714285714285714

In [166]:
y_pred_proba = stack.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_proba)

0.9959742351046699

## doin g it on other dataset

In [182]:
glass= pd.read_csv("Cases/Glass Identification/Glass.csv")
glass.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,building_windows_float_processed
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,building_windows_float_processed
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,building_windows_float_processed
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,building_windows_float_processed
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,building_windows_float_processed


In [175]:
X = glass.drop('Type', axis = 1)
y = glass.Type

In [239]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 24, stratify = y)

- with passthrough

In [242]:
rf= RandomForestClassifier(random_state=24)
stack = StackingClassifier(estimators = [('KNN', knn), ('NB', nb), ('TREE', dtc), ('SVM', svm)], final_estimator = rf, passthrough = True)
stack.fit(X_train, y_train)

In [244]:
y_pred = stack.predict(X_test)
print('Accuracy Score: ',accuracy_score(y_test, y_pred))
y_pred_prob= stack.predict_proba(X_test)
print('Log Loss: ',log_loss(y_test,y_pred_prob))

Accuracy Score:  0.7209302325581395
Log Loss:  0.6353093387132638


- withuot passthrough

In [247]:
stack = StackingClassifier(estimators = [('KNN', knn), ('NB', nb), ('TREE', dtc), ('SVM', svm)], final_estimator = rf, passthrough = False)
stack.fit(X_train, y_train)
y_pred = stack.predict(X_test)
y_pred_prob= stack.predict_proba(X_test)

print('Log Loss: ',log_loss(y_test, y_pred_prob))

Log Loss:  0.749800813401577


# using gcv

In [269]:
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 24)
rb = RandomForestClassifier(random_state = 24)
stack = StackingClassifier(estimators = [('KNN', knn), ('NB', nb), ('TREE', dtc), ('SVM', svm)], final_estimator = rf, passthrough = False)


In [261]:
stack.get_params()

{'cv': None,
 'estimators': [('KNN', KNeighborsClassifier()),
  ('NB', GaussianNB()),
  ('TREE', DecisionTreeClassifier(random_state=24)),
  ('SVM', SVC(probability=True, random_state=24))],
 'final_estimator__bootstrap': True,
 'final_estimator__ccp_alpha': 0.0,
 'final_estimator__class_weight': None,
 'final_estimator__criterion': 'gini',
 'final_estimator__max_depth': None,
 'final_estimator__max_features': 'sqrt',
 'final_estimator__max_leaf_nodes': None,
 'final_estimator__max_samples': None,
 'final_estimator__min_impurity_decrease': 0.0,
 'final_estimator__min_samples_leaf': 1,
 'final_estimator__min_samples_split': 2,
 'final_estimator__min_weight_fraction_leaf': 0.0,
 'final_estimator__monotonic_cst': None,
 'final_estimator__n_estimators': 100,
 'final_estimator__n_jobs': None,
 'final_estimator__oob_score': False,
 'final_estimator__random_state': 24,
 'final_estimator__verbose': 0,
 'final_estimator__warm_start': False,
 'final_estimator': RandomForestClassifier(random_stat

In [283]:
params = {'final_estimator__max_depth': [3, 4, 5 ] , 'final_estimator__n_estimators':[10,50], 'SVM__C': np.linspace(0.001, 3, 5), 'TREE__max_depth': [None, 2, 4], 'passthrough': [True, False]}
gcv = GridSearchCV(stack, param_grid = params, cv = kfold, verbose = 3, scoring = 'neg_log_loss')
gcv.fit(X,y)

Fitting 5 folds for each of 180 candidates, totalling 900 fits
[CV 1/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.696 total time=   0.1s
[CV 2/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.924 total time=   0.1s
[CV 3/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.974 total time=   0.1s
[CV 4/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.867 total time=   0.1s
[CV 5/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, passthrough=True;, score=-0.919 total time=   0.1s
[CV 1/5] END SVM__C=0.001, TREE__max_depth=None, final_estimator__max_depth=3, final_estimator__n_estimators=10, pass

In [284]:
gcv.best_score_, gcv.best_params_

(-0.7486144895037242,
 {'SVM__C': 3.0,
  'TREE__max_depth': 2,
  'final_estimator__max_depth': 5,
  'final_estimator__n_estimators': 50,
  'passthrough': True})

In [285]:
from tqdm import tqdm

# serializing the fitted object

In [287]:
best_stack = gcv.best_estimator_

In [294]:
import pickle
with open("Cases/Glass Identification/stack_gls.pkl", 'wb') as f:
    pickle.dump(best_stack, f)

## with satellite dataset

In [302]:
sat = pd.read_csv("Cases/Satellite Imaging/Satellite.csv", sep = ';')
y = sat.classes
X = sat.drop('classes', axis = 1)

In [304]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 24, stratify = y)

In [308]:
knn = KNeighborsClassifier()
nb = GaussianNB()
dtc = DecisionTreeClassifier(random_state = 24)
svm = SVC(probability = True, random_state = 24)
lr = LogisticRegression(random_state = 24)
rb = RandomForestClassifier(random_state = 24)

In [340]:
kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 24)
stack = StackingClassifier(estimators = [('KNN', knn),('SVM', svm), ('LR', lr)], final_estimator = rb, passthrough = False)


In [342]:
import warnings

params = {'final_estimator__max_depth': [3,5] , 'final_estimator__n_estimators':[10,50]}
gcv = GridSearchCV(stack, param_grid = params, cv = kfold, verbose = 3, scoring = 'neg_log_loss')
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    gcv.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.528 total time=  14.0s
[CV 2/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.521 total time=  14.2s
[CV 3/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.535 total time=  13.8s
[CV 4/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.536 total time=  14.0s
[CV 5/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.541 total time=  14.1s
[CV 1/5] END final_estimator__max_depth=3, final_estimator__n_estimators=50;, score=-0.508 total time=  14.3s
[CV 2/5] END final_estimator__max_depth=3, final_estimator__n_estimators=50;, score=-0.497 total time=  14.1s
[CV 3/5] END final_estimator__max_depth=3, final_estimator__n_estimators=50;, score=-0.511 total time=  13.8s
[CV 4/5] END final_estimator__max_depth=3, final_estimator__

In [346]:
gcv.best_score_

-0.2893193190553941

In [348]:
with open('Cases/Satellite Imaging/model.pkl', 'wb') as f:
    pickle.dump(gcv.best_estimator_, f)


In [350]:
with open('Cases/Satellite Imaging/model.pkl', 'rb') as f:
    model = pickle.load(f)


In [352]:
tst = pd.read_csv('Cases/Satellite Imaging/tst_satellite.csv')

In [355]:
tst['pred_type'] = model.predict(tst)

In [357]:
tst.head()

Unnamed: 0,x.1,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10,...,x.28,x.29,x.30,x.31,x.32,x.33,x.34,x.35,x.36,pred_type
0,104,97,106,79,94,91,85,87,106,92,...,80,110,102,87,105,89,81,100,75,grey soil
1,99,105,99,95,101,110,91,101,96,83,...,84,75,107,85,94,100,96,79,110,grey soil
2,98,78,91,104,105,103,84,91,106,82,...,81,76,99,97,95,88,78,103,75,grey soil
3,75,98,98,104,89,90,100,81,88,88,...,86,88,86,106,89,76,79,79,91,grey soil
4,92,108,89,89,92,108,78,94,84,88,...,106,84,106,96,81,91,76,84,106,grey soil


In [361]:
pip install gradio

Collecting gradio
  Downloading gradio-5.5.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.11-cp312-none-win_amd64.whl.metadata (52 kB)
     ---------------------------------------- 0.0/52.0 kB ? eta -:--:--
     ---------------------------------------- 52.0/52.0 kB ? eta 0:00:00
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-mult

In [383]:
import gradio as gr
import pickle
#import numpy as np 
import pandas as pd 
import os 
# os.chdir("C:/Training/Academy/Statistics (Python)/Cases/Glass Identification")

def predict(RI, Na, Mg, Al, Si, K, Ca, Ba, Fe):
    tst = pd.DataFrame([[RI, Na, Mg, Al, Si, K, Ca, Ba, Fe]],
          columns=['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'])    
    filehandler = open("Cases/Glass Identification/stack_gls.pkl", "rb")
    bm_loaded = pickle.load(filehandler)
    # print(tst)
    return bm_loaded.predict(tst)[0] 
      

# demo = gr.Interface(
#     fn=predict,
#     inputs=["number"] * 9,
#     outputs=["text"]
# )

with gr.Blocks() as demo:
    with gr.Row():
      RI = gr.Number(label='RI')
      Na = gr.Number(label='Na')
      Mg = gr.Number(label='Mg')
    with gr.Row():
      Al = gr.Number(label='Al')
      Si = gr.Number(label='Si')
      K = gr.Number(label='K')
    with gr.Row():
      Ca = gr.Number(label='Ca')
      Ba = gr.Number(label='Ba')
      Fe = gr.Number(label='Fe')
    with gr.Row(): 
      Type = gr.Text(label='Type') 
    with gr.Row():  
      button = gr.Button(value="Which Glass?")
      button.click(predict,
            inputs=[RI, Na, Mg, Al, Si, K, Ca, Ba, Fe],
            outputs=[Type])



demo.launch()


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




### on cancer

In [391]:
df = pd.read_csv("Cases/Wisconsin/BreastCancer.csv", index_col = 0)
X = df.drop("Class", axis = 1)
y = df.Class

In [393]:
gcv.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.079 total time=   0.2s
[CV 2/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.090 total time=   0.2s
[CV 3/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.088 total time=   0.2s
[CV 4/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.154 total time=   0.2s
[CV 5/5] END final_estimator__max_depth=3, final_estimator__n_estimators=10;, score=-0.083 total time=   0.2s
[CV 1/5] END final_estimator__max_depth=3, final_estimator__n_estimators=50;, score=-0.077 total time=   0.3s
[CV 2/5] END final_estimator__max_depth=3, final_estimator__n_estimators=50;, score=-0.089 total time=   0.2s
[CV 3/5] END final_estimator__max_depth=3, final_estimator__n_estimators=50;, score=-0.082 total time=   0.2s
[CV 4/5] END final_estimator__max_depth=3, final_estimator__

In [394]:
with open ('Cases/Wisconsin/model.pkl', 'wb') as f:
    pickle.dump(gcv.best_estimator_, f)

# making ui

In [396]:
df.head()

Unnamed: 0_level_0,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
61634,5,4,3,1,2,2,2,3,1,Benign
63375,9,1,2,6,4,10,7,7,2,Malignant
76389,10,4,7,2,2,8,6,1,1,Malignant
95719,6,10,10,10,8,10,7,10,7,Malignant
128059,1,1,1,1,2,5,5,1,1,Benign


In [397]:
df.columns.tolist()

['Clump',
 'UniCell_Size',
 'Uni_CellShape',
 'MargAdh',
 'SEpith',
 'BareN',
 'BChromatin',
 'NoemN',
 'Mitoses',
 'Class']

In [410]:
import gradio as gr
import pickle
#import numpy as np 
import pandas as pd 

def predict(a, b, c, d, e,f ,g, h, i):
    tst = pd.DataFrame([[a, b, c, d, e,f ,g, h, i]], columns = [
 'Clump',
 'UniCell_Size',
 'Uni_CellShape',
 'MargAdh',
 'SEpith',
 'BareN',
 'BChromatin',
 'NoemN',
 'Mitoses'])
    with open ('Cases/Wisconsin/model.pkl', 'rb') as f:
        model = pickle.load(f)
    return f"{model.predict(tst)[0]}, lucky you are safe, less money for us :("
## ui
with gr.Blocks() as demo:
    with gr.Row():
        a = gr.Number(label='Clump')
        b = gr.Number(label='UniCell_Size')
        c = gr.Number(label='Uni_CellShape')
    with gr.Row():
        d = gr.Number(label='MargAdh')
        e = gr.Number(label='SEpith')
        f = gr.Number(label='BareN')
    with gr.Row():
        g = gr.Number(label='BChromatin')
        h = gr.Number(label='NoemN')
        i = gr.Number(label='Mitoses')
    with gr.Row():
        Type = gr.Text(label='Type') 
    with gr.Row():  
      button = gr.Button(value="how serious?")
      button.click(predict,
            inputs=[a, b, c, d, e, f, g, h, i],
            outputs=[Type])
demo.launch()

* Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.


