# Meta model

Najprej sem uvozil podatkovja iz openml ki ustrezajo: 
- za podatkovje obstaja task z nadzarovano ucenje klasifikacije
- stevila stolpcev med 20-150 
- stevilo vrtsiv med 500-3000

Potem sem precistil podatkovja in obdrzal tistak ki so primerna za primerjavo nasim podatkov.


In [30]:
import numpy as np
import pandas as pd
import openml
from openml.tasks import TaskType

import warnings

# ne izpisuje opozoril
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)


def get_id_with_taks():
    """
    Iz openml pridobimo ID datasetov za katere so izvedeni taski nadzarovanega ucenja kasifikacije, ki vstrezajo poguju dimezij.
    Vrne ID datatesot
    """
    d = openml.datasets.list_datasets(number_instances='500..3000', number_features='20..150', output_format="dataframe")
    did = list(d["did"]) #id vseh podatkovij
    tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe")
    tid = list(tasks['tid']) #id task
    tdid = list(tasks['did']) # id podatkovij za pripadajoci task
    ID = []
    for i in tdid:
        if i in did:
           ID.append(i)
    return ID


def precisti_podatkovja(podatkovja, did):
    """Precisti podatkovja tako da izbere samo primerne za nas primer
    Vrne primerna podatvoja in kljuc za pridobitev imen podatkovij {data_name : data_id}"""
    
    podatkovja_ok = {}
    name_key = {}
    for n, podatkovje in enumerate(podatkovja):
        name_podatkovja = podatkovje.name
        target = podatkovje.default_target_attribute
        X, _, nominal, names = podatkovje.get_data()
        # Vec pogojev, da podatkovje obdrzimo:

        # ce ga nismo ze prej (tj. neke druge verzije)
        name_ok = name_podatkovja not in podatkovja_ok

        # ce ima znan target in je target en sam (in ne npr. "Spol,Starost")
        targ_ok = target is not None and "," not in target

        # ce je target nominalen (klasifikacija) in podatki nimajo nominalnih atributov
        i_target = names.index(target) if targ_ok else 0
        nomi_ok = nominal[i_target] and sum(nominal) == 1  # natanko en nominalen in to je target
        if name_ok and targ_ok and nomi_ok:
            podatkovja_ok[name_podatkovja] = n
            name_key[name_podatkovja] = did[n]

    n_vsa = len(podatkovja)
    n_ok = len(podatkovja_ok)
    print(f"Obdrzal sem {n_ok} podatkovij ({100 * n_ok / n_vsa:.1f}%)")
    return podatkovja_ok, name_key


ID = get_id_with_taks()
podatkovja_all = openml.datasets.get_datasets(ID)
podatkovja_ok, name_key = precisti_podatkovja(podatkovja_all, ID)


Obdrzal sem 50 podatkovij (12.7%)


### Izbira metaznacilk

Izbral sem znacilke tipa "general", "statistical", "info-theory" in "model-based" (odlocitveno drevo) ter upirabil **pymfe**.

In [31]:
from pymfe.mfe import MFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

warnings.simplefilter(action='ignore', category=RuntimeError)

def generate_meta_ds(data_all, data_ok):
    """ Sprejme vsa podatkovja in slovar primernih podatkovij.
    Izvede generirane meta podatkovja za izbrane metaznacilke, za podatkoja ki vsebujejo nan izvede Imputacijo.
    Vrne meta podatkovje."""

    meta_ds = {"name": []}
    for  data_name, n in data_ok.items():
        ds = data_all[n]
        target = ds.default_target_attribute
        X, y,_,_ = ds.get_data(target=target)
        X = np.array(X)
        y = np.array([str(t) for t in y])
        try:
            typ = type(X[0,np.isnan(X).any(axis=0)])
        except:
            continue
        if typ == str:
            imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit(X)
        else:
            imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X)
        X = imp.transform(X)
        mfe = MFE(groups=["general", "statistical", "info-theory"])
        mfe.fit(X, y)
        att_names, att_values = mfe.extract()

        mfeM = MFE(groups=["model-based"])
        tree = DecisionTreeClassifier()
        tree.fit(X, y)
        att_namesM, att_valuesM = mfeM.extract_from_model(tree)

        att_names += att_namesM
        att_values += att_valuesM

        is_first = len(meta_ds) == 1
        meta_ds["name"].append(data_name)
        for a_name, a_value in zip(att_names, att_values):
            if is_first:
                meta_ds[a_name] = [a_value]
            else:
                meta_ds[a_name].append(a_value)
    return pd.DataFrame(data=meta_ds, index=None)

metaDS = generate_meta_ds(data_all=podatkovja_all, data_ok=podatkovja_ok)
metaDS.to_csv("meta_learning/metaDS.csv", index=False)
print(metaDS.head())

             name  attr_conc.mean  attr_conc.sd  attr_ent.mean   attr_ent.sd  \
0   mfeat-fourier        0.013520      0.008569       3.584956  1.472658e-06   
1  mfeat-karhunen        0.008939      0.002906       3.584957  1.083121e-06   
2   mfeat-zernike        0.022655      0.064117       3.584957  1.795561e-15   
3         segment        0.073432      0.090473       3.157775  1.288127e+00   
4       oil_spill        0.062405      0.055971       2.848414  7.736935e-01   

   attr_to_inst  can_cor.mean  can_cor.sd  cat_to_num  class_conc.mean  ...  \
0      0.038000      0.721437    0.230074         0.0         0.038622  ...   
1      0.032000      0.809847    0.114468         0.0         0.036046  ...   
2      0.023500      0.716099    0.255786         0.0         0.053979  ...   
3      0.008225      0.746033    0.262599         0.0         0.145581  ...   
4      0.052295      0.652479         NaN         0.0         0.013217  ...   

   nodes_repeated.mean  nodes_repeated.sd  t

### Iskanej podobih datasetov

Sledi transalcija mojih podatkov v prostor meta znacilk, za namen iskanja najblizjih sosedov. Izbral sem tri podatkovja iz openml ki se najblizja mojim 
podatkov s pomocjo **NearestNeighbors** za parameter k=3.

In [32]:
dt = pd.read_csv('podatki.csv')
X = np.array(dt.drop('y', axis=1))
Y = dt['y']
clas = Y.unique()
Y = Y.replace(clas[0], 0)
Y = Y.replace(clas[1], 1)
y = np.array(Y)

#genereramo opis podatkov z statistikami ipd
mfe = MFE(groups=["general", "statistical", "info-theory"])
mfe.fit(X, y)
attributs_names, attributs_values = mfe.extract()
mfeM = MFE(groups=["model-based"])
tree = DecisionTreeClassifier()
tree.fit(X, y)
att_namesM, att_valuesM = mfeM.extract_from_model(tree)

attributs_names += att_namesM
attributs_values += att_valuesM

dt_mfe = pd.DataFrame(data={attributs_names[i]:attributs_values[i] for i in range(len(attributs_names))}, index=[1])
np_mfe = dt_mfe.to_numpy()


dt_meta = pd.read_csv('meta_learning/metaDS.csv')
df_names = dt_meta['name'] #imena podatkov
np_meta = dt_meta.drop('name', axis=1).to_numpy()
np_meta[np.isinf(np_meta)] = np.nan

from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean').fit(np_meta)
np_meta = imp_mean.transform(np_meta)
np_mfe = imp_mean.transform(np_mfe)

from sklearn.neighbors import NearestNeighbors

neigh = NearestNeighbors(n_neighbors=3)
neigh.fit(np_meta)

sosedi = neigh.kneighbors(np_mfe, 3, return_distance=False)[0]
sosedi = np.array(df_names[sosedi])
print(sosedi)

['PieChart2' 'cardiotocography' 'wdbc']


### Izbira najboljsega modela

Za izbrane najbljizje sosede sem preveril njihove rezultate na **OpenML**-ju. Ker za dano podatkoje obstaja vec moznih taksov sem preveril vsa ter izbral modele, ki so imela najboljsi "area_under_roc_curve".

In [33]:

def get_best_model(sosedi, name_key, metric):
    '''
    Sprejme najblizje sosede, kljuc za pridobitev id podatkovij iz imen ter metriko
    vrne najbolsi model glede na metrico za dane sosede
    '''
    id_key = {v: k for k, v in name_key.items()}
    tasks = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION, output_format="dataframe")
    k_tid = {name_key[sosedi[0]]:[], name_key[sosedi[1]]:[], name_key[sosedi[2]]:[]}
    for i, iD in enumerate(tasks['did']):
        if iD in k_tid:
            k_tid[iD].append(np.array(tasks['tid'])[i])

    models = {}
    for data_id, task_ids in k_tid.items():
        for task_id in task_ids:
            evals = openml.evaluations.list_evaluations(function=metric, 
                                                        tasks=[task_id], 
                                                        output_format="dataframe")
            if evals.empty:
                continue
            else:
                ix = evals['value'].idxmax()
                flow_name = evals['flow_name'].iloc[ix]
                value = evals['value'].iloc[ix]
                name_data = id_key[data_id]
                models[name_data] = [flow_name]
                print(f"Na podatlkih {name_data} so bili izvedeni taski:\nBest model: {flow_name}\nBest AUC: {value}\n")

    return models

metric = "area_under_roc_curve"
models = get_best_model(sosedi, name_key, metric)

Na podatlkih PieChart2 so bili izvedeni taski:
Best model: weka.kf.AttributeSelection-BestFirst-CfsSubsetEval-ReplaceMissingValues-NaiveBayes(1)
Best AUC: 0.846022

Na podatlkih cardiotocography so bili izvedeni taski:
Best model: weka.kf.RandomForest(1)
Best AUC: 0.99804

Na podatlkih cardiotocography so bili izvedeni taski:
Best model: weka.kf.RandomForest(1)
Best AUC: 0.99804

Na podatlkih wdbc so bili izvedeni taski:
Best model: sklearn.pipeline.Pipeline(imputation=openmlstudy14.preprocessing.ConditionalImputer,hotencoding=sklearn.preprocessing.data.OneHotEncoder,variencethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,classifier=sklearn.ensemble.weight_boosting.AdaBoostClassifier(base_estimator=sklearn.tree.tree.DecisionTreeClassifier))(1)
Best AUC: 0.996049363141483

Na podatlkih wdbc so bili izvedeni taski:
Best model: mlr.classif.xgboost(6)
Best AUC: 0.995851

Na podatlkih wdbc so bili izvedeni taski:
Best model: sklearn.pipeline.Pipeline(imputer=sklearn

### Testiranje modelov

Modele sem preizkusil na mojih podatkih, AUC sem izmeril na testnih podatkih. Potem sem prezkusil se skaliranje podatov in ponovno preveril AUC na testnih podatkih.

In [34]:

data = pd.read_csv('podatki.csv')
X = data.drop('y', axis=1)
Y = data['y']
clas = Y.unique()
Y = Y.replace(clas[0], 0)
Y = Y.replace(clas[1], 1)

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

np.random.seed(0)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


from sklearn.naive_bayes import GaussianNB
gNB = GaussianNB()
gNB.fit(X_train, y_train)
pred1 = gNB.predict(X_test)
aucNB= roc_auc_score(pred1, y_test)
print(f'GaussianNB with AUC: {aucNB}')

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
pred2 = rf.predict(X_test)
aucrf = roc_auc_score(pred2, y_test)
print(f'RandomForestClassifier with AUC: {aucrf}')



from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
adaBoost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
adaBoost.fit(X_train, y_train)
pred3 = adaBoost.predict(X_test)
aucada = roc_auc_score(pred3, y_test)
print(f'AdaBoostClassifier with AUC: {aucada}')


import xgboost as xgb
xgBoost =xgb.XGBClassifier()
xgBoost.fit(X_train, y_train)
pred4 = xgBoost.predict(X_test)
aucxg = roc_auc_score(pred4, y_test)
print(f'XGBClassifier with AUC: {aucxg}')



GaussianNB with AUC: 0.6634495641344956
RandomForestClassifier with AUC: 0.8066801619433198
AdaBoostClassifier with AUC: 0.7159819347319347
XGBClassifier with AUC: 0.8135118306351182


In [35]:
from sklearn.preprocessing import StandardScaler

np.random.seed(0)

print("Skalirani!")
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_train_sc = pd.DataFrame(X_train_sc,columns=X.columns)
X_test_sc = scaler.transform(X_test)
X_test_sc = pd.DataFrame(X_test_sc,columns=X.columns)

gNBsc = GaussianNB()
gNBsc.fit(X_train_sc, y_train)
pred5 = gNBsc.predict(X_test_sc)
aucNB= roc_auc_score(pred5, y_test)
print(f'GaussianNB with AUC: {aucNB}')

rfsc = RandomForestClassifier()
rfsc.fit(X_train_sc, y_train)
pred6 = rfsc.predict(X_test_sc)
aucrf = roc_auc_score(pred6, y_test)
print(f'RandomForestClassifier with AUC: {aucrf}')

adaBoostsc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
adaBoostsc.fit(X_train_sc, y_train)
pred7 = adaBoostsc.predict(X_test_sc)
aucada = roc_auc_score(pred7, y_test)
print(f'AdaBoostClassifier with AUC: {aucada}')

xgBoostsc =xgb.XGBClassifier()
xgBoostsc.fit(X_train_sc, y_train)
pred8 = xgBoostsc.predict(X_test_sc)
aucxg = roc_auc_score(pred8, y_test)
print(f'XGBClassifier with AUC: {aucxg}')

Skalirani!
GaussianNB with AUC: 0.7406003159557661
RandomForestClassifier with AUC: 0.8066801619433198
AdaBoostClassifier with AUC: 0.7159819347319347
XGBClassifier with AUC: 0.8135118306351182
