# 2 Meta učenje

In [28]:
import pandas as pd
import numpy as np
import openml as oml
PODATKI = 'podatki.csv'
RANDOM_SEED = 420

# Read our data set; every column except the last one is treated as a feature.
data = pd.read_csv(PODATKI)
features = data.columns[:-1]

# Size bounds used when searching for candidates: the lower bounds are the
# sizes of our data, the upper bounds are the assumed maxima for the original
# data (100 more features, 1000 more examples).
MIN_N_FEATURES = len(features)
MAX_N_FEATURES = 100 + MIN_N_FEATURES
MIN_N_EXAMPLES = len(data)
MAX_N_EXAMPLES = 1000 + MIN_N_EXAMPLES

print('Število primerov: ', MIN_N_EXAMPLES)
print('Število značilk:', MIN_N_FEATURES)
print('Maksimalno število značilk originala: ', MAX_N_FEATURES)
print('Maksimalno število primerov originala: ', MAX_N_EXAMPLES)

# Class distribution before recoding (not rendered: only the last expression
# of a cell is displayed).
data['y'].value_counts()

# Recode the target as a boolean to simplify later work:
# True for 'teamEdward', False for everything else.
data['y'] = data['y'] == 'teamEdward'
data['y'].value_counts()

Število primerov:  1203
Število značilk: 30
Maksimalno število značilk originala:  130
Maksimalno število primerov originala:  2203


True     1025
False     178
Name: y, dtype: int64

Meta atributi so izbrani na [podlagi opisov na spletni strani](https://pymfe.readthedocs.io/en/latest/auto_pages/meta_features_description.html). 

In [29]:
# Names of the pymfe meta-features to extract, grouped by family.
meta_features = [
    # complexity (the model-performance based measures are left out because
    # they take long to compute)
    'c1', 'c2', 'f1', 'f2', 'f3', 'f4',
    # general (the "number of ..." counts are left out because we know the
    # original data has a different shape)
    'attr_to_inst', 'cat_to_num', 'freq_class', 'nr_attr',
    # information theory: attribute concentration / entropy, class
    # concentration / entropy, and related measures
    'attr_conc', 'attr_ent', 'class_conc', 'class_ent',
    'eq_num_attr', 'joint_ent', 'mut_inf', 'ns_ratio',
    # statistical
    'can_cor', 'cor', 'cov', 'eigenvalues', 'g_mean', 'gravity',
    'h_mean', 'iq_range', 'kurtosis', 'lh_trace', 'mad', 'max',
    'mean', 'median', 'min', 'nr_cor_attr', 'nr_disc', 'nr_norm',
    'nr_outliers', 'p_trace', 'range', 'roy_root', 'sd', 'sd_ratio',
    'skewness', 'sparsity', 't_mean', 'var', 'w_lambda',
]

In [30]:
from pymfe.mfe import MFE

# Meta-feature extractor restricted to the hand-picked `meta_features`.
mfe = MFE(features=meta_features)

# Fit it on our own data: target vector and feature matrix (all columns but 'y').
y = data['y'].to_numpy()
X = data.drop(columns=['y']).to_numpy()
mfe.fit(X, y)

# Names and values of the meta-features describing our data set.
attribute_names, attribute_values = mfe.extract()

print([(name, value) for name, value in zip(attribute_names, attribute_values)])

  np.log(np.linalg.det(S_i)) for S_i in sample_cov_matrices


[('attr_conc.mean', 0.184477170590798), ('attr_conc.sd', 0.18626145238963987), ('attr_ent.mean', 2.7993325529650517), ('attr_ent.sd', 0.7011380143695096), ('attr_to_inst', 0.02493765586034913), ('can_cor.mean', 0.5821867443429916), ('can_cor.sd', nan), ('cat_to_num', 0.0), ('class_conc.mean', 0.019535145762771742), ('class_conc.sd', 0.029033032007713847), ('class_ent', 0.6047202206385474), ('cor.mean', 0.3875109674449386), ('cor.sd', 0.25029717850985206), ('cov.mean', 814587.8949260311), ('cov.sd', 12329224.492271649), ('eigenvalues.mean', 149054282.89592808), ('eigenvalues.sd', 816289673.4614395), ('eq_num_attr', 10.095791708413493), ('freq_class.mean', 0.5), ('freq_class.sd', 0.4978548991396557), ('g_mean.mean', 90.06182629979813), ('g_mean.sd', 372.3042944085837), ('gravity', 24782.739382070275), ('h_mean.mean', 8.598977349283047), ('h_mean.sd', 7.040187261369319), ('iq_range.mean', 477.52449999999993), ('iq_range.sd', 2310.4910544504232), ('joint_ent.mean', 3.344154527076326), ('jo

Najprej bomo naložili kandidate iz OpenML, nato pa iz vsakega poračunali meta atribute. 

In [31]:
# Restrict the candidates by size: only OpenML data sets whose numbers of
# features and examples lie roughly between those of our data and the assumed
# bounds for the original data are used (with a margin of 100 examples and
# 10 features on either side).
instance_range = f'{MIN_N_EXAMPLES-100}..{MAX_N_EXAMPLES+100}'
feature_range = f'{MIN_N_FEATURES-10}..{MAX_N_FEATURES+10}'
datasets = oml.datasets.list_datasets(
    number_instances=instance_range,
    number_features=feature_range,
    output_format="dataframe",
)
# OpenML ids of all candidates.
data_ids = list(datasets["did"])
print(datasets)

         did                                               name  version  \
14        14                                      mfeat-fourier        1   
16        16                                     mfeat-karhunen        1   
22        22                                      mfeat-zernike        1   
315      315                                           us_crime        1   
930      930                                    colleges_usnews        2   
...      ...                                                ...      ...   
44788  44788  KDDCup09-Upselling_seed_0_nrows_2000_nclasses_...        1   
44789  44789  KDDCup09-Upselling_seed_1_nrows_2000_nclasses_...        1   
44790  44790  KDDCup09-Upselling_seed_2_nrows_2000_nclasses_...        1   
44791  44791  KDDCup09-Upselling_seed_3_nrows_2000_nclasses_...        1   
44792  44792  KDDCup09-Upselling_seed_4_nrows_2000_nclasses_...        1   

      uploader  status format  MajorityClassSize  MaxNominalAttDistinctValues  \
14    

Nalaganje podatkov:

In [32]:
# Fetch the OpenML dataset object for every candidate id selected above.
all_datasets = oml.datasets.get_datasets(data_ids)
# Print the dataset descriptions (name, version, format, size, ...).
print(all_datasets)

[OpenML Dataset
Name..........: mfeat-fourier
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:20:17
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/14/mfeat-fourier.arff
OpenML URL....: https://www.openml.org/d/14
# of features.: 77
# of instances: 2000, OpenML Dataset
Name..........: mfeat-karhunen
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:20:30
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/16/mfeat-karhunen.arff
OpenML URL....: https://www.openml.org/d/16
# of features.: 65
# of instances: 2000, OpenML Dataset
Name..........: mfeat-zernike
Version.......: 1
Format........: ARFF
Upload Date...: 2014-04-06 23:21:00
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/22/mfeat-zernike.arff
OpenML URL....: https://www.openml.org/d/22
# of features.: 48
# of instances: 2000, OpenML Dataset
Name..........: us_crime
Version.......: 1
Format........: ARFF


In [33]:
from tqdm import tqdm
# Maps the name of every kept data set to its position in `all_datasets`.
datasets = {}
# Code from the exercises: discard unsuitable data sets. A data set is kept
# only if its name was not kept before, it has a single known target, and the
# target is its only nominal attribute.
for n, podatkovje in tqdm(enumerate(all_datasets), total=len(all_datasets)):
    ime_podatkovja = podatkovje.name
    target = podatkovje.default_target_attribute
    X, _, nominal, names = podatkovje.get_data()

    # Not kept already (i.e. as another version of the same data set).
    name_ok = ime_podatkovja not in datasets

    # Target is known and is a single attribute (not e.g. "Spol,Starost").
    targ_ok = not (target is None or "," in target)

    # Target is nominal (classification) and no other attribute is nominal,
    # i.e. exactly one nominal attribute and it is the target.
    if targ_ok:
        i_target = names.index(target)
    else:
        i_target = 0
    nomi_ok = nominal[i_target] and sum(nominal) == 1

    if not (name_ok and targ_ok and nomi_ok):
        continue
    datasets[ime_podatkovja] = n

n_vsa, n_ok = len(all_datasets), len(datasets)
print(f"Obdrzal sem {n_ok} podatkovij ({100 * n_ok / n_vsa:.1f}%)")


100%|██████████| 340/340 [00:05<00:00, 63.98it/s] 

Obdrzal sem 159 podatkovij (46.8%)





Izračun najbližjih sosedov

In [34]:
# Split the first downloaded data set into features and target, using its
# default target attribute.
first_dataset = all_datasets[0]
target_name = first_dataset.default_target_attribute
X, y, _, _ = first_dataset.get_data(target=target_name)


In [35]:
# Build the meta-space table: one row of meta-feature values per kept OpenML
# data set, plus (in a parallel list) that data set's index in `all_datasets`.

# Names and values of the meta-features of our own data, read from `mfe` as it
# was fitted on our data in an earlier cell.
attribute_names, attribute_values = mfe.extract()

# The candidates get their own extractor. Refitting the shared `mfe` inside
# the loop would leave it fitted to the last candidate, so re-running this
# cell would silently replace `attribute_values` with that candidate's values.
candidate_mfe = MFE(features=meta_features)

meta_space = []
meta_space_indexes = []

for index in tqdm(datasets.values(), total=len(datasets)):
    dataset = all_datasets[index]
    target_name = dataset.default_target_attribute
    X, y, _, _ = dataset.get_data(target=target_name)

    # Compute the meta-features of this candidate data set.
    candidate_mfe.fit(X.to_numpy(), y.to_numpy(), suppress_warnings=True)
    test, ex = candidate_mfe.extract()

    # Report any meta-feature names that differ from those of our own data.
    # `test` is a list, so it has to be converted to a set before calling
    # `symmetric_difference`.
    if set(test) != set(attribute_names):
        print(set(test).symmetric_difference(attribute_names))

    meta_space.append(ex)
    meta_space_indexes.append(index)
    

  np.log(np.linalg.det(S_i)) for S_i in sample_cov_matrices
 Exception message: ZeroDivisionError('division by zero').
 Will set it as 'np.nan' for all summary functions.
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
 Exception message: ValueError('Harmonic mean only defined if all elements greater than or equal to zero').
 Will set it as 'np.nan' for all summary functions.
  np.log(np.linalg.det(S_i)) for S_i in sample_cov_matrices
  * np.log(np.linalg.det(pooled_cov_mat))
 Exception message: ValueError('Harmonic mean only defined if all elements greater than or equal to zero').
 Will set it as 'np.nan' for all summary functions.
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  np.log(np.linalg.det(S_i)) for S_i in sample_cov_matrices
  * np.log(np.linalg.det(pooled_cov_mat))
  (num_inst - num_classes)
  r = _umath_linalg.det(a, signature=signature)
  (num_inst - num_classes)
 Exception message: ZeroDiv

Nekateri atributi niso bili uspešno poračunani, zato jih bomo odstranili.

In [46]:
# Positions of all meta-features that evaluated to NaN for at least one
# data set in the meta-space.
nan_indexes = {
    f_index
    for example in meta_space
    for f_index, value in enumerate(example)
    if np.isnan(value)
}
len(nan_indexes)

43

In [50]:
# Turn the list of meta-feature vectors into a DataFrame. The columns must be
# labelled with the names returned by `mfe.extract()` (`attribute_names`, e.g.
# 'attr_conc.mean', 'attr_conc.sd'), not with the requested `meta_features`:
# summarised features expand into several columns, so the requested list does
# not line up with the extracted vectors.
meta_space = pd.DataFrame(meta_space, columns=attribute_names)
# Drop every meta-feature that could not be computed for at least one data set.
nan_names = [attribute_names[i] for i in sorted(nan_indexes)]
meta_space = meta_space.drop(columns=nan_names)
# Meta-features that remain:
meta_space.columns

Index(['attr_conc.mean', 'attr_conc.sd', 'attr_ent.mean', 'attr_ent.sd',
       'attr_to_inst', 'cat_to_num', 'class_conc.mean', 'class_conc.sd',
       'class_ent', 'eq_num_attr', 'freq_class.mean', 'freq_class.sd',
       'joint_ent.mean', 'joint_ent.sd', 'mut_inf.mean', 'mut_inf.sd',
       'nr_attr', 'nr_cor_attr', 'nr_norm', 'nr_outliers', 'ns_ratio',
       'sparsity.mean', 'sparsity.sd'],
      dtype='object')

In [52]:
# Persist the meta-space together with each row's index in `all_datasets`.
# The index column is attached only to the copy written to disk, so
# `meta_space` itself keeps just the meta-feature columns.
meta_space.assign(index_in_dataset=meta_space_indexes).to_parquet('parquet/meta_space_df')


import pickle



#(load:)
# with open('meta_space', 'rb') as f:
#     meta_space = pickle.load(f)
# with open('meta_space_indexes', 'rb') as f:
#     meta_space_indexes = pickle.load(f)

#save caluclated meta features


# with open('meta_space', 'wb') as f:
#     pickle.dump(meta_space, f)
# with open('meta_space_indexes', 'wb') as f:
#     pickle.dump(meta_space_indexes, f)

In [54]:
# Display the meta-space table (one row of meta-feature values per kept data set).
meta_space

Unnamed: 0,attr_conc.mean,attr_conc.sd,attr_ent.mean,attr_ent.sd,attr_to_inst,cat_to_num,class_conc.mean,class_conc.sd,class_ent,eq_num_attr,...,joint_ent.sd,mut_inf.mean,mut_inf.sd,nr_attr,nr_cor_attr,nr_norm,nr_outliers,ns_ratio,sparsity.mean,sparsity.sd
0,0.010805,0.006192,3.584956,1.472658e-06,0.038000,0.0,0.038622,0.034719,3.321928,11.401216,...,0.230407,0.291366,0.230407,76,0.006316,0.0,69,11.303958,0.000002,5.295566e-07
1,0.009831,0.004413,3.584957,1.083121e-06,0.032000,0.0,0.036046,0.025179,3.321928,12.060353,...,0.180866,0.275442,0.180866,64,0.000496,28.0,61,12.015286,0.000002,5.363522e-08
2,0.019942,0.023107,3.584957,1.795561e-15,0.023500,0.0,0.053979,0.024114,3.321928,8.330996,...,0.171534,0.398743,0.171534,47,0.126735,0.0,40,7.990640,0.000002,2.895691e-07
3,0.034694,0.047734,2.127211,1.458498e+00,0.016463,0.0,0.358780,0.421306,2.913356,7.382268,...,1.425964,0.394642,0.252896,35,0.053782,0.0,25,4.390225,0.186830,2.238322e-01
4,0.844564,0.022563,3.321915,1.074247e-05,0.082508,0.0,0.000899,0.000152,1.000000,170.799163,...,0.000991,0.005855,0.000990,100,1.000000,0.0,100,566.380328,0.000031,3.688026e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154,0.011079,0.008169,2.312358,5.931412e-01,0.050000,0.0,0.001367,0.000914,0.811278,113.061852,...,0.589249,0.007176,0.006061,100,0.000606,0.0,100,321.256243,0.003263,2.488939e-02
155,0.010196,0.006804,2.303047,6.251888e-01,0.050000,0.0,0.001404,0.001101,0.811278,117.397622,...,0.622071,0.006911,0.005501,100,0.000404,0.0,100,332.266949,0.000895,9.266945e-04
156,0.012772,0.007451,2.355830,6.583880e-01,0.050000,0.0,0.011570,0.070136,0.811278,95.725553,...,0.653888,0.008475,0.007439,100,0.000202,2.0,98,276.972689,0.020736,1.406009e-01
157,0.208909,0.246970,0.396347,9.140778e-01,0.050000,0.0,0.191225,0.242965,0.378851,163.597935,...,0.911294,0.002316,0.005397,100,0.003434,38.0,62,170.153218,0.520368,3.933624e-01


Najbližji sosedi:

In [60]:
from sklearn.neighbors import NearestNeighbors
# Index the candidate data sets in meta-feature space; the 3 closest ones to
# our data are looked up afterwards.
knn = NearestNeighbors(n_neighbors=3)
# Fit on the bare array: the query vector built below is a plain NumPy array,
# and fitting on a DataFrame would make scikit-learn warn about missing
# feature names at query time. No `y` is passed because NearestNeighbors is
# unsupervised and ignores it; rows are mapped back to data sets through
# `meta_space_indexes`.
# NOTE(review): the meta-features are not scaled, so large-magnitude columns
# (e.g. ns_ratio, nr_attr) probably dominate the distance - consider
# standardising before fitting.
knn.fit(meta_space.to_numpy())

# Meta-features of our own data, in the same column order as `meta_space`,
# shaped as a single-row 2-D array for querying.
value_dict = dict(zip(attribute_names, attribute_values))
values = np.array([value_dict[name] for name in meta_space.columns]).reshape(1, -1)

In [63]:
# Nearest neighbours of our data set; the query holds a single row, so the
# first (only) row of the result is taken.
neighbour_rows = knn.kneighbors(values, return_distance=False)
indicies = neighbour_rows[0]
# `indicies` are row positions in `meta_space`; the matching positions in
# `all_datasets` are stored in `meta_space_indexes`.
indexes = [meta_space_indexes[row] for row in indicies]

# Algorithms: walk over the neighbouring data sets (nothing is done with
# them yet).
for index in indexes:
    dataset = all_datasets[index]



In [None]:
# List the summary function names pymfe accepts (e.g. 'mean', 'sd', 'nanmean').
MFE.valid_summary()

('mean',
 'nanmean',
 'sd',
 'nansd',
 'var',
 'nanvar',
 'count',
 'nancount',
 'histogram',
 'nanhistogram',
 'iq_range',
 'naniq_range',
 'kurtosis',
 'nankurtosis',
 'max',
 'nanmax',
 'median',
 'nanmedian',
 'min',
 'nanmin',
 'quantiles',
 'nanquantiles',
 'range',
 'nanrange',
 'skewness',
 'nanskewness',
 'sum',
 'nansum',
 'powersum',
 'pnorm',
 'nanpowersum',
 'nanpnorm')