In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import pandas as pd
import numpy as np
np.random.seed(132)
from functools import lru_cache

import sys

CODE_PATH = '../code'

sys.path.append(CODE_PATH)
import functions

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.datasets import load_iris, load_boston, load_breast_cancer, load_wine, load_digits
from scipy.optimize import minimize
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from tqdm import tqdm

%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Берем выборку

In [None]:
# LETTER
# Header-less CSV: column 0 is used as the class label, every other column as a feature.
letter = pd.read_csv('../data/letter.csv', header=None)
# Pass `axis` by keyword: the positional form `drop(0, 1)` was deprecated and
# later removed from pandas.
X = letter.drop(0, axis=1).values
# factorize() replaces each distinct label with an integer code; [0] selects the codes.
target = pd.factorize(letter[0])[0]

In [None]:
# POKER
# https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html
from sklearn.datasets import load_svmlight_file
# data[0] is the feature matrix (densified with toarray()), data[1] the label vector.
data = load_svmlight_file('../data/poker')
X, target = data[0].toarray(), data[1].astype(int)
# Keep only the samples whose label is at most 7.
X, target = X[target <= 7], target[target <= 7]
# Cell output: number of samples per remaining class.
pd.Series(target).value_counts()

In [19]:
# Sensorless
# https://archive.ics.uci.edu/ml/datasets/dataset+for+sensorless+drive+diagnosis
from sklearn.datasets import load_svmlight_file
data = load_svmlight_file('../data/Sensorless.scale')
# Labels are shifted down by one (the counts printed below show classes 0..10).
X, target = data[0].toarray(), data[1].astype(int)-1
# Keep a stratified 20% subsample; the remaining 80% is discarded.
X, _, target, _ = train_test_split(X, target, train_size=0.2, random_state=42, stratify=target)

# Cell output: number of samples per class in the subsample.
pd.Series(target).value_counts()



5     1064
4     1064
10    1064
2     1064
9     1064
1     1064
8     1064
0     1064
7     1063
6     1063
3     1063
dtype: int64

In [None]:
# 20NEWS
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
# NOTE(review): the TF-IDF vectorizer is overwritten on the very next line, so
# only the CountVectorizer takes effect; presumably one of the two lines is
# meant to be commented out when switching representations.
cv = TfidfVectorizer(max_features=100)
cv = CountVectorizer(max_features=100)

# Dense document-term matrix, capped at 100 features by max_features above.
X = cv.fit_transform(twenty_train.data).toarray()
target = twenty_train.target

In [None]:
# DIGITS
# Labels come straight from the bunch; features are exposed both as a
# DataFrame (df) and as a plain array (X).
dataset = load_digits()
target = dataset['target']
df = pd.DataFrame(dataset['data'])
X = df.values

In [23]:
# MODEL
# The 'target' column holds the class label; every other column is a feature.
df = pd.read_csv('../data/model_16_16.csv')
# pop() hands back the label column and removes it from df in a single step.
target = df.pop('target').astype(int).values
X = df.values

In [24]:
# TRAIN/TEST/VALID SPLIT
# Two stratified splits with a fixed seed: first hold out 20% as the test set,
# then take 30% of the remainder as the validation set
# (roughly 56% train / 24% validation / 20% test overall).
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.2, random_state=42, stratify=target)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42, stratify=y_train)
# Note the print order: train, test, validation.
print(X_train.shape, X_test.shape, X_val.shape)

(896, 2) (320, 2) (384, 2)


In [25]:
# Estimator class handed to functions.train_dichs in the experiment cells.
# The second assignment overrides the first, so LogisticRegression is the one
# actually in effect; the LinearSVC line is a leftover toggle.
BaseClassifier = LinearSVC
BaseClassifier = LogisticRegression

In [26]:
# Candidate grids for the number of dichotomies (both end points included).
subset_mid = np.arange(20, 141, 10, dtype=int)       # 20, 30, ..., 140
subset_small = np.arange(20, 56, 5, dtype=int)       # 20, 25, ..., 55
subset_big_rare = np.arange(20, 201, 20, dtype=int)  # 20, 40, ..., 200
subset_big = np.arange(10, 201, 10, dtype=int)       # 10, 20, ..., 200

# N случайных дихотомий (без отбора)

In [None]:
# N random dichotomies without any selection, repeated N_attempts times.
# Each row of attempts_data holds one validation accuracy per entry of wtypes,
# followed by num_real_dich.
attempts_data = []
N_attempts = 10
l = np.unique(target).size  # number of distinct classes
N = 30 # number of dichotomies
wtypes = [None, 'accuracy', 'f1', 'confusion_list']  # score_type values passed to predict_all
for i in tqdm(range(N_attempts)):
    accs = []
    code_matrix = functions.make_random_dichs(l, N)
    print('Code Matrix shape == ({},{})'.format(l, code_matrix.shape[1]))
    # The test split is passed alongside the train split; presumably it is used
    # to score each dichotomy classifier -- TODO confirm in functions.train_dichs.
    dich_classifiers = functions.train_dichs(code_matrix, X_train, y_train, 
                                             X_test, y_test, BaseClassifier)
    for score_type in wtypes:
        weight_type = None
        preds, num_real_dich = functions.predict_all(X_val, dich_classifiers, code_matrix, score_type, weight_type)
        acc = accuracy_score(preds, y_val)
        accs.append(acc)
    # num_real_dich is the value returned by the last predict_all call above.
    accs.append(num_real_dich)
    attempts_data.append(accs)
#             print(score_type, weight_type, accuracy_score(preds, y_val))

In [None]:
# Label the per-attempt columns ('sNone', 'saccuracy', ...) and show summary statistics.
wtypes = [None, 'accuracy', 'f1', 'confusion_list', 'num_real_dich']
df_attempts = pd.DataFrame(attempts_data)
df_attempts.columns = ['s' + str(label) for label in wtypes]
df_attempts.describe()

In [None]:
# Combined run: repeat the random-dichotomy experiment for every N in subset_mid.
attempts_data = []

# NOTE(review): `subset` is assigned here but the loop below iterates over subset_mid.
subset = np.linspace(20, 55, 8, dtype=int)
for N in subset_mid: # number of dichotomies
    N_attempts = 10
    l = np.unique(target).size  # number of distinct classes
    wtypes = [None, 'accuracy', 'f1', 'confusion_list']  # score_type values passed to predict_all
    for i in tqdm(range(N_attempts)):
        accs = []
        code_matrix = functions.make_random_dichs(l, N)
        print('Code Matrix shape == ({},{})'.format(l, code_matrix.shape[1]))
        dich_classifiers = functions.train_dichs(code_matrix, X_train, y_train, 
                                                 X_test, y_test, BaseClassifier)
        for score_type in wtypes:
            weight_type = None
            preds, num_real_dich = functions.predict_all(X_val, dich_classifiers, code_matrix, score_type, weight_type)
            acc = accuracy_score(preds, y_val)
            accs.append(acc)
        # Last column of the row: num_real_dich from the final predict_all call.
        accs.append(num_real_dich)
        attempts_data.append(accs)
    #             print(score_type, weight_type, accuracy_score(preds, y_val))


In [None]:
# Mean and standard deviation of every accuracy column, per number of dichotomies.
cols = ['ecoc', 'accuracy', 'f1', 'confusion_list', 'num_real_dich']
df_attempts = pd.DataFrame(attempts_data, columns=cols)

gb = df_attempts.groupby('num_real_dich').agg(['mean', 'std'])
# Flatten the (metric, statistic) column pairs into names such as 'ecoc_mean'.
gb.columns = ['{}_{}'.format(metric, stat) for metric, stat in gb.columns]
gb = gb.reset_index()
gb

# Отбор зазором

In [None]:
# Same protocol as the random-dichotomy run, but predict_all is called with
# weight_type=-1 (presumably this enables the margin-based selection this
# section is about -- TODO confirm in functions.predict_all).
attempts_data = []
N_attempts = 10
l = np.unique(target).size  # number of distinct classes
N = 100 # number of dichotomies
wtypes = [None, 'accuracy', 'f1', 'confusion_list']  # score_type values passed to predict_all
for i in tqdm(range(N_attempts)):
    accs = []
    code_matrix = functions.make_random_dichs(l, N)
#     print('Code Matrix shape == ({},{})'.format(l, code_matrix.shape[1]))
    dich_classifiers = functions.train_dichs(code_matrix, X_train, y_train, 
                                             X_test, y_test, BaseClassifier)
    for score_type in wtypes:
        preds, num_real_dich = functions.predict_all(X_val, dich_classifiers, 
                                      code_matrix, score_type, weight_type=-1)
        acc = accuracy_score(preds, y_val)
        accs.append(acc)
    # num_real_dich is whatever the last predict_all call of the inner loop returned.
    accs.append(num_real_dich)
    attempts_data.append(accs)

In [None]:
# Label the per-attempt columns ('sNone', 'saccuracy', ...) and show summary statistics.
wtypes = [None, 'accuracy', 'f1', 'confusion_list', 'num_real_dich']
df_attempts = pd.DataFrame(attempts_data)
df_attempts.columns = ['s' + str(label) for label in wtypes]
df_attempts.describe()

In [28]:
# Combined run: repeat the weight_type=-1 experiment for every N in subs.
attempts_data = []
subs = [220, 240, 280, 300, 350, 400]
for N in subs: # number of dichotomies
    N_attempts = 10
    l = np.unique(target).size  # number of distinct classes
    wtypes = [None, 'accuracy', 'f1', 'confusion_list']  # score_type values passed to predict_all
    for i in tqdm(range(N_attempts)):
        accs = []
        code_matrix = functions.make_random_dichs(l, N)
    #     print('Code Matrix shape == ({},{})'.format(l, code_matrix.shape[1]))
        dich_classifiers = functions.train_dichs(code_matrix, X_train, y_train, 
                                                 X_test, y_test, BaseClassifier)
        for score_type in wtypes:
            preds, num_real_dich = functions.predict_all(X_val, dich_classifiers, 
                                          code_matrix, score_type, weight_type=-1)
            acc = accuracy_score(preds, y_val)
            accs.append(acc)
        # Rows are appended in order, N_attempts consecutive rows per value of N.
        accs.append(num_real_dich)
        attempts_data.append(accs)


  0%|          | 0/10 [00:00<?, ?it/s][A

Adding dich:   0%|          | 0/220 [00:00<?, ?it/s][A[A

Adding dich: 100%|██████████| 220/220 [00:00<00:00, 35137.54it/s][A[A

  'precision', 'predicted', average, warn_for)


Training dich classifiers:  21%|██        | 46/220 [00:00<00:00, 450.23it/s][A[A

Training dich classifiers:  43%|████▎     | 95/220 [00:00<00:00, 469.35it/s][A[A

Training dich classifiers:  70%|███████   | 155/220 [00:00<00:00, 511.32it/s][A[A

Training dich classifiers: 100%|██████████| 220/220 [00:00<00:00, 543.94it/s][A[A

[A[A
 10%|█         | 1/10 [00:10<01:30, 10.02s/it][A

Adding dich:   0%|          | 0/220 [00:00<?, ?it/s][A[A

Adding dich: 100%|██████████| 220/220 [00:00<00:00, 67402.99it/s][A[A

Training dich classifiers:   0%|          | 0/220 [00:00<?, ?it/s][A[A

Training dich classifiers:  30%|██▉       | 65/220 [00:00<00:00, 647.86it/s][A[A

Training dich classifiers:  60%|█████▉    | 131/220 [00:00<00:00, 652.32it/s][A[A

Tr

Training dich classifiers: 100%|██████████| 240/240 [00:00<00:00, 637.61it/s][A[A
 30%|███       | 3/10 [00:34<01:19, 11.37s/it][A

Adding dich:   0%|          | 0/240 [00:00<?, ?it/s][A[A

Adding dich: 100%|██████████| 240/240 [00:00<00:00, 66186.66it/s][A[A

Training dich classifiers:   0%|          | 0/240 [00:00<?, ?it/s][A[A

Training dich classifiers:  27%|██▋       | 64/240 [00:00<00:00, 630.64it/s][A[A

Training dich classifiers:  54%|█████▍    | 130/240 [00:00<00:00, 639.75it/s][A[A

Training dich classifiers:  82%|████████▏ | 196/240 [00:00<00:00, 644.42it/s][A[A

Training dich classifiers: 100%|██████████| 240/240 [00:00<00:00, 644.84it/s][A[A
 40%|████      | 4/10 [00:47<01:11, 11.96s/it][A

Adding dich:   0%|          | 0/240 [00:00<?, ?it/s][A[A

Adding dich: 100%|██████████| 240/240 [00:00<00:00, 68163.12it/s][A[A

Training dich classifiers:   0%|          | 0/240 [00:00<?, ?it/s][A[A

Training dich classifiers:  27%|██▋       | 65/240 [00:00<00:0

Training dich classifiers:  70%|██████▉   | 195/280 [00:00<00:00, 640.79it/s][A[A

Training dich classifiers:  92%|█████████▏| 258/280 [00:00<00:00, 638.08it/s][A[A

Training dich classifiers: 100%|██████████| 280/280 [00:00<00:00, 633.36it/s][A[A
 60%|██████    | 6/10 [01:16<00:50, 12.72s/it][A

Adding dich:   0%|          | 0/280 [00:00<?, ?it/s][A[A

Adding dich: 100%|██████████| 280/280 [00:00<00:00, 61387.54it/s][A[A

Training dich classifiers:   0%|          | 0/280 [00:00<?, ?it/s][A[A

Training dich classifiers:  22%|██▎       | 63/280 [00:00<00:00, 624.89it/s][A[A

Training dich classifiers:  45%|████▌     | 127/280 [00:00<00:00, 631.19it/s][A[A

Training dich classifiers:  69%|██████▊   | 192/280 [00:00<00:00, 637.55it/s][A[A

Training dich classifiers:  92%|█████████▏| 258/280 [00:00<00:00, 640.80it/s][A[A

Training dich classifiers: 100%|██████████| 280/280 [00:00<00:00, 637.34it/s][A[A
 70%|███████   | 7/10 [01:29<00:38, 12.73s/it][A

Adding dich:  

Training dich classifiers:  44%|████▎     | 131/300 [00:00<00:00, 647.03it/s][A[A

Training dich classifiers:  65%|██████▌   | 195/300 [00:00<00:00, 643.09it/s][A[A

Training dich classifiers:  87%|████████▋ | 260/300 [00:00<00:00, 643.72it/s][A[A

Training dich classifiers: 100%|██████████| 300/300 [00:00<00:00, 643.28it/s][A[A
 80%|████████  | 8/10 [01:52<00:28, 14.03s/it][A

Adding dich:   0%|          | 0/300 [00:00<?, ?it/s][A[A

Adding dich: 100%|██████████| 300/300 [00:00<00:00, 68917.25it/s][A[A

Training dich classifiers:   0%|          | 0/300 [00:00<?, ?it/s][A[A

Training dich classifiers:  22%|██▏       | 65/300 [00:00<00:00, 640.02it/s][A[A

Training dich classifiers:  43%|████▎     | 130/300 [00:00<00:00, 640.04it/s][A[A

Training dich classifiers:  65%|██████▍   | 194/300 [00:00<00:00, 637.47it/s][A[A

Training dich classifiers:  86%|████████▌ | 258/300 [00:00<00:00, 638.28it/s][A[A

Training dich classifiers: 100%|██████████| 300/300 [00:00<00:00

Training dich classifiers:  15%|█▌        | 53/350 [00:00<00:00, 518.92it/s][A[A

Training dich classifiers:  29%|██▉       | 102/350 [00:00<00:00, 501.93it/s][A[A

Training dich classifiers:  47%|████▋     | 164/350 [00:00<00:00, 539.19it/s][A[A

Training dich classifiers:  65%|██████▌   | 228/350 [00:00<00:00, 562.87it/s][A[A

Training dich classifiers:  84%|████████▎ | 293/350 [00:00<00:00, 579.57it/s][A[A

Training dich classifiers: 100%|██████████| 350/350 [00:00<00:00, 586.82it/s][A[A
 90%|█████████ | 9/10 [02:28<00:16, 16.50s/it][A

Adding dich:   0%|          | 0/350 [00:00<?, ?it/s][A[A

Adding dich: 100%|██████████| 350/350 [00:00<00:00, 69636.47it/s][A[A

Training dich classifiers:   0%|          | 0/350 [00:00<?, ?it/s][A[A

Training dich classifiers:  14%|█▍        | 50/350 [00:00<00:00, 490.67it/s][A[A

Training dich classifiers:  31%|███       | 108/350 [00:00<00:00, 534.47it/s][A[A

Training dich classifiers:  49%|████▉     | 173/350 [00:00<00:00,

Adding dich:   0%|          | 0/400 [00:00<?, ?it/s][A[A

Adding dich: 100%|██████████| 400/400 [00:00<00:00, 59199.77it/s][A[A

Training dich classifiers:   0%|          | 0/400 [00:00<?, ?it/s][A[A

Training dich classifiers:  16%|█▋        | 65/400 [00:00<00:00, 640.57it/s][A[A

Training dich classifiers:  32%|███▏      | 129/400 [00:00<00:00, 635.54it/s][A[A

Training dich classifiers:  48%|████▊     | 193/400 [00:00<00:00, 634.79it/s][A[A

Training dich classifiers:  64%|██████▍   | 256/400 [00:00<00:00, 631.71it/s][A[A

Training dich classifiers:  80%|████████  | 321/400 [00:00<00:00, 634.08it/s][A[A

Training dich classifiers:  96%|█████████▋| 385/400 [00:00<00:00, 634.66it/s][A[A

Training dich classifiers: 100%|██████████| 400/400 [00:00<00:00, 632.84it/s][A[A
 90%|█████████ | 9/10 [02:47<00:18, 18.65s/it][A

Adding dich:   0%|          | 0/400 [00:00<?, ?it/s][A[A

Adding dich: 100%|██████████| 400/400 [00:00<00:00, 68233.35it/s][A[A

Training dich cl

In [29]:
# Rows were appended N_attempts at a time, one batch per swept value, so integer
# division of the row number by N_attempts recovers the batch each row belongs to.
cols = ['ecoc', 'accuracy', 'f1', 'confusion_list', 'num_real_dich']
df_attempts = pd.DataFrame(attempts_data, columns=cols)
df_attempts['gb_index'] = df_attempts.index // N_attempts

gb = df_attempts.groupby('gb_index').agg(['mean', 'std'])
# Flatten the (metric, statistic) column pairs into names such as 'ecoc_mean'.
gb.columns = ['{}_{}'.format(metric, stat) for metric, stat in gb.columns]
gb = gb.reset_index()
gb

Unnamed: 0,gb_index,ecoc_mean,ecoc_std,accuracy_mean,accuracy_std,f1_mean,f1_std,confusion_list_mean,confusion_list_std,num_real_dich_mean,num_real_dich_std
0,0,0.332552,0.034088,0.472396,0.050185,0.38125,0.094329,0.772396,0.018986,116.5,4.503085
1,1,0.355469,0.056487,0.465625,0.068635,0.407292,0.043597,0.77474,0.026621,125.6,9.512565
2,2,0.326823,0.037362,0.44974,0.037358,0.384115,0.062936,0.773438,0.015912,133.6,12.447222
3,3,0.338281,0.03512,0.470312,0.070918,0.39974,0.045921,0.763021,0.02698,142.4,14.886235
4,4,0.331771,0.071752,0.461719,0.074035,0.429167,0.059165,0.760156,0.036488,147.8,16.430324
5,5,0.330729,0.049227,0.46875,0.044143,0.396615,0.035496,0.740104,0.041915,150.9,9.67758


# Дихотомии, где остались лучшие по критерию

In [None]:
# Train N random dichotomies, keep only the num_real_dich with the highest
# stored 'f1' value, and evaluate the reduced code matrix on the validation split.
attempts_data = []
N_attempts = 10
l = np.unique(target).size  # number of distinct classes
N = 300 # number of dichotomies generated
num_real_dich = 100 # number of dichotomies that are kept
wtypes = [None, 'accuracy', 'f1', 'confusion_list']  # score_type values passed to predict_all
for i in tqdm(range(N_attempts)):
    accs = []
    code_matrix = functions.make_random_dichs(l, N)
    print('Code Matrix shape == ({},{})'.format(l, code_matrix.shape[1]))
    dich_classifiers = functions.train_dichs(code_matrix, X_train, y_train, 
                                             X_test, y_test, BaseClassifier)
    
    # Each element of dich_classifiers is indexed by the key 'f1' here.
    metric_map = np.array([d['f1'] for d in dich_classifiers])
    # argsort is ascending, so the tail holds the best-scoring indices;
    # np.sort puts them back into original column order.
    top_dich = np.sort(np.argsort(metric_map)[-num_real_dich:])
    dich_classifiers = [d for i,d in enumerate(dich_classifiers) if i in top_dich]
    # Select the matching columns of the code matrix.
    code_matrix = code_matrix.T[top_dich].T
    
    for score_type in wtypes:
        weight_type = None
        preds, _ = functions.predict_all(X_val, dich_classifiers, code_matrix, score_type, weight_type)
        acc = accuracy_score(preds, y_val)
        accs.append(acc)
    # Here the last column is the configured num_real_dich, not a value returned by predict_all.
    accs.append(num_real_dich)
    attempts_data.append(accs)
#             print(score_type, weight_type, accuracy_score(preds, y_val))

In [None]:
# Label the per-attempt columns ('sNone', 'saccuracy', ...) and show summary statistics.
# The label list is rebuilt from scratch, as the other summary cells do, so that
# re-running this cell does not keep appending 'num_real_dich' to wtypes (which
# would give six labels for five columns and make the assignment below raise).
wtypes = [None, 'accuracy', 'f1', 'confusion_list']
wtypes += ['num_real_dich']
df_attempts = pd.DataFrame(attempts_data)
df_attempts.columns = ['s{}'.format(i1) for i1 in wtypes]
df_attempts.describe()

In [None]:
%%time
# Sweep over how many of the N generated dichotomies are kept (values of subset_mid).
attempts_data = []
N = 400  # number of dichotomies generated per attempt
l = np.unique(target).size  # number of distinct classes
for num_real_dich in subset_mid: # number of dichotomies that are kept
    N_attempts = 10
    wtypes = [None, 'accuracy', 'f1', 'confusion_list']  # score_type values passed to predict_all
    for i in tqdm(range(N_attempts)):
        accs = []
        code_matrix = functions.make_random_dichs(l, N)
        print('Code Matrix shape == ({},{})'.format(l, code_matrix.shape[1]))
        dich_classifiers = functions.train_dichs(code_matrix, X_train, y_train, 
                                                 X_test, y_test, BaseClassifier)

        # Keep the num_real_dich dichotomies with the largest stored 'f1' value,
        # preserving their original column order.
        metric_map = np.array([d['f1'] for d in dich_classifiers])
        top_dich = np.sort(np.argsort(metric_map)[-num_real_dich:])
        dich_classifiers = [d for i,d in enumerate(dich_classifiers) if i in top_dich]
        code_matrix = code_matrix.T[top_dich].T

        for score_type in wtypes:
            weight_type = None
            preds, _ = functions.predict_all(X_val, dich_classifiers, code_matrix, score_type, weight_type)
            acc = accuracy_score(preds, y_val)
            accs.append(acc)
        # Last column is the swept num_real_dich, used as the group key afterwards.
        accs.append(num_real_dich)
        attempts_data.append(accs)

In [None]:
# Mean and standard deviation of every accuracy column, per number of kept dichotomies.
cols = ['ecoc', 'accuracy', 'f1', 'confusion_list', 'num_real_dich']
df_attempts = pd.DataFrame(attempts_data, columns=cols)

gb = df_attempts.groupby('num_real_dich').agg(['mean', 'std'])
# Flatten the (metric, statistic) column pairs into names such as 'ecoc_mean'.
gb.columns = ['{}_{}'.format(metric, stat) for metric, stat in gb.columns]
gb = gb.reset_index()
gb

# Локальный метод оптимизации

In [None]:
def score_function(cur_dich, code_matrix):
    """Cluster-based score of a candidate dichotomy.

    Parameters
    ----------
    cur_dich : array-like indexed by class label
        cur_dich[k] is the side of the dichotomy that class k is assigned to.
    code_matrix : unused
        Accepted so that this variant has the same signature as the other
        score_function definitions in the notebook.

    Returns
    -------
    float
        ``-inf`` for a trivial dichotomy (all classes on one side); otherwise
        the negated ``functions.cluster_score(..., score_type='trace_w')``
        computed on the training split with the induced binary labels.
    """
    # cluster
    if cur_dich.max() == cur_dich.min(): #trivial dich
        # np.inf rather than the bare `inf` that only exists via `%pylab inline`.
        return -np.inf
    # Map every training sample's class label to its side of the dichotomy.
    dich_labels = np.array([cur_dich[i] for i in y_train])
    return -functions.cluster_score(X_train, dich_labels, score_type='trace_w')

In [None]:
def score_function(cur_dich, code_matrix):
    """Accuracy-based score of a candidate dichotomy.

    A fresh LogisticRegression is fitted on the training split relabelled by
    ``cur_dich`` and scored on the test split relabelled the same way.
    ``code_matrix`` is not used. Returns 0 when every training sample ends up
    on the same side of the dichotomy.
    """
    # accuracy
    train_sides = np.array([cur_dich[label] for label in y_train])
    if train_sides.max() == train_sides.min():  # trivial dich
        return 0
    model = LogisticRegression()
    model.fit(X_train, train_sides)
    predicted_sides = model.predict(X_test)
    test_sides = np.array([cur_dich[label] for label in y_test])
    return accuracy_score(test_sides, predicted_sides)

In [None]:
def score_function(cur_dich, code_matrix):
    """F1-based score of a candidate dichotomy.

    A fresh LogisticRegression is fitted on the test split relabelled by
    ``cur_dich`` and its F1 is measured on that same split. ``code_matrix`` is
    not used. Returns 0 when every sample ends up on the same side.
    """
    # f1
    # An earlier revision fitted on X_train / y_train; the author's note left
    # at this spot reads: "POSSIBLY THERE WAS SEVERE OVERFITTING HERE!!!"
    # NOTE(review): the classifier is both fitted and scored on the test split,
    # so the returned F1 is an in-sample figure -- confirm this is intended.
    test_sides = np.array([cur_dich[label] for label in y_test])
    if test_sides.max() == test_sides.min():  # trivial dich
        return 0
    model = LogisticRegression()
    model.fit(X_test, test_sides)
    predicted_sides = model.predict(X_test)
    return f1_score(test_sides, predicted_sides)

In [None]:
# матрица неточностей
from sklearn.metrics import confusion_matrix
def score_function(cur_dich, code_matrix):
    """Confusion-matrix score of a candidate dichotomy.

    Parameters
    ----------
    cur_dich : numpy array indexed by class label
        0/1 side of the dichotomy for every class; a flat vector and a column
        vector are both accepted.
    code_matrix : array or None
        Code matrix built so far; it is used to train the current ensemble.

    Returns
    -------
    0 for a trivial dichotomy or when no code matrix exists yet; otherwise the
    number of test samples that the current ensemble confuses between two
    classes lying on different sides of ``cur_dich``.
    """
    if cur_dich.max() == cur_dich.min():
        return 0
    if code_matrix is None:
        return 0
    dich_classifiers = functions.train_dichs(code_matrix, X_train, y_train, 
                                             X_test, y_test, BaseClassifier)
    preds, num_real_dich = functions.predict_all(X_test, dich_classifiers, code_matrix, None, None)
    sides = np.asarray(cur_dich).reshape(-1)
    # Fix the label set so that C is always (n_classes, n_classes), matching D.
    C = confusion_matrix(y_test, preds, labels=np.arange(sides.size))
    # D[i, j] == 1 iff classes i and j are separated by the dichotomy. The old
    # `cur_dich + cur_dich.T` is a no-op transpose for a 1-D vector, which made
    # D all zeros and the score always 0; build the pairwise sum explicitly.
    D = (sides[:, None] + sides[None, :]) % 2
    q = (C*D).sum()
    return q

In [None]:
# Build the code matrix with the local optimisation routine, guided by
# score_function. Four cells define a function of that name, so the one in
# effect is whichever definition cell was executed last.
attempts_data = []
N_attempts = 10
l = np.unique(target).size  # number of distinct classes
N = 10 # number of dichotomies
wtypes = [None, 'accuracy', 'f1', 'confusion_list']  # score_type values passed to predict_all
for i in tqdm(range(N_attempts)):
    accs = []
    
    code_matrix = functions.make_code_matrix_local(l, N, score_function, 0)
    # Regenerate while any column is trivial: a column sum of l means all ones,
    # a column sum of 0 means all zeros.
    while code_matrix.sum(axis=0).max() == l or code_matrix.sum(axis=0).min() == 0:
        code_matrix = functions.make_code_matrix_local(l, N, score_function, 0)
    print('Code Matrix shape == ({},{})'.format(l, code_matrix.shape[1]))
    dich_classifiers = functions.train_dichs(code_matrix, X_train, y_train, 
                                             X_test, y_test, BaseClassifier)
    for score_type in wtypes:
        weight_type = None
        preds, num_real_dich = functions.predict_all(X_val, dich_classifiers, code_matrix, score_type, weight_type)
        acc = accuracy_score(preds, y_val)
        accs.append(acc)
    # num_real_dich is the value returned by the last predict_all call above.
    accs.append(num_real_dich)
    attempts_data.append(accs)

In [None]:
# Label the per-attempt columns ('sNone', 'saccuracy', ...) and show summary statistics.
wtypes = [None, 'accuracy', 'f1', 'confusion_list', 'num_real_dich']
df_attempts = pd.DataFrame(attempts_data)
df_attempts.columns = ['s' + str(label) for label in wtypes]
df_attempts.describe()

In [None]:
# Sweep the locally optimised code matrix over every N in subset_mid.
attempts_data = []
N_attempts = 10
l = np.unique(target).size  # number of distinct classes
wtypes = [None, 'accuracy', 'f1', 'confusion_list']  # score_type values passed to predict_all

for N in subset_mid: 
    for i in tqdm(range(N_attempts)):
        accs = []
        code_matrix = functions.make_code_matrix_local(l, N, score_function, 0)
        # Regenerate while any column is trivial (all ones -> sum == l, all zeros -> sum == 0).
        while code_matrix.sum(axis=0).max() == l or code_matrix.sum(axis=0).min() == 0:
            code_matrix = functions.make_code_matrix_local(l, N, score_function, 0)
        print('Code Matrix shape == ({},{})'.format(l, code_matrix.shape[1]))
        dich_classifiers = functions.train_dichs(code_matrix, X_train, y_train, 
                                                 X_test, y_test, BaseClassifier)
        for score_type in wtypes:
            weight_type = None
            preds, num_real_dich = functions.predict_all(X_val, dich_classifiers, code_matrix, score_type, weight_type)
            acc = accuracy_score(preds, y_val)
            accs.append(acc)
        # Last column: num_real_dich from the final predict_all call; used as the group key afterwards.
        accs.append(num_real_dich)
        attempts_data.append(accs)

In [None]:
# Mean and standard deviation of every accuracy column, per number of dichotomies.
cols = ['ecoc', 'accuracy', 'f1', 'confusion_list', 'num_real_dich']
df_attempts = pd.DataFrame(attempts_data, columns=cols)

gb = df_attempts.groupby('num_real_dich').agg(['mean', 'std'])
# Flatten the (metric, statistic) column pairs into names such as 'ecoc_mean'.
gb.columns = ['{}_{}'.format(metric, stat) for metric, stat in gb.columns]
gb = gb.reset_index()
gb