## Планы:
1. Гауссианы?
2. Нейронки из статей
3. Автоэнкодер?
4. Бустинг градиентный
5. Какой-нибудь другой feature engineering?
6. Посмотреть ROC AUC
7. Проверить устойчивость модели при изменении интенсивности!

## Выводы:
1. Данные становятся менее информативными при урезании/дополнении пиков, что отражается на качестве работы модели
2. Добавление в датасет информации и количестве значимых гауссианов ухудшает качество, что может свидетельствовать о возможно неправильном алгоритме разделения гауссианов на значимые и незначимые

## Инструкция для запускающих:
Инстукция предназначена для запуска на сервере 192.168.17.10
1. Убедись, что в качестве kernel'а выбрано raman-spec

In [16]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

import random
# peaks search by gauss decomposition
# import gausspy
# import gausspy.gp as gp
# import pickle

from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
torch.cuda.is_available()

False

In [3]:
# !pip install plotly
# !pip install pandas
# !conda install pytorch torchvision torchaudio -c pytorch -y
# !pip install -U scikit-learn

## Data loading/proccessing

Загрузка данных

In [4]:
# # для работы в колабе 
# from google.colab import drive
# drive.mount('/content/drive')

### 38 peaks data proccessing

In [5]:
fixed_fg_file, fixed_ga_file = "dataSrc/peaks-fg-38nonsign.csv", "dataSrc/peaks-ga-38nonsign.csv"

data_fixed_fg = pd.read_csv(fixed_fg_file, sep=";", header=None)
data_fixed_ga = pd.read_csv(fixed_ga_file, sep=";", header=None)
columns = data_fixed_fg.columns
print(f"""data with fixed peaks FG shape: {data_fixed_fg.shape}
data with fixed peaks GA shape: {data_fixed_ga.shape}""")

data with fixed peaks FG shape: (569, 39)
data with fixed peaks GA shape: (89, 39)


In [6]:
y_fg = data_fixed_fg[columns[0]].values
print(f"old classes: {set(y_fg)}")
for index, element in enumerate(set(y_fg)):
    y_fg[y_fg == element] = index
print(f"new classes: {set(y_fg)}\n")

y_ga = data_fixed_ga[columns[0]].values
print(f"old ga classes: {set(y_ga)}")
for index, element in enumerate(set(data_fixed_ga[columns[0]])):
    y_ga[y_ga == element] = index + len(set(y_fg))
print(f"new ga classes: {set(y_ga)}")

old classes: {1, 2, 3, 4, 5, 7, 8, 9, 10, 11}
new classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

old ga classes: {2, 3, 5, 6}
new ga classes: {10, 11, 12, 13}


In [7]:
group1 = [0, 1, 2, 3, 4]
group2 = [5, 6, 7, 8, 9]
group3 = [10, 11]
group4 = [12, 13]
groups = [group1, group2, group3, group4]

def add_group(y, groups, add=0):
    new_y = np.zeros((y.shape[0], 2))
    new_y[:, 1] = y
    for group_num, group in enumerate(groups):
        for class_num in group:
            new_y[y==class_num] = np.array([group_num + add, class_num])
    return new_y

y_ga = add_group(y_ga, (group3, group4), add=2)
y_fg = add_group(y_fg, (group1, group2))
print(f"Now, shapes of y_fg and ga is respectively: {y_ga.shape}, {y_fg.shape}")

Now, shapes of y_fg and ga is respectively: (89, 2), (569, 2)


In [8]:
X_fixed = np.concatenate(
    (
    data_fixed_fg[columns[1:]],
    data_fixed_ga[columns[1:]]
    )
    , axis=0
)
y_fixed = np.concatenate(
    (
    y_fg,
    y_ga
    )
    , axis=0
)

print(f"shapes of X and y is respectively: {X_fixed.shape}, {y_fixed.shape}")

shapes of X and y is respectively: (658, 38), (658, 2)


### 1-40 peaks data proccessing

In [9]:
def read_file(filename, sep=';'):
    result = []
    with open(filename, 'r') as f:
        lines = f.read().splitlines()
#         print(len(strings))
    for line in lines:
        result.append(
            list(
                map(
                    float, 
                    line.split(sep)
                )
            )
        )
    return result

# read_file(nonfixed_fg_file)

In [10]:
nonfixed_fg_file, nonfixed_ga_file = "dataSrc/peaks-fg-1-40.csv", "dataSrc/peaks-ga-1-40.csv"

data_nonfixed_fg, data_nonfixed_ga = read_file(nonfixed_fg_file, sep=";"), read_file(nonfixed_ga_file, sep=";")

print(f"""data with nonfixed peaks FG shape: {len(data_nonfixed_fg)}
data with nonfixed peaks GA shape: {len(data_nonfixed_ga)}""")
data_nonfixed = data_nonfixed_fg + data_nonfixed_ga

X_nonfixed = [[len(x)] + x[1:] for x in data_nonfixed]
y_nonfixed = y_fixed.copy()
# y_nonfixed = [x[0] for data_nonfixed]

data with nonfixed peaks FG shape: 569
data with nonfixed peaks GA shape: 89


#### Создадим урезанный и дополненный датасеты:

In [11]:
min_size = 1e4
min_peaks = []
min_index = 0
max_size = 0
max_peaks = []
max_index = 0
for index, peaks in enumerate(X_nonfixed):
    if min_size > len(peaks):
        min_size = len(peaks)
        min_peaks = peaks
        min_index = index
    if max_size < len(peaks):
        max_size = len(peaks)
        max_peaks = peaks
        max_index = index

print(f"""min_size = {min_size}\n
max_size = {max_size}""")

min_size = 16

max_size = 40


In [12]:
def cut_peaks(X, min_size):
    new_data = []
    for example in X:
#         print(example)
        new_data.append(example[:min_size])
    return np.array(new_data)

X_nonfixed_cut = cut_peaks(X_nonfixed, min_size)
print(X_nonfixed_cut.shape)


def fill_peaks(X, max_size):
    new_data = []
    for example in X:
#         print(example)
        new_data.append(example+[0 for i in range(max_size - len(example))])
    return np.array(new_data)

X_nonfixed_filled = fill_peaks(X_nonfixed, max_size)
print(X_nonfixed_filled.shape)

(658, 16)
(658, 40)


In [19]:
array1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
array2 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
random_int = random.randint(0, 10000)
np.random.seed(seed=random_int)
np.random.shuffle(array1)
np.random.seed(seed=random_int)
np.random.shuffle(array2)
array1, array2

(array([2, 1, 3, 7, 6, 9, 8, 5, 4]), array([2, 1, 3, 7, 6, 9, 8, 5, 4]))

## Применение методов ml

In [24]:
def train_template(model, data, y, description='data_cut dataset', class_type='group'):
#     shuffle data
    random_int = random.randint(0, 10000)
    np.random.seed(seed=random_int)
    np.random.shuffle(data)
    np.random.seed(seed=random_int)
    np.random.shuffle(y)
    
    if class_type == 'group':
        scores = cross_val_score(
            model, 
            data, 
            y[:, 0], 
            cv=5
        )
    else:
        scores = cross_val_score(
            model, 
            data, 
            y[:, 1], 
            cv=5
        )
    scores = (round(scores.min(), 3), round(scores.mean(), 3), round(scores.max(), 3))
    print(f'min/mean/max accuracy of {class_type} prediction on {description}: {scores[0], scores[1], scores[2]}')

### Random Forest

In [27]:
def random_forest_train(data, y, num_features, description='data_cut dataset', class_type='group'):
    if class_type not in ('group', 'class'):
        raise Exception("class_type may be equal only 'group' or 'class'")
    exrt_crystal_system = ExtraTreesClassifier(
        n_estimators=2000, 
        max_depth=40, 
        max_features=num_features, 
        n_jobs=-1, 
    #     random_state=random_state,
        warm_start=False
    )
    
    train_template(exrt_crystal_system, data, y, description, class_type)

In [28]:
# random_forest_train(
#     X_nonfixed_cut, 
#     y_nonfixed, 
#     X_nonfixed_cut.shape[1], 
#     class_type='group', 
#     description='cut 15 peaks dataset'
# )
# random_forest_train(
#     X_nonfixed_cut, 
#     y_nonfixed, 
#     X_nonfixed_cut.shape[1], 
#     class_type='class', 
#     description='cut 15 peaks dataset'
# )


# random_forest_train(
#     X_nonfixed_filled, 
#     y_nonfixed, 
#     X_nonfixed_filled.shape[1], 
#     class_type='group', 
#     description='filled 40 peaks dataset'
# )
# random_forest_train(
#     X_nonfixed_filled, 
#     y_nonfixed, 
#     X_nonfixed_filled.shape[1], 
#     class_type='class', 
#     description='filled 40 peaks dataset'
# )

random_forest_train(
    X_fixed, 
    y_fixed, 
    X_fixed.shape[1], 
    class_type='group', 
    description='38 peaks dataset'
)
random_forest_train(
    X_fixed, 
    y_fixed, 
    X_fixed.shape[1], 
    class_type='class', 
    description='38 peaks dataset'
)

min/mean/max accuracy of group prediction on 38 peaks dataset: (0.97, 0.988, 0.992)
min/mean/max accuracy of class prediction on 38 peaks dataset: (0.871, 0.897, 0.947)


### Градиентный бустинг от XGBoost

In [29]:
# !pip install xgboost
import xgboost as xgb

def xgboost_train(data, y, num_features, description='data_cut dataset', class_type='group'):
    if class_type not in ('group', 'class'):
        raise Exception("class_type may be equal only 'group' or 'class'")
    
    xg_clsfr = xgb.XGBClassifier(
        objective='multi:softmax', 
        learning_rate = 0.01,
#         max_depth = 50, 
#         reg_alpha = 0, 
#         reg_lambda=0, 
        n_estimators = 1000, 
        eval_metric='merror'
#         num_class=7, 
#         tree_method = "hist", 
#         verbosity=0, 
#         grow_policy="lossguide"
    )
    train_template(xg_clsfr, data, y, description, class_type)

In [30]:
# in order to ignore all the warning messages
import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')

xgboost_train(X_nonfixed_cut, 
              y_nonfixed, 
              X_nonfixed_cut.shape[1], 
              class_type='group', 
              description='cut 15 peaks dataset')
xgboost_train(X_nonfixed_cut, 
              y_nonfixed, 
              X_nonfixed_cut.shape[1], 
              class_type='class', 
              description='cut 15 peaks dataset')

xgboost_train(X_nonfixed_filled, 
              y_nonfixed, 
              X_nonfixed_filled.shape[1], 
              class_type='group', 
              description='filled 40 peaks dataset')
xgboost_train(X_nonfixed_filled, 
              y_nonfixed, 
              X_nonfixed_filled.shape[1], 
              class_type='class', 
              description='filled 40 peaks dataset')

xgboost_train(X_fixed, 
              y_fixed, 
              X_fixed.shape[1], 
              class_type='group', 
              description='38 peaks dataset')
xgboost_train(X_fixed, 
              y_fixed, 
              X_fixed.shape[1], 
              class_type='class', 
              description='38 peaks dataset')

min/mean/max accuracy of group prediction on cut 15 peaks dataset: (0.435, 0.462, 0.508)
min/mean/max accuracy of class prediction on cut 15 peaks dataset: (0.076, 0.079, 0.092)
min/mean/max accuracy of group prediction on filled 40 peaks dataset: (0.409, 0.462, 0.496)
min/mean/max accuracy of class prediction on filled 40 peaks dataset: (0.076, 0.096, 0.122)
min/mean/max accuracy of group prediction on 38 peaks dataset: (0.955, 0.985, 1.0)
min/mean/max accuracy of class prediction on 38 peaks dataset: (0.779, 0.831, 0.894)


### Градиентный бустинг от CatBoost

In [31]:
# !pip install catboost
import catboost

def catboost_train(data, y, num_features, description='data_cut dataset', class_type='group'):
    if class_type not in ('group', 'class'):
        raise Exception("class_type may be equal only 'group' or 'class'")
    
#     yes, I love cats
    kitty_model = catboost.CatBoostClassifier(
        iterations = 1000,
#         if you want to see logs, set 'veerbose' to 1 or 2
        verbose=0
    )
    
    train_template(kitty_model, data, y, num_features, description, class_type)

In [32]:
# in order to ignore all the warning messages
# import warnings
# warnings.filterwarnings('ignore')
catboost_train(X_nonfixed_cut, 
              y_nonfixed, 
              X_nonfixed_cut.shape[1], 
              class_type='group', 
              description='cut 15 peaks dataset')
catboost_train(X_nonfixed_cut, 
              y_nonfixed, 
              X_nonfixed_cut.shape[1], 
              class_type='class', 
              description='cut 15 peaks dataset')

catboost_train(X_nonfixed_filled, 
              y_nonfixed, 
              X_nonfixed_filled.shape[1], 
              class_type='group', 
              description='filled 40 peaks dataset')
catboost_train(X_nonfixed_filled, 
              y_nonfixed, 
              X_nonfixed_filled.shape[1], 
              class_type='class', 
              description='filled 40 peaks dataset')

catboost_train(X_fixed, 
              y_fixed, 
              X_fixed.shape[1], 
              class_type='group', 
              description='38 peaks dataset')
catboost_train(X_fixed, 
              y_fixed, 
              X_fixed.shape[1], 
              class_type='class', 
              description='38 peaks dataset')

min/mean/max accuracy of group prediction on cut 15 peaks dataset: (0.427, 0.467, 0.5)
min/mean/max accuracy of class prediction on cut 15 peaks dataset: (0.083, 0.105, 0.144)
min/mean/max accuracy of group prediction on filled 40 peaks dataset: (0.427, 0.451, 0.47)
min/mean/max accuracy of class prediction on filled 40 peaks dataset: (0.053, 0.097, 0.137)
min/mean/max accuracy of group prediction on 38 peaks dataset: (0.969, 0.988, 1.0)
min/mean/max accuracy of class prediction on 38 peaks dataset: (0.878, 0.897, 0.917)


## Полные данные о гауссианах(114 фичей)

In [34]:
fixed_fg_file, fixed_ga_file = "dataSrc/peaks-fg-114nonsign.csv", "dataSrc/peaks-ga-114nonsign.csv"

data_fixed_fg = pd.read_csv(fixed_fg_file, sep=";", header=None)
data_fixed_ga = pd.read_csv(fixed_ga_file, sep=";", header=None)
columns = data_fixed_fg.columns
print(f"""data with fixed peaks FG shape: {data_fixed_fg.shape}
data with fixed peaks GA shape: {data_fixed_ga.shape}""")

data with fixed peaks FG shape: (569, 115)
data with fixed peaks GA shape: (89, 115)


In [35]:
y_fg = data_fixed_fg[columns[0]].values
print(f"old classes: {set(y_fg)}")
for index, element in enumerate(set(y_fg)):
    y_fg[y_fg == element] = index
print(f"new classes: {set(y_fg)}\n")

y_ga = data_fixed_ga[columns[0]].values
print(f"old ga classes: {set(y_ga)}")
for index, element in enumerate(set(data_fixed_ga[columns[0]])):
    y_ga[y_ga == element] = index + len(set(y_fg))
print(f"new ga classes: {set(y_ga)}")

old classes: {1, 2, 3, 4, 5, 7, 8, 9, 10, 11}
new classes: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

old ga classes: {2, 3, 5, 6}
new ga classes: {10, 11, 12, 13}


In [36]:
group1 = [0, 1, 2, 3, 4]
group2 = [5, 6, 7, 8, 9]
group3 = [10, 11]
group4 = [12, 13]
groups = [group1, group2, group3, group4]

def add_group(y, groups, add=0):
    new_y = np.zeros((y.shape[0], 2))
    new_y[:, 1] = y
    for group_num, group in enumerate(groups):
        for class_num in group:
            new_y[y==class_num] = np.array([group_num + add, class_num])
    return new_y

y_ga = add_group(y_ga, (group3, group4), add=2)
y_fg = add_group(y_fg, (group1, group2))
print(f"Now, shapes of y_fg and ga is respectively: {y_ga.shape}, {y_fg.shape}")

Now, shapes of y_fg and ga is respectively: (89, 2), (569, 2)


In [37]:
X_fixed = np.concatenate(
    (
    data_fixed_fg[columns[1:]],
    data_fixed_ga[columns[1:]]
    )
    , axis=0
)
y_fixed = np.concatenate(
    (
    y_fg,
    y_ga
    )
    , axis=0
)

print(f"shapes of X and y is respectively: {X_fixed.shape}, {y_fixed.shape}")

shapes of X and y is respectively: (658, 114), (658, 2)


In [40]:
def random_forest_train(data, y, num_features, description='data_cut dataset', class_type='group'):
    if class_type not in ('group', 'class'):
        raise Exception("class_type may be equal only 'group' or 'class'")
    exrt_crystal_system = ExtraTreesClassifier(
        n_estimators=2000, 
        max_depth=40, 
        max_features=num_features, 
        n_jobs=-1, 
    #     random_state=random_state,
        warm_start=False
    )
    
    train_template(exrt_crystal_system, data, y, num_features, description, class_type)

In [41]:
random_forest_train(
    X_fixed, 
    y_fixed, 
    X_fixed.shape[1], 
    class_type='group', 
    description='114 peaks dataset'
)
random_forest_train(
    X_fixed, 
    y_fixed, 
    X_fixed.shape[1], 
    class_type='class', 
    description='114 peaks dataset'
)

min/mean/max accuracy of group prediction on 114 peaks dataset: (0.992, 0.998, 1.0)
min/mean/max accuracy of class prediction on 114 peaks dataset: (0.909, 0.923, 0.939)


In [42]:
# !pip install xgboost
import xgboost as xgb

def xgboost_train(data, y, num_features, description='data_cut dataset', class_type='group'):
    if class_type not in ('group', 'class'):
        raise Exception("class_type may be equal only 'group' or 'class'")
    
    xg_clsfr = xgb.XGBClassifier(
        objective='multi:softmax', 
        learning_rate = 0.01,
#         max_depth = 50, 
#         reg_alpha = 0, 
#         reg_lambda=0, 
        n_estimators = 1000, 
        eval_metric='merror'
#         num_class=7, 
#         tree_method = "hist", 
#         verbosity=0, 
#         grow_policy="lossguide"
    )
    train_template(xg_clsfr, data, y, num_features, description, class_type)

In [43]:
# in order to ignore all the warning messages
import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')

xgboost_train(X_fixed, 
              y_fixed, 
              X_fixed.shape[1], 
              class_type='group', 
              description='114 peaks dataset')
xgboost_train(X_fixed, 
              y_fixed, 
              X_fixed.shape[1], 
              class_type='class', 
              description='114 peaks dataset')

min/mean/max accuracy of group prediction on 114 peaks dataset: (0.97, 0.985, 0.992)
min/mean/max accuracy of class prediction on 114 peaks dataset: (0.847, 0.881, 0.902)


In [44]:
# !pip install catboost
import catboost

def catboost_train(data, y, num_features, description='data_cut dataset', class_type='group'):
    if class_type not in ('group', 'class'):
        raise Exception("class_type may be equal only 'group' or 'class'")
    
#     yes, I love cats
    kitty_model = catboost.CatBoostClassifier(
        iterations = 1000,
#         if you want to see logs, set 'veerbose' to 1 or 2
        verbose=0
    )
    
    train_template(kitty_model, data, y, num_features, description, class_type)

In [45]:
catboost_train(X_fixed, 
              y_fixed, 
              X_fixed.shape[1], 
              class_type='group', 
              description='114 peaks dataset')
catboost_train(X_fixed, 
              y_fixed, 
              X_fixed.shape[1], 
              class_type='class', 
              description='114 peaks dataset')

min/mean/max accuracy of group prediction on 114 peaks dataset: (0.985, 0.995, 1.0)
min/mean/max accuracy of class prediction on 114 peaks dataset: (0.908, 0.938, 0.97)
