# Exploring and Testing data_io.py

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../../src')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import anoog
from anoog.model import evaluate_model_with_cross_validation

## Table of contents

## [1. Load Data](#load_data)
## [2. Load Data with train and test split](#load_train_test_data)
## [3. Load Data (fast)](#targeted_feature_extraction)
## [4. Manuel Feature-Extraction](#manuel_feature_extraction)

### <a name='load_data'>Load Data</a>

This function loads the drill timeseries data of 2 persons (csv and yaml). <br>
Moreover it's extract the features, creates the target and selects the relevant features.<br>
<br>
It returns a DataFrame.

In [3]:
df = anoog.io.load_data("../../data/2021-11-09",
                        ['tippolit', 'vkorzev'],
                        extraction=anoog.io.extraction_mode.TSFRESH,
                        selection=anoog.io.selection_mode.NONE,
                        train_test_split=False,
                        test_size=0.3)
df.info()

Feature Extraction: 100%|██████████| 10/10 [01:22<00:00,  8.26s/it]
 'Voltage__query_similarity_count__query_None__threshold_0.0'
 'Current__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.


      Audio__variance_larger_than_standard_deviation  \
0.0                                              1.0   
1.0                                              0.0   
2.0                                              1.0   
3.0                                              0.0   
4.0                                              1.0   
5.0                                              1.0   
6.0                                              1.0   
7.0                                              1.0   
8.0                                              1.0   
9.0                                              1.0   
10.0                                             1.0   
11.0                                             0.0   
12.0                                             1.0   
13.0                                             1.0   
14.0                                             1.0   
15.0                                             0.0   
16.0                                            

In [4]:
df.head()

Unnamed: 0,Audio__variance_larger_than_standard_deviation,Audio__has_duplicate_max,Audio__has_duplicate_min,Audio__has_duplicate,Audio__sum_values,Audio__abs_energy,Audio__mean_abs_change,Audio__mean_change,Audio__mean_second_derivative_central,Audio__median,...,Current__permutation_entropy__dimension_6__tau_1,Current__permutation_entropy__dimension_7__tau_1,Current__query_similarity_count__query_None__threshold_0.0,"Current__matrix_profile__feature_""min""__threshold_0.98","Current__matrix_profile__feature_""max""__threshold_0.98","Current__matrix_profile__feature_""mean""__threshold_0.98","Current__matrix_profile__feature_""median""__threshold_0.98","Current__matrix_profile__feature_""25""__threshold_0.98","Current__matrix_profile__feature_""75""__threshold_0.98",y
0.0,1.0,0.0,0.0,1.0,-2.043153,831.583339,0.875932,0.001814,-0.003076,-0.000103,...,5.559817,6.161069,0.0,1.947202,8.288536,4.038009,4.018128,2.885553,4.856005,0
1.0,0.0,0.0,0.0,1.0,-2.31896,420.925765,0.635848,-0.003472,0.003089,0.0,...,5.524518,6.07904,0.0,1.48183,7.22222,3.595398,3.28605,2.62132,4.596944,0
2.0,1.0,0.0,0.0,0.0,-6.810043,475.134376,1.128933,0.021912,0.001619,-0.003905,...,5.328949,5.670727,0.0,1.099312,5.481754,3.061743,2.973147,2.021694,4.073807,0
3.0,0.0,0.0,0.0,1.0,-2.399988,235.058059,0.408791,-0.002023,0.001033,0.0,...,5.763553,6.350581,0.0,1.32217,6.704235,2.580577,2.218287,1.784324,3.255694,0
4.0,1.0,0.0,0.0,1.0,5.071669,754.353839,1.548132,0.007362,0.01124,-0.004828,...,5.435771,5.7662,0.0,1.230235,6.956805,3.097101,2.659717,1.769624,4.049682,0


In [5]:
df.to_csv("../../data/prepared_data_2021-11-09.csv", index=False)

In [10]:
df.columns

Index(['Time', 'Audio', 'Voltage', 'Current', 'ID', 'y'], dtype='object')

---

### <a name='load_train_test_data'>Train Test Data</a>

This function is for data loading for training and evaluating a AI-model.<br>
It splits in training-dataset and test-dataset.<br>
The function loads also the drill timeseries data of 2 persons (csv and yaml). <br>
And it extracts the features, creates the target and selects the relevant features.<br>
<br>
It returns 2 DataFrame's -> train-DataFrame, test-DataFrame.

In [8]:
train_data, test_data = df = anoog.io.load_data("../../data/2021-11-09",
                        ['tippolit', 'vkorzev'],
                        extraction=anoog.io.extraction_mode.TSFRESH,
                        selection=anoog.io.selection_mode.NONE,
                        train_test_split=True,
                        test_size=0.3,
                        save_as_csv=True,
                        csv_name='tobia_vadim_2021_11_09.csv')
train_data.info()

Feature Extraction: 100%|██████████| 10/10 [01:04<00:00,  6.42s/it]
 'Voltage__query_similarity_count__query_None__threshold_0.0'
 'Current__query_similarity_count__query_None__threshold_0.0'] did not have any finite values. Filling with zeros.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_datasets[i]['y'] = data[1]


Saving DataFrame at: ../../data/tobia_vadim_2021_11_09_0.csv
Saving DataFrame at: ../../data/tobia_vadim_2021_11_09_1.csv
<class 'pandas.core.frame.DataFrame'>
Float64Index: 35 entries, 7.0 to 19.0
Columns: 2362 entries, Audio__variance_larger_than_standard_deviation to y
dtypes: float64(2361), int32(1)
memory usage: 646.0 KB


In [8]:
#train_data.to_csv("../../data/prepared_train_data_2021-11-09.csv", index=False)
#test_data.to_csv("../../data/prepared_test_data_2021-11-09.csv", index=False)

In [9]:
train_data.head()

Unnamed: 0,Voltage__ratio_beyond_r_sigma__r_7,Voltage__ratio_beyond_r_sigma__r_6,"Current__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""var""","Voltage__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""var""",Current__large_standard_deviation__r_0.15000000000000002,Voltage__large_standard_deviation__r_0.15000000000000002,"Voltage__fft_coefficient__attr_""abs""__coeff_19",Voltage__ratio_beyond_r_sigma__r_5,"Current__fft_coefficient__attr_""abs""__coeff_19","Current__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.8",...,"Voltage__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""min""","Voltage__fft_coefficient__attr_""abs""__coeff_5",Audio__number_peaks__n_50,Voltage__number_peaks__n_3,Current__lempel_ziv_complexity__bins_100,"Voltage__fft_coefficient__attr_""abs""__coeff_6","Current__fft_coefficient__attr_""real""__coeff_1",Voltage__partial_autocorrelation__lag_6,Current__median,y
49.0,0.002179,0.003268,0.392379,0.005755,0.0,0.0,14.709127,0.004357,123.728706,0.366403,...,-0.193329,35.700726,7.0,81.0,0.188453,25.835213,-1165.816045,0.054007,-0.779992,1
9.0,0.0,0.0,0.000526,9e-06,1.0,1.0,7.733835,0.0,55.52489,0.021255,...,-0.937037,29.550993,22.0,233.0,0.26087,22.13769,-332.630284,-0.027335,1.374036,0
3.0,0.0,0.0,0.002862,4.9e-05,1.0,1.0,1.530134,0.0,5.710973,0.04281,...,-0.361404,34.453788,14.0,164.0,0.268424,10.09959,-1167.045234,-0.060028,0.342336,0
21.0,0.0,0.0,0.00198,3.5e-05,1.0,1.0,8.331854,0.0,59.40438,0.05067,...,-0.831574,21.553152,6.0,186.0,0.262474,12.600361,-518.027123,-0.118954,1.603331,0
13.0,0.0,0.0,0.003533,6e-05,1.0,1.0,0.998379,0.0,14.135692,0.055072,...,-0.768076,28.101982,16.0,208.0,0.257984,27.073292,-739.567491,-0.124563,1.434353,0


In [None]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 16 entries, 49.0 to 13.0
Data columns (total 92 columns):
 #   Column                                                                Non-Null Count  Dtype  
---  ------                                                                --------------  -----  
 0   Voltage__sum_values                                                   16 non-null     float64
 1   Current__linear_trend__attr_"stderr"                                  16 non-null     float64
 2   Current__fft_coefficient__attr_"angle"__coeff_91                      16 non-null     float64
 3   Voltage__fft_coefficient__attr_"imag"__coeff_2                        16 non-null     float64
 4   Audio__fft_aggregated__aggtype_"variance"                             16 non-null     float64
 5   Audio__ratio_beyond_r_sigma__r_1.5                                    16 non-null     float64
 6   Audio__lempel_ziv_complexity__bins_2                                  16 non-null     float64

In [12]:
test_data.head()

Unnamed: 0,y
1.0,0
22.0,0
30.0,1
0.0,0
32.0,1


---

### <a name='targeted_feature_extraction'>Targeted Feature Extraction</a>

To make the feature extraction a little bit faster (10 min are to long), we save all relevant features and only let them calculate. And then there is no need for feature selection anymore.<br>
<br>
For that you have to set the parameter **calculate_features** to False.

In [10]:
features = anoog.io.load_features()
features

{'Current': {'large_standard_deviation': [{'r': 0.2},
   {'r': 0.25},
   {'r': 0.15000000000000002},
   {'r': 0.1},
   {'r': 0.30000000000000004}],
  'augmented_dickey_fuller': [{'attr': 'usedlag', 'autolag': 'AIC'},
   {'attr': 'teststat', 'autolag': 'AIC'},
   {'attr': 'pvalue', 'autolag': 'AIC'}],
  'ratio_beyond_r_sigma': [{'r': 3},
   {'r': 2.5},
   {'r': 5},
   {'r': 7},
   {'r': 6},
   {'r': 2}],
  'time_reversal_asymmetry_statistic': [{'lag': 3}, {'lag': 2}, {'lag': 1}],
  'change_quantiles': [{'f_agg': 'var', 'isabs': False, 'qh': 1.0, 'ql': 0.2},
   {'f_agg': 'var', 'isabs': True, 'qh': 1.0, 'ql': 0.2},
   {'f_agg': 'var', 'isabs': False, 'qh': 1.0, 'ql': 0.0},
   {'f_agg': 'var', 'isabs': True, 'qh': 1.0, 'ql': 0.0},
   {'f_agg': 'var', 'isabs': True, 'qh': 1.0, 'ql': 0.4},
   {'f_agg': 'var', 'isabs': False, 'qh': 1.0, 'ql': 0.4},
   {'f_agg': 'var', 'isabs': False, 'qh': 1.0, 'ql': 0.6},
   {'f_agg': 'var', 'isabs': True, 'qh': 1.0, 'ql': 0.6},
   {'f_agg': 'mean', 'isabs'

In [12]:
features.keys()

dict_keys(['Current', 'Voltage', 'Audio'])

## Get real

In [4]:
data = anoog.io.load_data("../../data/2021-11-09",
                        ['tippolit', 'vkorzev'],
                        extraction=anoog.io.extraction_mode.TSFRESH_WITH_PARAMS,
                        selection=anoog.io.selection_mode.NONE,
                        train_test_split=False,
                        test_size=0.3)

Feature Extraction: 100%|██████████| 10/10 [00:23<00:00,  2.38s/it]


      Audio__length  Audio__fft_aggregated__aggtype_"variance"  \
0.0           779.0                                9141.065258   
1.0           638.0                                6710.341832   
2.0           420.0                                2829.417154   
3.0           788.0                               10492.060479   
4.0           429.0                                2625.958988   
5.0           646.0                                5680.993360   
6.0           520.0                                3716.606976   
7.0           494.0                                3906.352136   
8.0           398.0                                2515.287607   
9.0           532.0                                4437.306538   
10.0          546.0                                4491.773866   
11.0          474.0                                3865.687979   
12.0          461.0                                2779.181496   
13.0          474.0                                3325.397104   
14.0      

In [5]:
data.head()

Unnamed: 0,Audio__length,"Audio__fft_aggregated__aggtype_""variance""","Audio__fft_aggregated__aggtype_""kurtosis""",Audio__number_peaks__n_5,Audio__number_peaks__n_3,Audio__number_peaks__n_10,Audio__number_peaks__n_1,Audio__number_peaks__n_50,Audio__lempel_ziv_complexity__bins_2,Audio__lempel_ziv_complexity__bins_100,...,Current__benford_correlation,Current__symmetry_looking__r_0.1,Current__symmetry_looking__r_0.15000000000000002,Current__symmetry_looking__r_0.2,Current__count_below__t_0,Current__count_above__t_0,Current__friedrich_coefficients__coeff_2__m_3__r_30,Current__median,Current__variance_larger_than_standard_deviation,y
0.0,779.0,9141.065258,109.50147,69.0,98.0,33.0,263.0,4.0,0.050064,0.426187,...,0.812608,1.0,1.0,1.0,0.249037,0.750963,-0.091714,1.883141,1.0,0
1.0,638.0,6710.341832,82.029469,53.0,85.0,26.0,200.0,4.0,0.056426,0.431034,...,0.752029,1.0,1.0,1.0,0.213166,0.786834,-0.146053,1.998236,1.0,0
2.0,420.0,2829.417154,90.648064,34.0,48.0,14.0,145.0,4.0,0.069048,0.504762,...,-0.048162,1.0,1.0,1.0,0.247619,0.752381,0.044229,3.512523,1.0,0
3.0,788.0,10492.060479,72.225268,66.0,110.0,35.0,256.0,7.0,0.050761,0.369289,...,0.833078,1.0,1.0,1.0,0.296954,0.703046,-0.001835,1.805488,1.0,0
4.0,429.0,2625.958988,122.255716,33.0,49.0,16.0,149.0,1.0,0.067599,0.547786,...,-0.063146,1.0,1.0,1.0,0.389277,0.610723,0.242626,2.759688,1.0,0


## With train and test

In [14]:
train_data, test_data = df = anoog.io.load_data("../../data/2021-11-09",
                        ['tippolit', 'vkorzev'],
                        extraction=anoog.io.extraction_mode.TSFRESH,
                        selection=anoog.io.selection_mode.NONE,
                        train_test_split=True,
                        test_size=0.3)
train_data.info()

Feature Extraction: 100%|██████████| 39/39 [00:21<00:00,  1.84it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dataset['y'] = y_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_dataset['y'] = y_test


<class 'pandas.core.frame.DataFrame'>
Float64Index: 35 entries, 22.0 to 35.0
Columns: 489 entries, Audio__length to y
dtypes: float64(488), int32(1)
memory usage: 133.8 KB


In [16]:
#train_data.to_csv("../../data/prepared_train_data_2021-11-02.csv", index=False)
#test_data.to_csv("../../data/prepared_test_data_2021-11-02.csv", index=False)

In [17]:
train_data.head()

Unnamed: 0,Audio__length,Audio__number_peaks__n_5,Audio__number_peaks__n_3,Audio__number_peaks__n_1,Audio__number_peaks__n_10,Audio__number_peaks__n_50,Audio__lempel_ziv_complexity__bins_2,"Audio__fft_aggregated__aggtype_""variance""",Audio__count_below_mean,Audio__range_count__max_1000000000000.0__min_0,...,Current__max_langevin_fixed_point__m_3__r_30,Current__kurtosis,Current__benford_correlation,Current__mean,Current__friedrich_coefficients__coeff_2__m_3__r_30,Current__count_above__t_0,Current__count_below__t_0,Current__symmetry_looking__r_0.1,Current__median,y
22.0,1407.0,137.0,209.0,450.0,68.0,11.0,0.037669,45936.500935,976.0,888.0,...,2.614557,-1.215218,0.102014,0.557651,-0.001624,0.647477,0.352523,1.0,0.769087,0
48.0,621.0,62.0,78.0,201.0,24.0,4.0,0.056361,6098.919865,291.0,325.0,...,13.93174,11.453942,-0.443412,1.419931,0.114478,0.376812,0.623188,1.0,-0.7783,1
10.0,1928.0,248.0,292.0,605.0,73.0,23.0,0.032158,57680.500423,880.0,1469.0,...,0.994454,-1.784139,0.624933,0.272095,0.034734,0.598029,0.401971,0.0,0.99531,0
40.0,1116.0,110.0,137.0,397.0,35.0,9.0,0.042115,16168.873872,572.0,545.0,...,13.889455,10.910216,-0.447606,0.933195,0.166259,0.294803,0.705197,1.0,-0.781078,1
23.0,1445.0,166.0,214.0,499.0,56.0,18.0,0.036678,42080.842934,583.0,1139.0,...,2.33976,-1.43707,0.744621,0.631277,0.002608,0.673356,0.326644,0.0,1.314437,0


In [15]:
test_data.head()

Unnamed: 0,Audio__length,Audio__number_peaks__n_5,Audio__number_peaks__n_3,Audio__number_peaks__n_1,Audio__number_peaks__n_10,Audio__number_peaks__n_50,Audio__lempel_ziv_complexity__bins_2,"Audio__fft_aggregated__aggtype_""variance""",Audio__count_below_mean,Audio__range_count__max_1000000000000.0__min_0,...,Current__max_langevin_fixed_point__m_3__r_30,Current__kurtosis,Current__benford_correlation,Current__mean,Current__friedrich_coefficients__coeff_2__m_3__r_30,Current__count_above__t_0,Current__count_below__t_0,Current__symmetry_looking__r_0.1,Current__median,y
41.0,1076.0,104.0,140.0,378.0,48.0,9.0,0.042751,14331.232005,554.0,523.0,...,16.572278,27.260088,-0.422763,0.890222,0.147977,0.262082,0.737918,1.0,-0.783018,1
44.0,1047.0,93.0,154.0,366.0,46.0,9.0,0.04298,16539.761998,552.0,517.0,...,10.972361,20.671705,-0.265185,0.901304,0.025146,0.356256,0.643744,1.0,-0.779838,1
11.0,1756.0,240.0,269.0,516.0,85.0,23.0,0.033599,64529.475278,554.0,1460.0,...,0.432132,-1.030117,-0.173741,0.059104,-0.036617,0.699317,0.300683,0.0,0.415271,0
39.0,903.0,80.0,131.0,302.0,35.0,7.0,0.046512,15490.161696,482.0,434.0,...,17.621572,17.544434,-0.460263,0.983842,0.060618,0.312292,0.687708,1.0,-0.781803,1
8.0,1621.0,145.0,232.0,523.0,65.0,14.0,0.035163,46192.766388,792.0,799.0,...,3.318389,-1.199874,0.213322,0.105948,0.001261,0.389266,0.610734,0.0,-0.778935,0


---

### <a name='targeted_feature_extraction'>Targeted Feature Extraction</a>

Try to make self a Feature Extraction without Tsfresh.<br> 
Features: min, max, mean, median, Widerstand, Energie

In [8]:
(data, meta_data) = anoog.io.load_tsfresh('../../data/2021-11-02', ['tippolit', 'vkorzev'])

In [9]:
resamplingInterval = '10ms'

data = data.resample(resamplingInterval, label='right', closed='right', on='Time').mean()
data.dropna(inplace=True)
data.reset_index(inplace=True)

In [27]:
labelEnc = LabelEncoder()
data['y'] = pd.Series(labelEnc.fit_transform(meta_data['Operator'].values), index=meta_data['ID'])

In [28]:
data.head()

Unnamed: 0,Time,Audio,Voltage,Current,ID,y
0,2021-11-02 12:22:26.000,-0.053711,19.415771,-0.792969,0.0,0.0
1,2021-11-02 12:22:26.010,-0.002565,19.42306,-0.784249,0.0,0.0
2,2021-11-02 12:22:26.020,-0.008261,19.422976,-0.784455,0.0,0.0
3,2021-11-02 12:22:26.030,-0.004172,19.422892,-0.788045,0.0,0.0
4,2021-11-02 12:22:26.040,0.000553,19.422866,-0.788769,0.0,0.0


In [31]:
data.columns

Index(['Time', 'Audio', 'Voltage', 'Current', 'ID', 'y'], dtype='object')

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73787 entries, 0 to 73786
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   Time     73787 non-null  datetime64[ns]
 1   Audio    73787 non-null  float64       
 2   Voltage  73787 non-null  float64       
 3   Current  73787 non-null  float64       
 4   ID       73787 non-null  float64       
 5   y        51 non-null     float64       
dtypes: datetime64[ns](1), float64(5)
memory usage: 3.4 MB


In [30]:
data['y'].value_counts()

0.0    26
1.0    25
Name: y, dtype: int64

In [43]:
data['ID'].unique()

array([ 0., 26.,  1., 27., 28., 29., 30., 31., 32., 33., 34.,  2., 35.,
        3., 36., 37., 38.,  4.,  5., 39., 40.,  6.,  7.,  8.,  9., 10.,
       11., 12., 13., 14., 41., 15., 16., 42., 17., 18., 43., 44., 45.,
       46., 47., 48., 19., 20., 49., 21., 22., 50., 23., 24., 25.])

In [66]:
def extract_feature(data, from_feature:str, function):
    feature_collection = []
    for id in data['ID'].unique():
        #feature_collection += [data[data['ID'] == id][from_feature].function()]
        feature_collection += [function(data[data['ID']==id][from_feature])]
    return feature_collection


In [70]:
X_extracted = pd.DataFrame()

for feature in ['Audio', 'Current', 'Voltage']:
    for new_feature in [('min',np.min), ('max',np.max), ('mean',np.mean), ('median',np.median), ('std',np.std)]:
        X_extracted[feature.lower()+"_"+new_feature[0]] = extract_feature(data, feature, new_feature[1])

In [71]:
X_extracted.head()

Unnamed: 0,audio_min,audio_max,audio_mean,audio_median,audio_std,current_min,current_max,current_mean,current_median,current_std,voltage_min,voltage_max,voltage_mean,voltage_median,voltage_std
0,-0.16447,0.32812,-0.006018,-0.007007,0.009466,-0.85226,-0.769306,-0.844662,-0.851234,0.018126,19.415771,19.425308,19.424427,19.424551,0.00045
1,-0.831451,0.887379,0.000148,-0.000707,0.117465,-0.84631,36.547314,0.687671,-0.784968,3.579243,13.812548,19.358973,19.076774,19.23451,0.524684
2,-0.560484,1.235044,0.796693,1.181032,0.568881,-0.866211,1.156463,0.424111,0.866264,0.73038,18.452993,19.296478,19.075376,18.991131,0.133905
3,-4.804688,4.127155,0.001984,0.004931,1.958439,-0.846721,32.518481,0.221289,-0.786618,2.447131,15.15524,19.187589,19.004036,19.117604,0.345913
4,-3.784296,3.831298,-0.041334,-0.074833,1.39626,-0.842958,5.293099,0.664881,1.453228,1.123616,18.357014,19.176748,18.939628,18.807038,0.179426


In [73]:
X_extracted.shape

(51, 15)

### Manuel Feature Selection Testing

In [3]:
data = df = anoog.io.load_data("../../data/2021-11-09",
                        ['skadkade', 'vkorzev'],
                        extraction=anoog.io.extraction_mode.MANUEL,
                        selection=anoog.io.selection_mode.NONE,
                        train_test_split=False,
                        test_size=0.3)
data.tail()

Unnamed: 0,audio_min,audio_max,audio_mean,audio_median,audio_std,current_min,current_max,current_mean,current_median,current_std,...,Time,Audio,Voltage,Current,Resistance,Power,Work,mAh,Wh,y
45,-3.856191,4.443359,0.004294,1.4e-05,0.754451,-0.848285,30.814523,1.881874,2.019224,2.47266,...,7.02,0.004294,19.339452,1.881874,0.097307,36.394408,255.488741,31.364562,606.573459,1
46,-3.926896,3.226657,0.003948,0.010897,0.65023,-0.845797,30.290646,2.727177,2.379526,3.425021,...,4.35,0.003948,19.184296,2.727177,0.142157,52.318971,227.587524,45.452951,871.98285,1
47,-3.446411,2.407992,-0.006127,0.002814,0.451224,-0.851562,3.327032,1.715205,1.930784,1.185007,...,5.91,-0.006127,19.333584,1.715205,0.088716,33.161066,195.981899,28.586756,552.68443,1
48,-2.521361,1.898758,0.00272,-0.000137,0.470434,-0.866211,29.85027,2.23903,1.965718,3.040032,...,5.2,0.00272,19.230667,2.23903,0.11643,43.058033,223.901774,37.317161,717.633891,1
49,-2.91717,3.008804,-0.001375,-0.003025,0.648057,-0.939453,33.830816,2.656957,1.379917,3.775621,...,4.6,-0.001375,19.152872,2.656957,0.138724,50.88836,234.086455,44.282618,848.13933,1


In [4]:
#X_train, y_train = train_data.iloc[:, :-1], train_data.loc[:, 'y']
#X_test, y_test = test_data.iloc[:, :-1], test_data.loc[:, 'y']
X,y = anoog.io.X_y_split(data)

In [5]:
evaluate_model_with_cross_validation(X, y)

array([0.8, 1. , 1. , 1. , 1. , 1. , 1. , 0.8, 0.6, 1. ])

In [5]:
print("\nmodel going to be trained...")
model = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')
# Fit on training data
model = model.fit(X_train, y_train)


model going to be trained...


In [6]:
#accuracy
y_pred = model.predict(X_test)
print(f"#### RESULTS ####\n\nAccuracy: ", accuracy_score(y_test, y_pred)*100)

#### RESULTS ####

Accuracy:  100.0


In [7]:
#confusion_matrix
print("--------\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred), "\n")

--------
Confusion Matrix:
[[8 0]
 [0 7]] 

