In [1]:
import pandas as pd
import glob
import numpy as np
%matplotlib inline

import os
cdir = os.getcwd()
print(cdir)
os.chdir(cdir + '/train/train')
print(os.getcwd())

C:\Projects\activity-recognition-abc
C:\Projects\activity-recognition-abc\train\train


In [2]:
# Multi-head CNN–RNN for multi-time series anomaly detection: An industrial case study

In [3]:
def diff(x, y):
    return np.abs(x-y) > 10

def segment(ary, tm):
    a = np.split(ary, np.argwhere(diff(tm[1:], tm[:-1])).T[0] + 1)
    return a

In [4]:
segment(np.array([1,0,1,1,2,21,20,19,5,12,51,40]), np.array([1,7,16,16,22,21,2,4,5,2,2,2]))

[array([ 1,  0,  1,  1,  2, 21]), array([20, 19,  5, 12, 51, 40])]

In [5]:
from tqdm import tqdm_notebook

In [6]:
import pandas as pd
from numpy.linalg import LinAlgError
from scipy import io, signal
#1
def AE(x): # Absolute Energy
    x = np.asarray(x)
    return sum(x * x)

#2
def SM2(y):
    #t1 = time.time()
    f, Pxx_den = signal.welch(y)
    sm2 = 0
    n = len(f)
    for i in range(0,n):
        sm2 += Pxx_den[i]*(f[i]**2)
    
    #t2 = time.time()
    #print('time: ', t2-t2)
    return sm2


#3
def LOG(y):
    n = len(y)
    return np.exp(np.sum(np.log(np.abs(y)))/n)

#4
def WL(x): # WL in primary manuscript
    return np.sum(abs(np.diff(x)))



#6
def AC(x, lag=5): # autocorrelation

    """
     [1] https://en.wikipedia.org/wiki/Autocorrelation#Estimation

    """
    # This is important: If a series is passed, the product below is calculated
    # based on the index, which corresponds to squaring the series.
    if type(x) is pd.Series:
        x = x.values
    if len(x) < lag:
        return np.nan
    # Slice the relevant subseries based on the lag
    y1 = x[:(len(x)-lag)]
    y2 = x[lag:]
    # Subtract the mean of the whole series x
    x_mean = np.mean(x)
    # The result is sometimes referred to as "covariation"
    sum_product = np.sum((y1-x_mean)*(y2-x_mean))
    # Return the normalized unbiased covariance
    return sum_product / ((len(x) - lag) * np.var(x))

#7
def BE(x, max_bins=30): # binned entropy
    hist, bin_edges = np.histogram(x, bins=max_bins)
    probs = hist / len(x)
    return - np.sum(p * np.math.log(p) for p in probs if p != 0)

#8
def C3(x, lag = 5): # c3 feature
    n = len(x)
    x = np.asarray(x)
    if 2 * lag >= n:
        return 0
    else:
        return np.mean((np.roll(x, 2 * -lag) * np.roll(x, -lag) * x)[0:(n - 2 * lag)])
    
    





#12
def AAC(x): #AAC in primary manuscript
    return np.mean(abs(np.diff(x)))

#13
def MSDC(x): # mean second derivative central
    diff = (np.roll(x, 1) - 2 * np.array(x) + np.roll(x, -1)) / 2.0
    return np.mean(diff[1:-1])

#14
def ZC(x, m = 0): # zero/mean crossing
    # m = np.mean(x)
    x = np.asarray(x)
    x = x[x != m]
    return sum(np.abs(np.diff(np.sign(x - m))))/2


#15
def SE(x): # sample entropy
    """
    [1] http://en.wikipedia.org/wiki/Sample_Entropy
    [2] https://www.ncbi.nlm.nih.gov/pubmed/10843903?dopt=Abstract
    """
    x = np.array(x)

    sample_length = 1 # number of sequential points of the time series
    tolerance = 0.2 * np.std(x) # 0.2 is a common value for r - why?

    n = len(x)
    prev = np.zeros(n)
    curr = np.zeros(n)
    A = np.zeros((1, 1))  # number of matches for m = [1,...,template_length - 1]
    B = np.zeros((1, 1))  # number of matches for m = [1,...,template_length]

    for i in range(n - 1):
        nj = n - i - 1
        ts1 = x[i]
        for jj in range(nj):
            j = jj + i + 1
            if abs(x[j] - ts1) < tolerance:  # distance between two vectors
                curr[jj] = prev[jj] + 1
                temp_ts_length = min(sample_length, curr[jj])
                for m in range(int(temp_ts_length)):
                    A[m] += 1
                    if j < n - 1:
                        B[m] += 1
            else:
                curr[jj] = 0
        for j in range(nj):
            prev[j] = curr[j]

    N = n * (n - 1) / 2
    B = np.vstack(([N], B[0]))

    # sample entropy = -1 * (log (A/B))
    similarity_ratio = A / B
    se = -1 * np.log(similarity_ratio)
    se = np.reshape(se, -1)
    return se[0]

#16
def TRAS(x, lag=5):
    # time reversal asymmetry statistic
    """
    |  [1] Fulcher, B.D., Jones, N.S. (2014).
    |  Highly comparative feature-based time-series classification.
    |  Knowledge and Data Engineering, IEEE Transactions on 26, 3026–3037.
    """
    n = len(x)
    x = np.asarray(x)
    if 2 * lag >= n:
        return 0
    else:
        return np.mean((np.roll(x, 2 * -lag) * np.roll(x, 2 * -lag) * np.roll(x, -lag) -
                        np.roll(x, -lag) * x * x)[0:(n - 2 * lag)])
    
    
#17    
def VAR(x): # variance 
    return np.var(x)

In [7]:
# time - profiling
sd = [1,2,3,1,3,1]*1000
import time
t1 = time.time()
AE(sd)
t2 = time.time()
print(f'AE: {t2-t1}')
t1 = time.time()
SM2(sd)
t2 = time.time()
print(f'SM2: {t2-t1}')
t1 = time.time()
LOG(sd)
t2 = time.time()
print(f'LOG: {t2-t1}')
t1 = time.time()
WL(sd)
t2 = time.time()
print(f'WL: {t2-t1}')
t1 = time.time()
AC(sd)
t2 = time.time()
print(f'AC: {t2-t1}')
t1 = time.time()
BE(sd)
t2 = time.time()
print(f'BE: {t2-t1}')
t1 = time.time()
C3(sd)
t2 = time.time()
print(f'C3: {t2-t1}')
t1 = time.time()
AAC(sd)
t2 = time.time()
print(f'AAC: {t2-t1}')
t1 = time.time()
MSDC(sd)
t2 = time.time()
print(f'MSDC: {t2-t1}')
t1 = time.time()
ZC(sd)
t2 = time.time()
print(f'ZC: {t2-t1}')
#t1 = time.time()
#SE(sd)
#t2 = time.time()
#print(f'SE: {t2-t1}')
t1 = time.time()
TRAS(sd)
t2 = time.time()
print(f'TRAS: {t2-t1}')
t1 = time.time()
VAR(sd)
t2 = time.time()
print(f'VAR: {t2-t1}')

AE: 0.0009922981262207031
SM2: 0.03689932823181152
LOG: 0.0009982585906982422
WL: 0.0
AC: 0.0019941329956054688




BE: 0.0029909610748291016
C3: 0.0009984970092773438
AAC: 0.0
MSDC: 0.001995563507080078
ZC: 0.0009989738464355469
TRAS: 0.0009930133819580078
VAR: 0.0009958744049072266


In [94]:
def process(sensordata, timestamps):
    # process
    # feature calculation
    sd = sensordata
    if len(sd) == 0:
        features = np.zeros((12))
    else:
        sds = segment(sensordata, timestamps)
        features = np.zeros((12))
        #features = np.array([AE(sd), SM2(sd), LOG(sd), WL(sd), AC(sd), BE(sd), C3(sd), AAC(sd), MSDC(sd), ZC(sd), TRAS(sd), VAR(sd)])
                
        wgt = 0.
        for sd in sds:
            if len(sd) > 5:
                features += len(sd)*np.array([AE(sd), SM2(sd), LOG(sd), WL(sd), AC(sd), BE(sd), C3(sd), AAC(sd), MSDC(sd), ZC(sd), TRAS(sd), VAR(sd)])
                wgt += len(sd)
        if wgt == 0.:
            wgt = 1
        features = np.nan_to_num(features)/wgt
        
        
    return np.nan_to_num(features)

In [14]:
import tensorflow as tf
from tfdeterminism import patch
patch()
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
for device in gpu_devices:
    tf.config.experimental.set_memory_growth(device, True)
import random
import numpy as np
os.environ['PYTHONHASHSEED']=str(1997)

random.seed(1997)
np.random.seed(1997)
tf.random.set_seed(1997)

TensorFlow version 2.0.0 has been patched using tfdeterminism version 0.3.0


In [15]:
tf.test.is_gpu_available()

True

In [95]:
# there are 3 subjects
subject1 = {}
subject2 = {}
subject3 = {}

# we will load every single right arm data, separate based on subject id, do feature extraction, run t-SNE

all_sensors = ['right_arm', 'right_wrist', 'left_hip', 'left_wrist'] #, 'mocap']

data_folder = [f'{sensor}/*.csv' for sensor in all_sensors]

print(data_folder)

files = glob.glob(data_folder[0])

# processing will be done as numpy array

subject1['data'] = []
subject2['data'] = []
subject3['data'] = []
 
# data format : data -> id, process(timeseries, timestamps) : right_arm as numpy array (missing data will be imputed)


# id is generated by concatenating INT(subject+trial)

for f in tqdm_notebook(files):
    #print(f)
    c_sub = {} # current subject
    if f.split(os.sep)[1].split('_')[0] == 'subject1':
        
        c_sub['id'] = int(f.split(os.sep)[1].split('_')[0][-1] + f.split(os.sep)[1].split('_')[2].split('.')[0])
        #print(ra['id'])
        
        for sensor in all_sensors:
            #print(sensor)
            ra = pd.read_csv(sensor + os.sep + f.split(os.sep)[1])
            #print(ra)
            ra.sort_values(by=['timestamp'], inplace = True)
            c_sub[sensor] = {}
            if sensor == 'mocap':
                for axis in mocap_axis:
                    c_sub[sensor][axis] = process(ra[axis], ra['timestamp'])
            else:
                for axis in ['X', 'Y', 'Z']:
                    c_sub[sensor][axis] = process(ra[axis], ra['timestamp'])
                
        subject1['data'].append(c_sub)

            
    elif f.split(os.sep)[1].split('_')[0] == 'subject2':
        
        c_sub['id'] = int(f.split(os.sep)[1].split('_')[0][-1] + f.split(os.sep)[1].split('_')[2].split('.')[0])
        #print(ra['id'])
        
        for sensor in all_sensors:
            ra = pd.read_csv(sensor + os.sep + f.split(os.sep)[1])
            #print(ra)
            ra.sort_values(by=['timestamp'], inplace = True)
            c_sub[sensor] = {}
            if sensor == 'mocap':
                for axis in mocap_axis:
                    c_sub[sensor][axis] = process(ra[axis], ra['timestamp'])
            else:
                for axis in ['X', 'Y', 'Z']:
                    c_sub[sensor][axis] = process(ra[axis], ra['timestamp'])
                
        subject2['data'].append(c_sub)
            
    elif f.split(os.sep)[1].split('_')[0] == 'subject3':
        
        c_sub['id'] = int(f.split(os.sep)[1].split('_')[0][-1] + f.split(os.sep)[1].split('_')[2].split('.')[0])
        #print(ra['id'])
        
        for sensor in all_sensors:
            ra = pd.read_csv(sensor + os.sep + f.split(os.sep)[1])
            #print(ra)
            ra.sort_values(by=['timestamp'], inplace = True)
            c_sub[sensor] = {}
            if sensor == 'mocap':
                for axis in mocap_axis:
                    try:
                        c_sub[sensor][axis] = process(ra[axis], ra['timestamp'])
                    except:
                        c_sub[sensor][axis] = [0]
            else:
                for axis in ['X', 'Y', 'Z']:
                    c_sub[sensor][axis] = process(ra[axis], ra['timestamp'])
        
        subject3['data'].append(c_sub)
    else:
        print('either new subject or a bug')
        
        
labels = pd.read_csv("labels.txt", sep=' ', header=None)
print(labels.head())
labels = labels[0].str.split(",", n=2, expand=True)
labels.columns = ['file_id', 'macro', 'micro'] #give names to the columns
labels.index = labels['file_id'] #use the file id as index to make it searchable by file_id
print(labels.head())

# label generation
subject1['label_mac'] = {}
subject2['label_mac'] = {}
subject3['label_mac'] = {}

subject1['label_mic'] = {}
subject2['label_mic'] = {}
subject3['label_mic'] = {}

for i in range(len(labels)):
    #print(labels.iloc[i]['file_id'])
    #print(labels.iloc[i]['file_id'].split('_')[0][-1])
    tid = int(labels.iloc[i]['file_id'].split('_')[0][-1] + labels.iloc[i]['file_id'].split('_')[-1])
    #print(tid)
    label = labels.iloc[i]['macro']
    label_mic = labels.iloc[i]['micro'].split(',')[:-1]
    #print(label)
    if labels.iloc[i]['file_id'].split('_')[0][-1] == '1':
        subject1['label_mac'][tid] = label
        subject1['label_mic'][tid] = label_mic
    elif labels.iloc[i]['file_id'].split('_')[0][-1] == '2':
        subject2['label_mac'][tid] = label
        subject2['label_mic'][tid] = label_mic
    elif labels.iloc[i]['file_id'].split('_')[0][-1] == '3':
        subject3['label_mac'][tid] = label
        subject3['label_mic'][tid] = label_mic
    else:
        print('some bug')
        
        
# re-formatting dataset for training

X = []
y = []
y_ml = []  # multi-label

for i in range(len(subject1['data'])):
    tid = subject1['data'][i]['id']
    y.append(subject1['label_mac'][tid])
    y_ml.append(subject1['label_mic'][tid])
    # X shape -> [ip1, ip2, ip3, ip4] ip1 = (80, len, channel) -> 4, 80, len

for i in range(len(subject1['data'])):
    cs_data = []
    for sensor in all_sensors:
        sub_data = []
        if sensor == 'mocap':
            for sig in mocap_axis:
                sub_data.append(np.array(subject1['data'][i][sensor][sig]))
        else:
            for sig in ['X', 'Y', 'Z']:
                sub_data.append(np.array(subject1['data'][i][sensor][sig]))
        sub_data = np.array(sub_data)
        # print(sub_data.shape)
        sub_data = np.swapaxes(sub_data, 0, 1)
        cs_data.append(sub_data)
    X.append(cs_data)
    
    
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import numpy as np

label_encoder = LabelEncoder()
vec = label_encoder.fit_transform(y)

y_ohe = vec # to_categorical(vec,len(set(vec)))

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_ml_ohe = mlb.fit_transform(y_ml)

# validatin on subject2, 3

# re-formatting dataset for training

X2 = []
y2 = []
y_ml2 = []

for i in range(len(subject2['data'])):
    tid = subject2['data'][i]['id']
    y2.append(subject2['label_mac'][tid])
    y_ml2.append(subject2['label_mic'][tid])
    # X shape -> [ip1, ip2, ip3, ip4] ip1 = (80, len, channel) -> 4, 80, len

    
for i in range(len(subject2['data'])):
    cs_data = []
    for sensor in all_sensors:
        sub_data = []
        if sensor == 'mocap':
            for sig in mocap_axis:
                sub_data.append(np.array(subject2['data'][i][sensor][sig]))
        else:
            for sig in ['X', 'Y', 'Z']:
                sub_data.append(np.array(subject2['data'][i][sensor][sig]))
        sub_data = np.array(sub_data)
        # print(sub_data.shape)
        sub_data = np.swapaxes(sub_data, 0, 1)
        cs_data.append(sub_data)
    X2.append(cs_data)
    
label_encoder = LabelEncoder()
vec = label_encoder.fit_transform(y2)

y2_ohe = vec # to_categorical(vec,len(set(vec)))

mlb = MultiLabelBinarizer()
y_ml2_ohe = mlb.fit_transform(y_ml2)

# validatin on subject2, 3

# re-formatting dataset for training

X3 = []
y3 = []
y_ml3 = []

for i in range(len(subject3['data'])):
    tid = subject3['data'][i]['id']
    y3.append(subject3['label_mac'][tid])
    y_ml3.append(subject3['label_mic'][tid])
    # X shape -> [ip1, ip2, ip3, ip4] ip1 = (80, len, channel) -> 4, 80, len

for i in range(len(subject3['data'])):
    cs_data = []
    for sensor in all_sensors:
        sub_data = []
        if sensor == 'mocap':
            for sig in mocap_axis:
                sub_data.append(np.array(subject3['data'][i][sensor][sig]))
        else:
            for sig in ['X', 'Y', 'Z']:
                sub_data.append(np.array(subject3['data'][i][sensor][sig]))
        sub_data = np.array(sub_data)
        # print(sub_data.shape)
        sub_data = np.swapaxes(sub_data, 0, 1)
        cs_data.append(sub_data)
    X3.append(cs_data)
    

label_encoder = LabelEncoder()
vec = label_encoder.fit_transform(y3)

y3_ohe = vec # to_categorical(vec,len(set(vec)))

mlb = MultiLabelBinarizer()
y_ml3_ohe = mlb.fit_transform(y_ml3)

['right_arm/*.csv', 'right_wrist/*.csv', 'left_hip/*.csv', 'left_wrist/*.csv']


HBox(children=(IntProgress(value=0, max=288), HTML(value='')))

  return getattr(obj, method)(*args, **kwds)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg,

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nper

  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))
  .format(nperseg, input_length))



                                           0
0           subject2_file_457,sandwich,Take,
1      subject2_file_679,sandwich,Wash,Take,
2        subject2_file_95,sandwich,Cut,Wash,
3  subject2_file_899,sandwich,other,Cut,Put,
4            subject2_file_368,sandwich,Put,
                             file_id     macro           micro
file_id                                                       
subject2_file_457  subject2_file_457  sandwich           Take,
subject2_file_679  subject2_file_679  sandwich      Wash,Take,
subject2_file_95    subject2_file_95  sandwich       Cut,Wash,
subject2_file_899  subject2_file_899  sandwich  other,Cut,Put,
subject2_file_368  subject2_file_368  sandwich            Put,


In [43]:
X[:5]

[[array([[ 2.05306808e+02,  9.13120531e+01,  1.39215012e+02],
         [ 4.01258786e-01,  1.68697914e-01,  2.08663717e-01],
         [ 0.00000000e+00,  0.00000000e+00,  1.17442550e-01],
         [ 2.17954994e+02,  1.41165496e+02,  1.57070997e+02],
         [ 2.04243623e-02,  5.73460215e-02,  5.08444896e-01],
         [ 2.43789398e+00,  2.51665912e+00,  2.41412431e+00],
         [ 3.27067257e-04, -5.22671460e-03, -1.75843692e-02],
         [ 1.55904860e-01,  1.00976750e-01,  1.12354075e-01],
         [ 5.65855297e-05, -4.80672835e-05, -3.58983536e-05],
         [ 2.66000000e+02,  3.18000000e+02,  2.95000000e+02],
         [-4.12309097e-03,  5.36966516e-03, -3.03464732e-03],
         [ 1.46612888e-01,  6.52332291e-02,  9.88169752e-02]]),
  array([[ 3.91586281e+03,  5.04237856e+03,  5.52307014e+03],
         [ 3.26533957e-01,  9.17798464e-01,  5.98819075e-01],
         [ 4.72085954e-01,  4.20896512e-01,  4.46460736e-01],
         [ 4.42831864e+02,  9.01984051e+02,  6.92956575e+02],
      

In [15]:
len(X)

80

In [16]:
len(X[0])

4

In [17]:
len(X[0][0])

12

In [18]:
len(X[0][0][0])

3

In [44]:
def flatten(lists,n):
    if n == 0:
        return lists
    if n == 1:
        return [x for xs in lists for x in xs]   
    else:
        return [flatten(xs,n-1) for xs in lists]
    
def flatten_agg(x, v):
    # channel aggregation
    a = 0.33
    b = 0.33
    c = 0.33
    x0  =  x[:,0]
    x1  =  x[:,1]
    x2  =  x[:,2]
    xf = a*x0 + b*x1 + c*x2
    return xf

In [45]:
def feature_agg(x):
    x0 = np.array(flatten_agg(x[0],1))
    x1 = np.array(flatten_agg(x[1],1))
    x2 = np.array(flatten_agg(x[2],1))
    x3 = np.array(flatten_agg(x[3],1))
    # tunable parameters
    a = 0.25
    b = 0.25
    c = 0.25
    d = 0.25
    x_o = a*x0 + b*x1 + c*x2 + d*x3
    return x_o

In [46]:
zzz = feature_agg(X[0])

In [47]:
zzz.shape

(12,)

In [96]:
# feature reshaping from arrays

X = [feature_agg(x) for x in X]
X2 = [feature_agg(x) for x in X2]
X3 = [feature_agg(x) for x in X3]

In [49]:
label_encoder.classes_

array(['cereal', 'fruitsalad', 'sandwich'], dtype='<U10')

In [50]:
mlb.classes_

array(['Add', 'Cut', 'Mix', 'Open', 'Peel', 'Pour', 'Put', 'Take', 'Wash',
       'other'], dtype=object)

In [51]:
# 4, 80, len, 3

In [52]:
all_sensors

['right_arm', 'right_wrist', 'left_hip', 'left_wrist']

In [53]:
# generating labels for test set

In [54]:
mlb.classes_

array(['Add', 'Cut', 'Mix', 'Open', 'Peel', 'Pour', 'Put', 'Take', 'Wash',
       'other'], dtype=object)

In [55]:
y[:5]

['sandwich', 'sandwich', 'cereal', 'fruitsalad', 'cereal']

In [56]:
y_ohe[:5]

array([2, 2, 0, 1, 0], dtype=int64)

In [57]:
y_ml[:5]

[['other', 'Put'], ['Take'], ['Take'], ['Cut', 'Peel', 'Put'], ['Take']]

In [58]:
y_ohe

array([2, 2, 0, 1, 0, 2, 2, 2, 2, 1, 2, 2, 1, 0, 0, 0, 2, 2, 2, 0, 0, 0,
       1, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 1, 1, 2, 1, 0, 1, 1, 0, 2, 2, 2,
       2, 2, 0, 0, 1, 0, 1, 0, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2,
       1, 1, 2, 2, 2, 1, 2, 2, 0, 0, 0, 2, 2, 2], dtype=int64)

In [59]:
type(y_ohe)

numpy.ndarray

In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import sklearn

def sub_cv(clf, importance = False):
    clf = sklearn.base.clone(clf)
    Xm = X + X2
    ym = list(y_ohe) + list(y2_ohe)
    clf.fit(Xm, ym)
    if importance:
        fi1 = clf.feature_importances_
    a1 = accuracy_score(clf.predict(X3), y3_ohe)
    
    clf = sklearn.base.clone(clf)
    Xm = X + X3
    ym = list(y_ohe) + list(y3_ohe)
    clf.fit(Xm, ym)
    if importance:
        fi2 = clf.feature_importances_
    a2 = accuracy_score(clf.predict(X2), y2_ohe)
    
    clf = sklearn.base.clone(clf)
    Xm = X3 + X2
    ym = list(y3_ohe) + list(y2_ohe)
    clf.fit(Xm, ym)
    if importance:
        fi3 = clf.feature_importances_
    a3 = accuracy_score(clf.predict(X), y_ohe)
    
    if importance:
        un_f = []

        for i in range(len(X[0])):
            if fi1[i] <= 0.0005 and fi2[i] <= 0.0005 and fi3[i] <= 0.0005:
                un_f.append(i)

        print(f'un. features: {un_f}')
    
    print(f'a1: {a1}  a2: {a2}  a3: {a3}  av: {(a1+a2+a3)/3.}')

In [79]:
# simple feature reduction technique for finding the best set of features
# if removing a feature increases performance, remove it
clf = RandomForestClassifier(n_estimators = 50, max_depth=1, random_state=0)
sub_cv(clf, True)

clf = SVC(gamma='auto')
sub_cv(clf)

un. features: []
a1: 0.3106796116504854  a2: 0.3523809523809524  a3: 0.3125  av: 0.3251868546771459
a1: 0.34951456310679613  a2: 0.3619047619047619  a3: 0.2  av: 0.30380644167051934


In [98]:
# simple feature reduction technique for finding the best set of features
# if removing a feature increases performance, remove it

# with segmentation on, the result is almost same
clf = RandomForestClassifier(n_estimators = 50, max_depth=1, random_state=0)
sub_cv(clf, True)

clf = SVC(gamma='auto')
sub_cv(clf)

un. features: []
a1: 0.3106796116504854  a2: 0.3523809523809524  a3: 0.3125  av: 0.3251868546771459
a1: 0.34951456310679613  a2: 0.3619047619047619  a3: 0.2  av: 0.30380644167051934


In [47]:
dict_data = {'file_id': file_ids, 'macro': y_gen, 'micro': y_ml_gen}
out_csv = pd.DataFrame(dict_data)

In [49]:
out_csv.to_csv('cc_PseudoEmpirical_submission1.csv', index = False, header = False, sep = ';')