<h1><center>Reduce HDFS</center></h1>

## Read original data (.csv)

In [238]:
# Parsed HDFS log; the code below relies on its 'EventId' and 'blk_id' columns.
hdfs_csv = pd.read_csv('../Anomaly-Detection/Models/loglizer/data/HDFS/hdfs_structured.csv')

# Number the distinct event ids in sorted order. Iterating the bare set would
# follow hash order, which for strings changes between interpreter runs, so the
# integer encoding would not be reproducible after a kernel restart.
# key=str keeps the sort well-defined even if the column mixes value types.
events = set(hdfs_csv['EventId'])
events_mapping = {event: idx for idx, event in enumerate(sorted(events, key=str))}

# Replace every event id by its integer code, then collect the rows of each
# block into lists (groupby keeps the original row order inside a group).
hdfs_csv['EventId'] = hdfs_csv['EventId'].map(events_mapping)
hdfs_grouped = hdfs_csv.groupby('blk_id').agg(list)

# Ground-truth labels: 'Normal' -> 0, anything else -> 1.
labels = pd.read_csv('../Anomaly-Detection/Data/HDFS_Drain_Parsed/anomaly_label.csv')
labels['Label'] = (labels['Label'] != 'Normal').astype(int)
# dict() over the row pairs needs exactly two columns, read as (key, value).
# NOTE(review): presumably (block id, label) -- confirm against the CSV header.
block_labels = dict(labels.values)

# x: one event-code sequence per block; y: the label of each block, in the same
# order. A block that is absent from the label file gets None.
x = hdfs_grouped['EventId'].values
y = [block_labels.get(blk) for blk in hdfs_grouped.index]

In [286]:
# Persist the per-block sequences and their labels as one .npz archive.
np.savez(
    file='HDFS.npz',
    x_data=x,
    y_data=y,
)

### Work on HDFS.npz

In [338]:
# Load data.
# NOTE: allow_pickle=True un-pickles the stored object arrays; only use it on
# archives produced by this notebook.
hdfs = np.load('HDFS.npz', allow_pickle=True)

# Every hdfs[...] lookup reads (and un-pickles) the array from the archive
# again, so fetch each array exactly once.
hdfs_sequences = hdfs['x_data']
hdfs_labels = hdfs['y_data']

# Split into anomalies (label 1) and normal points (label 0).
anomalies = hdfs_sequences[hdfs_labels == 1]
normal = hdfs_sequences[hdfs_labels == 0]

### Split data into train and test sets

In [340]:
from sklearn.model_selection import train_test_split

# Split configuration. Both lists are read again by later cells, so they are
# defined once here instead of inside the loop.
train_ratios = [0.5, 0.75, 0.9]
reduction_percentages = [0.0, 0.25, 0.50, 0.75, 0.99, 0.999]

# One fixed seed drives every random draw in this cell, so re-running it
# regenerates the same files.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)


def _subsample(sequences, reduct_percentage, rng):
    """Return a random subset of `sequences` with `reduct_percentage` of it dropped.

    Items are drawn WITHOUT replacement (a truncated random permutation of the
    indices), so no sequence appears twice and a reduction of 0.0 keeps every
    item. Also works when `sequences` is empty or nothing is kept.
    """
    num_reduced = int(len(sequences) * (1 - reduct_percentage))
    keep = rng.permutation(len(sequences))[:num_reduced]
    return sequences[keep]


def _save_labelled(path, normal_part, abnormal_part):
    """Save the normal sequences (label 0) followed by the abnormal ones (label 1).

    np.savez appends '.npz' to `path` when it is missing.
    """
    x_out = np.concatenate((normal_part, abnormal_part))
    y_out = np.concatenate((np.zeros(len(normal_part)), np.ones(len(abnormal_part))))
    np.savez(path, x_data=x_out, y_data=y_out)


# Pass 1: split both classes, save the full training set and the reduced test sets.
for train_ratio in train_ratios:

    normal_train, normal_test = train_test_split(normal, train_size=train_ratio, random_state=SPLIT_SEED)
    anomalies_train, anomalies_test = train_test_split(anomalies, train_size=train_ratio, random_state=SPLIT_SEED)

    #Train part
    _save_labelled('HDFS_train_'+str(train_ratio), normal_train, anomalies_train)

    for reduct_percentage in reduction_percentages:
        #Test part: keep every normal sequence, drop a share of the anomalies
        reduced_test_anomalies = _subsample(anomalies_test, reduct_percentage, split_rng)
        _save_labelled('HDFS_test_'+ str(train_ratio) + '_'+ str(reduct_percentage), normal_test, reduced_test_anomalies)

# Pass 2: reduced variants of each saved training set. Separate variable names
# are used so the notebook-level `normal` / `anomalies` arrays are not
# overwritten, which would silently change the input of pass 1 on a re-run.
for train_ratio in train_ratios:
    with np.load('HDFS_train_'+str(train_ratio) + '.npz', allow_pickle=True) as train_data:
        train_sequences = train_data['x_data']
        train_labels = train_data['y_data']

    train_anomalies = train_sequences[train_labels == 1]
    train_normal = train_sequences[train_labels == 0]

    for reduct_percentage in reduction_percentages:
        reduced_train_anomalies = _subsample(train_anomalies, reduct_percentage, split_rng)
        _save_labelled('HDFS_train_'+ str(train_ratio) + '_'+ str(reduct_percentage), train_normal, reduced_train_anomalies)

In [None]:
import shutil, os 

# Select the reduced datasets, named HDFS_<train|test>_<train_ratio>_<reduction>.npz.
# The length filter leaves the shorter HDFS_train_<train_ratio>.npz files
# (at most 19 characters) in the working directory, where the DeepLog export
# cell reads them.
hdfs_files = [x for x in os.listdir() if ('HDFS' in x) and (len(x) > 20) and ('.ipynb' not in x) and (x not in ['HDFS.npz', 'HDFS_Data'])]
for file in hdfs_files:
    print(file)
    name_parts = file.split('_')
    train_test = name_parts[1]
    train_ratio = name_parts[2]
    reduction = name_parts[-1].split('.npz')[0]

    # os.replace does not create missing folders, so build the target
    # directory HDFS_Data/<train|test>/<train_ratio>/<reduction>/ first.
    dest_dir = os.path.join('HDFS_Data', train_test, train_ratio, reduction)
    os.makedirs(dest_dir, exist_ok=True)
    os.replace(file, os.path.join(dest_dir, file))
    print('------')

## Prepare data for DeepLog

In [353]:
def stringify(l):
    """Return the items of `l`, each converted with str(), joined by single spaces."""
    as_text = map(str, l)
    return ' '.join(as_text)

def prepare_deeplog(path):
    """Turn a saved .npz dataset into two text blobs, one per class.

    Parameters
    ----------
    path : str
        Path of an archive holding 'x_data' (one sequence per entry) and
        'y_data' (0 for normal, 1 for abnormal).

    Returns
    -------
    (data_normal, data_abnormal) : tuple of str
        One line per sequence, the items of a sequence separated by single
        spaces. A class without sequences yields an empty string.
    """
    # NOTE: allow_pickle=True un-pickles the stored object arrays; only use it
    # on archives produced by this notebook.
    # The context manager closes the archive handle, and each array is read
    # once (every lookup on the archive reads it from disk again).
    with np.load(path, allow_pickle=True) as data:
        sequences = data['x_data']
        labels = data['y_data']

    data_normal = '\n'.join(stringify(seq) for seq in sequences[labels == 0])
    data_abnormal = '\n'.join(stringify(seq) for seq in sequences[labels == 1])

    return data_normal, data_abnormal

In [362]:
# Export every split as DeepLog text files:
#   DeepLog/data/<train_ratio>/              -> training files
#   DeepLog/data/<train_ratio>/<reduction>/  -> test files for that reduction
for train_ratio in train_ratios:
    path_train = 'DeepLog/data/' + str(train_ratio) + '/'
    # makedirs also creates missing parent folders and accepts existing ones.
    os.makedirs(path_train, exist_ok=True)

    # The training set does not depend on the reduction percentage, so it is
    # read and written once per ratio. The file is read from the working directory.
    path_file_train = ''
    data_train_normal, data_train_abnormal = prepare_deeplog(path_file_train + 'HDFS_train_' + str(train_ratio) + '.npz')

    with open(path_train + 'HDFS_train_normal_' + str(train_ratio) + '.txt', 'w') as out_file:
        out_file.write(data_train_normal)

    with open(path_train + 'HDFS_train_abnormal_' + str(train_ratio) + '.txt', 'w') as out_file:
        out_file.write(data_train_abnormal)

    for reduction_perc in reduction_percentages:
        path_test = 'DeepLog/data/' + str(train_ratio) + '/' + str(reduction_perc) + '/'
        os.makedirs(path_test, exist_ok=True)

        path_file_test = 'HDFS_Data/test/' + str(train_ratio) + '/' + str(reduction_perc) + '/'
        data_test_normal, data_test_abnormal = prepare_deeplog(path_file_test + 'HDFS_test_'+ str(train_ratio) + '_' + str(reduction_perc) + '.npz')

        with open(path_test + 'HDFS_test_normal_' + str(train_ratio) + '_' + str(reduction_perc) + '.txt', 'w') as out_file:
            out_file.write(data_test_normal)

        with open(path_test + 'HDFS_test_abnormal_' + str(train_ratio) + '_' + str(reduction_perc)  + '.txt', 'w') as out_file:
            out_file.write(data_test_abnormal)

**Load one train/test split for DeepLog (train ratio 0.5, reduction 0.25)**

In [140]:
# Convert one split to DeepLog text: the training set for train ratio 0.5 and
# the test set for train ratio 0.5 with reduction 0.25. Each call returns a
# (normal_text, abnormal_text) pair.
# NOTE(review): both archives are read from the working directory, while the
# file-moving cell relocates reduced files under HDFS_Data/ -- confirm the test
# archive is still present here before running this cell.
data_train_normal, data_train_abnormal = prepare_deeplog('HDFS_train_0.5.npz')
data_test_normal, data_test_abnormal = prepare_deeplog('HDFS_test_0.5_0.25.npz')