Process the pre-processed CSV files to create time series **per hard disk**

* we capture `NUM_DAYS_RECORD` days of data per drive, starting at `D-2` (`D` being the day of failure) and going backwards in time
* this means that, given `NUM_DAYS_RECORD` days of data, we can tell 2 days in advance that the drive will fail
* we do this for **all failure cases** and ~5% of non-failure cases

At the end, create two CSV files.

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import glob
import re

# Gather every pre-processed CSV and order the paths by descending filename,
# so index 0 is the last name in sort order and index -1 the first.
data_files = sorted(glob.glob("./processed/*.csv"), reverse=True)
print(len(data_files), "Files:")
print("From:", data_files[0], ", to:", data_files[-1])

In [None]:
# Read every CSV once and keep the frames in memory, in the same order as
# `data_files`. The "failure" column is removed and each frame is indexed by
# drive serial number so rows can be fetched with `.loc[serial_number]`.

loaded_df = []

for data_file in tqdm_notebook(data_files, total=len(data_files)):
    df = pd.read_csv(data_file).drop("failure", axis=1)
    df = df.set_index("serial_number")
    loaded_df.append(df)

In [None]:
def get_smart_data(index, serial_number):
    """Return one drive's row values from one loaded file, minus the first four.

    Args:
        index: Position of the frame in the module-level ``loaded_df`` list.
        serial_number: Index label of the drive's row in that frame.

    Returns:
        The row's values as a list with the first four entries skipped
        (presumably the non-SMART metadata columns -- verify against the
        CSV layout).

    Raises:
        IndexError: If ``index`` is outside ``loaded_df``.
        KeyError: If ``serial_number`` is not present in that frame.
    """
    drive_row = loaded_df[index].loc[serial_number]
    return list(drive_row)[4:]

In [None]:
# Accumulators filled by the extraction loop: serial number -> flat list of
# [hdd_model, hdd_size, <SMART values for each recorded day>].
fail_record = dict()  # drives that reported a failure
record = dict()       # sampled drives that did not fail

In [None]:
# Number of consecutive daily files collected for every drive.
NUM_DAYS_RECORD = 7


def _build_series(file_index, serial_number, row):
    """Assemble the flat feature list for one drive.

    Args:
        file_index: Position in ``data_files``/``loaded_df`` of the file the
            drive was picked from.
        serial_number: Serial number of the drive.
        row: The drive's row in that file (must provide "model" and
            "capacity_bytes").

    Returns:
        ``[hdd_model, hdd_size]`` followed by the values returned by
        ``get_smart_data`` for the files at offsets 2 .. NUM_DAYS_RECORD+1
        after ``file_index``.

    Raises:
        Whatever the lookups raise (e.g. IndexError when the offset runs past
        the last loaded file, KeyError when the drive is missing from one of
        the files); the caller treats any error as "skip this drive".
    """
    # Raw string: "\D" in a normal literal is an invalid string escape.
    hdd_model = int(re.sub(r"\D", "", row["model"]))
    data = [hdd_model, row["capacity_bytes"]]
    # Offset 0 is the file the drive was picked from; collection starts two
    # files further along the list.
    for d in range(2, 2 + NUM_DAYS_RECORD):
        data += get_smart_data(file_index + d, serial_number)
    return data


for n, data_file in tqdm_notebook(enumerate(data_files), total=len(data_files)):
    print("File:", data_file)
    try:
        df = pd.read_csv(data_file)
        serial_numbers = list(df["serial_number"])
        failures = list(df["failure"])
        df.set_index("serial_number", inplace=True)
        # Non-failed drives are sampled every `interval` rows. Clamp to 1 so a
        # short file (fewer than 80 rows) cannot produce a zero interval and a
        # ZeroDivisionError that would discard the whole file.
        interval = max(1, int(len(failures) / 40) - 1)
        for i, failure in enumerate(failures):
            if int(failure) == 1:
                # Every failed drive is kept.
                target = fail_record
            elif i % interval == 0 and int(failure) == 0:
                # Only a sample of healthy drives is kept.
                target = record
            else:
                continue

            serial_number = serial_numbers[i]
            try:
                target[serial_number] = _build_series(
                    n, serial_number, df.loc[serial_number]
                )
            except Exception as e:
                # Best effort: a drive whose history is incomplete is reported
                # and skipped, the rest of the file is still processed.
                print(e)
    except Exception as e:
        print("file failed", e)

In [None]:
# Show the length and content of the first stored failure record as a sanity check.
_sample = fail_record[list(fail_record)[0]]
print(len(_sample), "length:\n", _sample)

In [None]:
# Column names for the output CSV: the label, the two drive attributes, then
# one generic "featN" name per remaining value of a stored record.
_n_features = len(fail_record[list(fail_record)[0]]) - 2
header_row = ["failure", "hdd_model", "hdd_capacity"]
header_row = header_row + ["feat{}".format(i) for i in range(_n_features)]

In [None]:
# The header has exactly one more column (the leading "failure" label) than a
# stored record; check this against the first entry of both dictionaries.
for _records in (fail_record, record):
    assert len(header_row) == len(_records[list(_records)[0]]) + 1

Replace previously generated CSV files

In [None]:
!rm *.csv

In [None]:
import csv

# Files handed to csv.writer are opened with newline='' as the csv module
# requires; otherwise platforms that translate "\n" would emit "\r\r\n".

# Healthy drives: label 0 prepended to every record.
# NOTE(review): no header row is written here; presumably the later
# `cat *.csv > merged.csv` step relies on the header coming from fail.csv
# only -- confirm before adding one.
with open('ok.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows([0] + value for value in record.values())

# Failed drives: header row first, then label 1 prepended to every record.
with open('fail.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header_row)
    writer.writerows([1] + value for value in fail_record.values())

In [None]:
!cat *.csv > merged.csv