Process the pre-processed CSV files to create time series **per hard disk**

* we start capture from `D-2` (`D` being day of failure) to `NUM_DAYS_RECORD` ago
* this means that with `NUM_DAYS_RECORD` of data, we know 2 days before the drive fails
* we do this for **all failure cases** and a portion of non-failures cases

At the end, create two CSV files.

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import glob
import re

data_files = glob.glob("./processed/*.csv")
data_files.sort()
data_files = data_files[::-1]
print(len(data_files), "Files:")
print("From:", data_files[0], ", to:", data_files[-1])

607 Files:
From: ./processed/2018-09-30.csv , to: ./processed/2017-02-01.csv


In [2]:
# loads all the dataframes

loaded_df = []

for n,data_file in tqdm_notebook(enumerate(data_files), total=len(data_files)):
    df = pd.read_csv(data_file)
    df = df.drop("failure", axis=1)
    df.set_index("serial_number", inplace=True)
    loaded_df.append(df)

HBox(children=(IntProgress(value=0, max=607), HTML(value='')))




In [3]:
def get_smart_data(index, serial_number):
    df = loaded_df[index]
    smart_data = list(df.loc[serial_number])[4:]
    return smart_data

In [4]:
fail_record = {}
record = {}

In [5]:
NUM_DAYS_RECORD = 7

for n,data_file in tqdm_notebook(enumerate(data_files), total=len(data_files)):
    print("File:", data_file)
    try:
        df = pd.read_csv(data_file)
        serial_numbers = list(df["serial_number"])
        failures = list(df["failure"])
        df.set_index("serial_number", inplace=True)
        interval = int(len(failures)/40)-1
        for i,failure in enumerate(failures):
            # failure
            if int(failure)==1:
                try:
                    serial_number = serial_numbers[i]
                    row = df.loc[serial_number]
                    hdd_model = int(re.sub("\D", "", row["model"]))
                    hdd_size = row["capacity_bytes"]
                    data = [hdd_model, hdd_size]
                    for d in range(2,2+NUM_DAYS_RECORD):
                        smart_data = get_smart_data(n+d, serial_number)
                        data += smart_data
                    fail_record[serial_number] = data
                except Exception as e:
                    print(e)

            # ok
            elif i%interval==0:
                if int(failure)==0:
                    try:
                        serial_number = serial_numbers[i]
                        row = df.loc[serial_number]
                        hdd_model = int(re.sub("\D", "", row["model"]))
                        hdd_size = row["capacity_bytes"]
                        data = [hdd_model, hdd_size]
                        for d in range(2,2+NUM_DAYS_RECORD):
                            smart_data = get_smart_data(n+d, serial_number)
                            data += smart_data
                        record[serial_number] = data
                    except Exception as e:
                        print(e)
    except Exception as e:
        print("file failed", e)

HBox(children=(IntProgress(value=0, max=607), HTML(value='')))

File: ./processed/2018-09-30.csv
File: ./processed/2018-09-29.csv
'the label [ZJV04SZ3] is not in the [index]'
'the label [ZCH0BET9] is not in the [index]'
File: ./processed/2018-09-28.csv
File: ./processed/2018-09-27.csv
'the label [ZA13C001] is not in the [index]'
File: ./processed/2018-09-26.csv
File: ./processed/2018-09-25.csv
File: ./processed/2018-09-24.csv
File: ./processed/2018-09-23.csv
File: ./processed/2018-09-22.csv
File: ./processed/2018-09-21.csv
File: ./processed/2018-09-20.csv
File: ./processed/2018-09-19.csv
File: ./processed/2018-09-18.csv
'the label [ZCH07Z30] is not in the [index]'
File: ./processed/2018-09-17.csv
File: ./processed/2018-09-16.csv
File: ./processed/2018-09-15.csv
File: ./processed/2018-09-14.csv
File: ./processed/2018-09-13.csv
File: ./processed/2018-09-12.csv
File: ./processed/2018-09-11.csv
File: ./processed/2018-09-10.csv
File: ./processed/2018-09-09.csv
File: ./processed/2018-09-08.csv
File: ./processed/2018-09-07.csv
File: ./processed/2018-09-06

'the label [ZCH0ACED] is not in the [index]'
File: ./processed/2018-04-30.csv
File: ./processed/2018-04-29.csv
File: ./processed/2018-04-28.csv
File: ./processed/2018-04-27.csv
File: ./processed/2018-04-26.csv
File: ./processed/2018-04-25.csv
File: ./processed/2018-04-24.csv
File: ./processed/2018-04-23.csv
'the label [ZA14LWDQ] is not in the [index]'
File: ./processed/2018-04-22.csv
File: ./processed/2018-04-21.csv
File: ./processed/2018-04-20.csv
'the label [ZCH0D398] is not in the [index]'
File: ./processed/2018-04-19.csv
File: ./processed/2018-04-18.csv
'the label [ZCH0D398] is not in the [index]'
File: ./processed/2018-04-17.csv
'the label [ZCH0D398] is not in the [index]'
File: ./processed/2018-04-16.csv
'the label [ZCH0D398] is not in the [index]'
File: ./processed/2018-04-15.csv
File: ./processed/2018-04-14.csv
'the label [ZJV03WEA] is not in the [index]'
File: ./processed/2018-04-13.csv
File: ./processed/2018-04-12.csv
File: ./processed/2018-04-11.csv
File: ./processed/2018-04

'the label [ZCH076KH] is not in the [index]'
File: ./processed/2017-12-27.csv
'the label [ZCH07WPN] is not in the [index]'
File: ./processed/2017-12-26.csv
'the label [ZCH072ZK] is not in the [index]'
'the label [Z305CVYW] is not in the [index]'
File: ./processed/2017-12-25.csv
File: ./processed/2017-12-24.csv
'the label [ZCH06ZM0] is not in the [index]'
'the label [ZCH072ZK] is not in the [index]'
'the label [ZCH077P5] is not in the [index]'
File: ./processed/2017-12-23.csv
'the label [ZCH07TLL] is not in the [index]'
'the label [ZCH07G90] is not in the [index]'
File: ./processed/2017-12-22.csv
'the label [PL1331LAGS58GH] is not in the [index]'
'the label [ZCH08192] is not in the [index]'
'the label [ZA16DDTZ] is not in the [index]'
'the label [ZCH06V7F] is not in the [index]'
File: ./processed/2017-12-21.csv
File: ./processed/2017-12-20.csv
File: ./processed/2017-12-19.csv
File: ./processed/2017-12-18.csv
File: ./processed/2017-12-17.csv
File: ./processed/2017-12-16.csv
'the label [Z

'the label [Z305B2QN] is not in the [index]'
'the label [ZCH089J2] is not in the [index]'
'the label [S301PS8K] is not in the [index]'
'the label [PL1331LAGRGEKH] is not in the [index]'
'the label [PL2331LAHBAXGJ] is not in the [index]'
'the label [Z305PE6B] is not in the [index]'
'the label [ZA16DDR2] is not in the [index]'
'the label [Z304HRZ4] is not in the [index]'
'the label [PL1331LAGRV7VH] is not in the [index]'
'the label [Z304L99Y] is not in the [index]'
'the label [ZA12683S] is not in the [index]'
'the label [ZA1814JM] is not in the [index]'
'the label [PL1331LAHBS9SH] is not in the [index]'
'the label [ZA18A99J] is not in the [index]'
'the label [Z305K1W1] is not in the [index]'
'the label [Z305B6J2] is not in the [index]'
'the label [Z305DJ2X] is not in the [index]'
'the label [Z304GK5D] is not in the [index]'
'the label [Z305D6SB] is not in the [index]'
'the label [S3010MEF] is not in the [index]'
'the label [ZA110RTA] is not in the [index]'
'the label [PL2331LAGUHXAJ] is 

'the label [ZA180QCY] is not in the [index]'
File: ./processed/2017-08-21.csv
File: ./processed/2017-08-20.csv
'the label [ZA180XWY] is not in the [index]'
File: ./processed/2017-08-19.csv
File: ./processed/2017-08-18.csv
'the label [ZA180XWT] is not in the [index]'
File: ./processed/2017-08-17.csv
File: ./processed/2017-08-16.csv
File: ./processed/2017-08-15.csv
File: ./processed/2017-08-14.csv
'the label [PL2331LAG93HNJ] is not in the [index]'
File: ./processed/2017-08-13.csv
'the label [PL1331LAGA0EUH] is not in the [index]'
File: ./processed/2017-08-12.csv
File: ./processed/2017-08-11.csv
'the label [ZA16HJMC] is not in the [index]'
File: ./processed/2017-08-10.csv
'the label [ZA181A4Q] is not in the [index]'
File: ./processed/2017-08-09.csv
'the label [W4J1GN19] is not in the [index]'
'the label [ZA181A4Q] is not in the [index]'
File: ./processed/2017-08-08.csv
'the label [ZA1815AY] is not in the [index]'
'the label [ZA180R4W] is not in the [index]'
File: ./processed/2017-08-07.cs

'the label [ZA14CHN1] is not in the [index]'
'the label [PL1331LAH3DVDH] is not in the [index]'
File: ./processed/2017-03-25.csv
'the label [PL2331LAHBH02J] is not in the [index]'
'the label [PL2331LAHDTBYJ] is not in the [index]'
File: ./processed/2017-03-24.csv
'the label [PL2331LAHDN1DJ] is not in the [index]'
'the label [ZA16EEYW] is not in the [index]'
File: ./processed/2017-03-23.csv
'the label [PL2331LAHBU7HJ] is not in the [index]'
'the label [PL2331LAHDT42J] is not in the [index]'
'the label [PL1331LAHBYHKH] is not in the [index]'
File: ./processed/2017-03-22.csv
'the label [Z300CLT4] is not in the [index]'
'the label [PL1331LAHDZTHH] is not in the [index]'
'the label [PL2331LAHDV6TJ] is not in the [index]'
'the label [PL2331LAHDH90J] is not in the [index]'
'the label [PL1331LAHD210H] is not in the [index]'
'the label [PL1331LAH3DV7H] is not in the [index]'
File: ./processed/2017-03-21.csv
'the label [PL2331LAHDRVRJ] is not in the [index]'
'the label [PL2331LAH41R4J] is not in

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out o

In [6]:
print(len(fail_record[list(fail_record.keys())[0]]), "length:\n", fail_record[list(fail_record.keys())[0]])

317 length:
 [80000055, 8001563222016, 107111848, 0.0, 0, 11, 0, 2958041911, 0.0, 13325, 0, 0.0, 11, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 34360262668.0, 0.0, 32.0, 4291.0, 7.0, 12112.0, 32, 107111848.0, 0.0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 13308.0, 45379128328.0, 95734062516.0, 0.0, 0.0, 0.0, 0.0, 0.0, 167836786, 0.0, 0, 11, 0, 2952445755, 0.0, 13301, 0, 0.0, 11, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 34360262668.0, 0.0, 32.0, 4287.0, 7.0, 12078.0, 32, 167836786.0, 0.0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 13284.0, 45342011976.0, 95587600460.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8546056, 0.0, 0, 9, 0, 2946107190, 0.0, 13278, 0, 0.0, 9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 34360262668.0, 0.0, 31.0, 4281.0, 7.0, 12034.0, 31, 8546056.0, 0.0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 13261.0, 45309584672.0, 95460737034.0, 0.0, 0.0, 0.0, 0.0, 0.0, 69200560, 0.0, 0, 9, 0, 2940258012, 0.0, 13253, 0, 0.0, 9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 34360262668.0, 0.0, 31.0, 4271.0, 7.0, 12000.0, 31, 6

In [7]:
print(len(record[list(record.keys())[0]]), "length:\n", record[list(record.keys())[0]])

317 length:
 [4000000, 4000787030016, 88752888, 0.0, 0, 10, 0, 633680208, 0.0, 14584, 0, 0.0, 10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 22.0, 0.0, 0.0, 34005.0, 22, 0.0, 0.0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 14336.0, 36652382824.0, 33340019739.0, 0.0, 0.0, 0.0, 0.0, 0.0, 145630168, 0.0, 0, 10, 0, 632451534, 0.0, 14560, 0, 0.0, 10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 22.0, 0.0, 0.0, 34005.0, 22, 0.0, 0.0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 14312.0, 36637078000.0, 33199384991.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9051704, 0.0, 0, 10, 0, 630881864, 0.0, 14536, 0, 0.0, 10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 22.0, 0.0, 0.0, 34005.0, 22, 0.0, 0.0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 14288.0, 36620777648.0, 33077905739.0, 0.0, 0.0, 0.0, 0.0, 0.0, 82744344, 0.0, 0, 10, 0, 629709520, 0.0, 14512, 0, 0.0, 10, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 22.0, 0.0, 0.0, 34005.0, 22, 0.0, 0.0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 14264.0, 366108020

In [8]:
header_row = ["failure", "hdd_model", "hdd_capacity"] + ["feat"+str(i) for i in range(len(fail_record[list(fail_record.keys())[0]])-2)]

In [9]:
assert len(header_row) == len(fail_record[list(fail_record.keys())[0]])+1
assert len(header_row) == len(record[list(record.keys())[0]])+1

Replace previously generated CSV files

In [10]:
import csv

with open('train.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(header_row)
    for key, value in fail_record.items():
        writer.writerow([1]+value)
    for key, value in record.items():
        writer.writerow([0]+value)