In [1]:
# Make pics for test data

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import trange,tqdm

In [2]:

import warnings
warnings.filterwarnings('ignore')

In [3]:
from pyts.image import GramianAngularField, MarkovTransitionField, RecurrencePlot
import matplotlib.pyplot as plt
from skimage.transform import resize

def extract_features(x, method='GAF', image_size=224, graph=False):
    #TODO cmap
    if method == 'GAF':
        transformer = GramianAngularField(image_size=image_size)
    elif method == 'MTF':
        transformer = MarkovTransitionField(image_size=image_size)
    elif method == 'RP':
        transformer = RecurrencePlot(dimension=1, percentage=10)
        
    x = transformer.fit_transform(x[None,:])
    x = resize(x, (1,image_size, image_size))
    return x

def make_pics(x, step_length=1000, method = 'GAF', cmap='rainbow', path = None, additions = ''):
    if path is None:
        path = "data/images_2/"
    path += method
    if not os.path.isdir(path):
        os.mkdir(path)
    
    # Reshaping and approximate standardization with mean 5 and std 3.
    temp = (x - 5 ) / 3
    
    for i in range(x.shape[0]//(step_length)):        
        file = os.path.join(path, additions + '_' + str(i*step_length) + '.png')
        if not os.path.isfile(file):
            image_data = extract_features(temp[i:i+step_length,0],method = method)
            plt.imsave(file, image_data[0,:,:], cmap=cmap)

# make_pics(df.to_numpy(),additions = 'train')

In [4]:
submission = pd.read_csv('data/sample_submission.csv', index_col='seg_id', dtype={"time_to_failure": np.float32})

# Load each test data and create the picture
for i, seg_id in enumerate(tqdm(submission.index)):    
    x = pd.read_csv('data/test/' + seg_id + '.csv').to_numpy()
    make_pics(x, step_length=1000, method = 'GAF', cmap='rainbow', additions = seg_id)
#     if i > 100:
#         break

100%|██████████| 2624/2624 [3:15:15<00:00,  4.60s/it]


In [4]:
# %%time
df = pd.read_csv('data/train.csv', dtype={'acoustic_data': np.int16, 'time_to_failure': np.float32},nrows = 150*1000*500)

In [5]:
import glob
lst = []
for i, file in enumerate(tqdm(glob.glob('data/images/GAF/*.png'))):
    idx = int(file.split('/')[-1].split('.')[0])
    
    if idx < df.shape[0]:
    
        assert type(df.iloc[idx,1]) == np.float32, idx

        lst.append({"name":file,'label':np.float32(df.iloc[idx,1])})

print(lst[0],len(lst))
labels_df = pd.DataFrame().from_dict(lst)
labels_df.head()

100%|██████████| 554486/554486 [00:03<00:00, 145944.04it/s]


{'name': 'data/images/GAF/48465000.png', 'label': 0.42099872} 75000


Unnamed: 0,label,name
0,0.420999,data/images/GAF/48465000.png
1,8.797798,data/images/GAF/16218000.png
2,2.319097,data/images/GAF/41159000.png
3,3.746998,data/images/GAF/35662000.png
4,12.121798,data/images/GAF/58014000.png


In [6]:
from fastai.vision import *
data = (ImageList.from_df(labels_df,"",'name') #Where to find the data? -> in path and its subfolders
        .split_by_rand_pct(0.2)              #How to split in train/valid? -> use the folders
        .label_from_df('label')
#         .transform(tfms, size=224)       #Data augmentation? -> use tfms with a size of 64
        .databunch())                   #Finally? -> use the defaults for conversion to ImageDataBunch

learn = cnn_learner(data, models.resnet18, metrics=mse)

In [None]:
learn.load('GAF-learner')
preds,losses = learn.get_preds(ds_type=DatasetType.Train)

In [31]:
import glob
# print(glob.glob('data/images/GAF/GAF/*.png')[:10])
lst = []
for i, file in enumerate(tqdm(glob.glob('data/images/GAF/GAF/*_test_seg_*.png'))):
    idx = file.split('/')[-1].split("_test_")[0]
    seg = file.split('/')[-1].split("_test_seg_")[1].split('.')[0]
    lst.append({"name":file,'idx':int(idx),'seg':seg,'unique':seg+str(idx)})
#     break

test_labels_df = pd.DataFrame().from_dict(lst)
test_labels_df.head()








  0%|          | 0/393600 [00:00<?, ?it/s][A[A[A[A[A[A





  8%|▊         | 32082/393600 [00:00<00:01, 320811.89it/s][A[A[A[A[A[A





 17%|█▋        | 67452/393600 [00:00<00:00, 330017.36it/s][A[A[A[A[A[A





 26%|██▌       | 102872/393600 [00:00<00:00, 336917.58it/s][A[A[A[A[A[A





 35%|███▌      | 139211/393600 [00:00<00:00, 344443.97it/s][A[A[A[A[A[A





 44%|████▍     | 173251/393600 [00:00<00:00, 343217.86it/s][A[A[A[A[A[A





 53%|█████▎    | 207021/393600 [00:00<00:00, 341541.78it/s][A[A[A[A[A[A





 61%|██████    | 239465/393600 [00:00<00:00, 336223.49it/s][A[A[A[A[A[A





 69%|██████▊   | 270500/393600 [00:00<00:00, 327094.31it/s][A[A[A[A[A[A





 78%|███████▊  | 307036/393600 [00:00<00:00, 337704.73it/s][A[A[A[A[A[A





 87%|████████▋ | 341828/393600 [00:01<00:00, 340704.91it/s][A[A[A[A[A[A





 96%|█████████▌| 376889/393600 [00:01<00:00, 343614.62it/s][A[A[A[A[A[A





100%|██████████| 

Unnamed: 0,idx,name,seg,unique
0,7000,data/images/GAF/GAF/7000_test_seg_a1a511.png,a1a511,a1a5117000
1,61000,data/images/GAF/GAF/61000_test_seg_c11a4f.png,c11a4f,c11a4f61000
2,30000,data/images/GAF/GAF/30000_test_seg_222c5f.png,222c5f,222c5f30000
3,123000,data/images/GAF/GAF/123000_test_seg_a5f4dd.png,a5f4dd,a5f4dd123000
4,68000,data/images/GAF/GAF/68000_test_seg_e7d1f8.png,e7d1f8,e7d1f868000


In [32]:
len(set(test_labels_df.unique))

393600

In [26]:
test_labels_df.shape

(393600, 3)

In [27]:
img_lst = ImageList.from_df(test_labels_df,"","name")
learn.data.add_test(img_lst)

In [33]:
preds,losses = learn.get_preds(ds_type=DatasetType.Test)

KeyboardInterrupt: 

In [13]:
predictions = []
for pred, row in zip(preds, labels_df.iterrows()):
    row = dict(row[1])
    row["pred"] = pred.numpy()[0]
    row['idx'] = int(row["name"].split('/')[-1].split('.')[0])
    predictions.append(row)
pred_df = pd.DataFrame().from_dict(predictions)
pred_df.to_csv('data/predictions_.csv')
print(pred_df.shape)
pred_df.head()


(75000, 4)


Unnamed: 0,idx,label,name,pred
0,48465000,0.420999,data/images/GAF/48465000.png,0.646745
1,16218000,8.797798,data/images/GAF/16218000.png,7.968527
2,41159000,2.319097,data/images/GAF/41159000.png,3.07375
3,35662000,3.746998,data/images/GAF/35662000.png,3.480164
4,58014000,12.121798,data/images/GAF/58014000.png,12.320049


In [14]:
pred_df = pd.read_csv('data/predictions_.csv')
print(pred_df.shape)
pred_df.head()

(75000, 5)


Unnamed: 0.1,Unnamed: 0,idx,label,name,pred
0,0,48465000,0.420999,data/images/GAF/48465000.png,0.646745
1,1,16218000,8.797798,data/images/GAF/16218000.png,7.968527
2,2,41159000,2.319097,data/images/GAF/41159000.png,3.07375
3,3,35662000,3.746998,data/images/GAF/35662000.png,3.480164
4,4,58014000,12.121798,data/images/GAF/58014000.png,12.320049


In [74]:
print(df.shape)
df.head()

(629145480, 2)


Unnamed: 0,acoustic_data,time_to_failure
0,12,1.4691
1,6,1.4691
2,8,1.4691
3,5,1.4691
4,8,1.4691
