# Prerequisites
Understanding in
- machine learning and deep learning
- python syntax
- python libraries: numpy, pandas, Pytorch

# Introduction
This notebook shows the study of the [CWRU Bearing Dataset](https://csegroups.case.edu/bearingdatacenter/home), which contains data of normal and fault bearings. Artificial defects of different diameters (0.007 ~ 0.028 Inches) are manufactured at different locations of the bearings: inner raceway(IR), outer raceway(OR) and ball(B) defects. 

Vibration data was recorded for motor loads of 0 to 3 hp (motor speed of 1797 to 1720 RPM) using accelerometers at the drive end (DE) and fan end (FE) and the data is stored as Matlab files. The sampling rate is 12 kHz and each Matlab file contains between ~120k to ~240k sample points. For more information please refer to the [website](https://csegroups.case.edu/bearingdatacenter/home).

This study focuses on the classification of the drive end bearing defects using only the signal data at **DE**. It is a **multiclass classification** problem. The input is the vibration signal data at DE and the output is the type of defects:
- 0 : Normal (N), 
- 1 : Fault at Ball (B),
- 2 : Fault at Inner Raceway (IR), 
- 3 : Fault at Outer Raceway (OR), 



# Import

In [1]:
# Data science libraries
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Pytorch
import torch
from torch import nn
from torch.nn import functional as F
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader
from torch import optim
from torch.nn.modules.loss import CrossEntropyLoss

# Others
from IPython.core.debugger import set_trace
from pathlib import Path

from helper import get_df_all, download
from train_helper import get_dataloader, fit, validate 
import nn_model
from data_urls import URLS

In [2]:
working_dir = Path('.')
DATA_PATH = Path("./Data")
save_model_path = working_dir / 'Model'
DE_path = DATA_PATH / '12k_DE'
FE_path = DATA_PATH / '12k_FE'

for path in [DATA_PATH, save_model_path]:
    if not path.exists():
        path.mkdir(parents=True)

In [3]:
# # Uncomment this to download the 12k_DE data if needed
# for name, url in URLS["DE_12k"].items():
#     download(url, DE_path, name, suffix=".mat")

In [4]:
#### HYPERPARAMETERS ####
bs = 64
lr = 0.001
wd = 1e-5
betas=(0.99, 0.999)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
random_seed = 42

# Load Data and Preprocessing

In [5]:
df_all = get_df_all(DE_path, segment_length=500)
features = df_all.columns[2:]
target = 'label'

In [6]:
df_all.sample(5)

Unnamed: 0,label,filename,0,1,2,3,4,5,6,7,...,490,491,492,493,494,495,496,497,498,499
10097,0,Normal_2.mat,-0.037342,-0.074058,-0.083029,-0.049025,0.017732,0.053197,0.036299,0.003964,...,0.13414,0.136852,0.090122,0.06321,0.084072,0.111818,0.099718,0.041514,0.00146,-0.002086
4509,2,IR007_2.mat,0.400403,0.019817,-0.224973,-0.043533,0.139857,0.110131,0.129298,-0.027939,...,-0.162922,-0.030538,0.115491,-0.126537,-0.258922,-0.062538,0.211166,0.335916,-0.022091,-0.173156
11519,3,OR007@12_1.mat,0.347124,0.272404,-0.435164,-0.428504,0.619365,0.444098,-0.879749,-0.239429,...,0.052467,0.152689,-0.049055,-0.146029,0.083004,0.099735,-0.166496,-0.122151,0.085603,-0.060588
16852,3,OR021@3_3.mat,0.31951,0.35622,0.131897,-0.212465,0.012183,0.087553,-0.054253,-0.335753,...,0.004386,0.066111,-0.01803,-0.011046,0.146841,0.110131,0.068872,-0.004386,0.020304,-0.047918
2086,1,B021_0.mat,-0.018193,0.208892,0.097461,0.01998,0.150902,0.144405,-0.028101,0.032487,...,-0.024853,0.128161,0.092426,-0.067573,-0.013645,0.139694,0.031188,-0.063512,0.07001,0.071634


In [7]:
df_all.shape

(17987, 502)

In [8]:
## Split the data into train and validation set
X_train, X_valid, y_train, y_valid = train_test_split(df_all[features], 
                                                      df_all[target], 
                                                      test_size=0.20, random_state=random_seed, shuffle=True
                                                     )

In [9]:
## Create DataLoader of train and validation set
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_valid = torch.tensor(X_valid.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_valid = torch.tensor(y_valid.values, dtype=torch.long)

train_ds = TensorDataset(X_train, y_train)
valid_ds = TensorDataset(X_valid, y_valid)
train_dl, valid_dl = get_dataloader(train_ds, valid_ds, bs)

# Training with Adams Optimizer

In [10]:
## Instantiate model, optimizer and loss function
model = nn_model.CNN_1D_2L(len(features))
model.to(device)
opt = optim.Adam(model.parameters(), lr=lr, betas=betas, weight_decay=wd)
loss_func = CrossEntropyLoss()

In [11]:
%%time
## Train
epochs = 20
model, metrics = fit(epochs, model, loss_func, opt, train_dl, valid_dl, train_metric=False)

EPOCH 	 Train Loss 	 Val Loss 	 Train Acc 	 Val Acc 	
0 	 0.69507 	 0.55293 	 0.00000 	0.74291 	
1 	 0.28088 	 0.13620 	 0.00000 	0.96470 	
2 	 0.13778 	 0.10443 	 0.00000 	0.96776 	
3 	 0.11517 	 0.10766 	 0.00000 	0.96331 	
4 	 0.10318 	 0.05372 	 0.00000 	0.98221 	
5 	 0.06259 	 0.11454 	 0.00000 	0.95775 	
6 	 0.06185 	 0.02922 	 0.00000 	0.99055 	
7 	 0.06488 	 0.12063 	 0.00000 	0.95497 	
8 	 0.11528 	 0.06886 	 0.00000 	0.97526 	
9 	 0.08687 	 0.08658 	 0.00000 	0.96998 	
10 	 0.08889 	 0.03706 	 0.00000 	0.98555 	
11 	 0.09271 	 0.02116 	 0.00000 	0.99444 	
12 	 0.05223 	 0.02518 	 0.00000 	0.99333 	
13 	 0.05163 	 0.03168 	 0.00000 	0.98694 	
14 	 0.03327 	 0.01448 	 0.00000 	0.99500 	
15 	 0.03796 	 0.03786 	 0.00000 	0.98583 	
16 	 0.06547 	 0.03188 	 0.00000 	0.98694 	
17 	 0.04638 	 0.02037 	 0.00000 	0.99416 	
18 	 0.04591 	 0.15368 	 0.00000 	0.94302 	
19 	 0.05200 	 0.05724 	 0.00000 	0.98277 	
Wall time: 23min 59s


In [12]:
%%time
## Train
epochs = 20
model, metrics = fit(epochs, model, loss_func, opt, train_dl, valid_dl, train_metric=False)

EPOCH 	 Train Loss 	 Val Loss 	 Train Acc 	 Val Acc 	
0 	 0.04319 	 0.01822 	 0.00000 	0.99305 	
1 	 0.03688 	 0.03391 	 0.00000 	0.98471 	
2 	 0.03701 	 0.05930 	 0.00000 	0.97749 	
3 	 0.03749 	 0.02099 	 0.00000 	0.99250 	
4 	 0.06485 	 0.01811 	 0.00000 	0.99389 	
5 	 0.05885 	 0.03004 	 0.00000 	0.99055 	
6 	 0.05891 	 0.01545 	 0.00000 	0.99361 	
7 	 0.03521 	 0.01278 	 0.00000 	0.99611 	
8 	 0.04258 	 0.01375 	 0.00000 	0.99611 	
9 	 0.04958 	 0.05147 	 0.00000 	0.97971 	
10 	 0.05267 	 0.03447 	 0.00000 	0.98805 	
11 	 0.02937 	 0.01268 	 0.00000 	0.99583 	
12 	 0.03082 	 0.04544 	 0.00000 	0.98499 	
13 	 0.02965 	 0.03133 	 0.00000 	0.98749 	
14 	 0.02374 	 0.01512 	 0.00000 	0.99389 	
15 	 0.02338 	 0.01044 	 0.00000 	0.99555 	
16 	 0.02206 	 0.03224 	 0.00000 	0.98833 	
17 	 0.02528 	 0.01274 	 0.00000 	0.99583 	
18 	 0.02112 	 0.01835 	 0.00000 	0.99250 	
19 	 0.03252 	 0.02516 	 0.00000 	0.99111 	
Wall time: 24min 20s


# Save trained model

In [13]:
torch.save(model.state_dict(), save_model_path / 'model.pth')

In [13]:
model2 = nn_model.CNN_1D_2L(len(features))
loss_func = CrossEntropyLoss()

In [14]:
model2.load_state_dict(torch.load(save_model_path / 'model.pth'))
model2.eval()

CNN_1D_2L(
  (layer1): Sequential(
    (0): Conv1d(1, 64, kernel_size=(9,), stride=(1,), padding=(4,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5)
    (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Conv1d(64, 128, kernel_size=(5,), stride=(1,), padding=(2,))
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5)
    (4): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  )
  (linear1): Linear(in_features=16000, out_features=4, bias=True)
)

In [15]:
%%time
print(validate(model2, valid_dl, loss_func))

(0.02515624563769852, 0.991106170094497, (array([2, 0, 1, ..., 3, 1, 3], dtype=int64), array([2, 0, 1, ..., 3, 1, 3], dtype=int64)))
Wall time: 3.93 s


# 12k_FE

In [8]:
from helper import matfile_to_df, matfile_to_dic, divide_signal

In [4]:
df_FE = matfile_to_df(FE_path)

In [7]:
df_FE = df_FE.drop(labels='DE_time', axis=1)

In [10]:
df_FE = divide_signal(df_FE, segment_length=500)

In [11]:
map_label = {'N':0, 'B':1, 'IR':2, 'OR':3}
df_FE['label'] = df_FE['label'].map(map_label)

In [12]:
df_FE.head()

Unnamed: 0,label,filename,0,1,2,3,4,5,6,7,...,490,491,492,493,494,495,496,497,498,499
0,1,B007_0.mat,0.319666,0.32617,-0.260481,0.031056,0.44698,0.100485,-0.281294,-0.006992,...,0.042438,0.007805,-0.084713,-0.064063,0.099672,-0.041625,-0.268123,-0.002114,0.07447,-0.060486
1,1,B007_0.mat,0.100973,0.115769,-0.13772,0.127476,0.268123,-0.188125,-0.163573,0.051218,...,0.011382,-0.086339,-0.105688,-0.029593,-0.00439,-0.071543,-0.180646,0.029105,0.133655,-0.106989
2,1,B007_0.mat,-0.159183,0.16276,0.082437,-0.057722,0.055283,-0.005203,-0.012195,0.123574,...,-0.078209,0.193979,0.042926,-0.218694,-0.055283,0.187963,-0.065364,-0.273326,0.039674,0.176256
3,1,B007_0.mat,-0.146988,-0.017398,0.256741,0.039186,-0.040487,0.249262,0.163898,-0.031056,...,-0.259505,0.089754,0.127476,-0.215767,-0.190889,-0.015284,0.101461,-0.098209,-0.219019,0.082274
4,1,B007_0.mat,0.144224,0.015122,0.10894,0.246823,0.055446,0.118696,0.230075,0.055283,...,-0.006341,-0.021788,0.039349,-0.046178,-0.068941,0.040812,0.043576,-0.139671,-0.173817,-0.043901


In [13]:
features = df_FE.columns[2:]
target = 'label'

In [14]:
X_test, y_test = df_FE[features], df_FE[target]

In [15]:
## Create DataLoader of train and validation set
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.long)

test_ds = TensorDataset(X_test, y_test)
test_dl = DataLoader(test_ds, batch_size=64)

In [16]:
model2 = nn_model.CNN_1D_2L(len(features))
model2.load_state_dict(torch.load(save_model_path / 'model.pth'))
model2.eval()
loss_func = CrossEntropyLoss()

In [17]:
%%time
print(validate(model2, test_dl, loss_func))

(14.914141002260743, 0.40269873324765926, (array([1, 1, 1, ..., 3, 3, 3], dtype=int64), array([3, 2, 3, ..., 2, 2, 2], dtype=int64)))
Wall time: 12.2 s
