In [None]:
import pandas as pd
import os as os
import numpy as np
# Defining the paths where the datasets are stored
pth1 = 'dataset/BrokenTooth'
pth2 = 'dataset/Healthy'


def _load_labelled_runs(directory, gearbox_status):
    """Read every file in `directory` into a list of labelled DataFrames.

    Each file is read with `pd.read_csv` and gets two extra columns:
    `load` (derived from the file name) and `gearbox_status` (the value
    passed in, identical for every row of every file in the directory).

    Parameters
    ----------
    directory : str
        Folder whose entries are all read as CSV files.
    gearbox_status : int
        Label written to the `gearbox_status` column.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, ordered by file name.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting makes the list
    # positions reproducible, which matters because later cells pair
    # broken[i] with healthy[i] by position.
    for file_name in sorted(os.listdir(directory)):
        this_df = pd.read_csv(os.path.join(directory, file_name))
        # NOTE(review): assumes the 6th character of the file name is the
        # tens digit of the load -- confirm against the dataset naming.
        this_df['load'] = 10*int(file_name[5])
        this_df['gearbox_status'] = gearbox_status
        frames.append(this_df)
    return frames


# Reading each file in the BrokenTooth and Healthy directories into a list of
# pandas DataFrames; gearbox_status is 0 for broken-tooth runs, 1 for healthy.
broken = _load_labelled_runs(pth1, 0)
healthy = _load_labelled_runs(pth2, 1)

print(broken[0])

In [None]:
# Aggregating dataset based on health status.
# A single concat over each list stacks every loaded frame at once; it does
# not assume exactly ten files and avoids re-copying the growing frame on
# every iteration.
broken_agg = pd.concat(broken, ignore_index=True)
healthy_agg = pd.concat(healthy, ignore_index=True)

# NOTE(review): this prints the first per-file frame, not broken_agg --
# confirm which one was meant to be inspected here.
print(broken[0])

In [35]:
import math

def divide_into(df, n):
    """Split `df` into consecutive row chunks of at most `n` rows each.

    Chunks are taken positionally (`iloc`) in the original row order; the
    last chunk holds the remainder and may be shorter than `n`. An empty
    frame yields an empty list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Any columns are accepted.
    n : int
        Maximum number of rows per chunk; must be positive.

    Returns
    -------
    list of pandas.DataFrame
        The chunks, in order.

    Raises
    ------
    ValueError
        If `n` is not positive.
    """
    if n < 1:
        raise ValueError('group size n must be a positive integer')
    # Use the real row count: counting non-null values of one named column
    # would drop trailing rows whenever that column contains NaNs.
    return [df.iloc[start:start + n, :] for start in range(0, len(df), n)]

def get_group_stdevs(df, n):
    """Compute per-column standard deviations over consecutive chunks of `df`.

    `df` is split with `divide_into(df, n)` and `DataFrame.std` is applied to
    each chunk, giving one output row per chunk. The `load` and
    `gearbox_status` columns are then overwritten with each chunk's own
    label value, so they carry the label rather than its standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; must contain `load` and `gearbox_status` columns.
    n : int
        Number of rows per chunk.

    Returns
    -------
    pandas.DataFrame
        One row per chunk, indexed 0..k-1, with the same columns as `df`.
    """
    groups = divide_into(df, n)
    stdevs = pd.DataFrame([group.std() for group in groups])
    # Take each label from the first row of its own chunk. Assigning the raw
    # column (stdevs[col] = df[col]) would align chunk numbers against raw
    # row labels, which only works by coincidence for a default 0..N-1 index.
    for label_col in ('load', 'gearbox_status'):
        stdevs[label_col] = [group[label_col].iloc[0] for group in groups]

    return stdevs

# Per-file standard deviations over 100-row windows for every healthy run.
# Iterating over the list itself avoids assuming exactly ten files were loaded.
healthy_stdev = [get_group_stdevs(run_df, 100) for run_df in healthy]
print(healthy_stdev)

[           a1        a2        a3        a4  load  gearbox_status
0    4.518469  2.627219  3.129601  3.046165     0               1
1    3.463637  2.616907  2.286667  2.389276     0               1
2    4.461867  2.833214  2.413283  2.977934     0               1
3    4.381037  4.703040  2.668196  2.650610     0               1
4    3.880276  3.054690  2.502711  2.147417     0               1
..        ...       ...       ...       ...   ...             ...
884  3.644520  2.551445  2.531753  2.083169     0               1
885  3.096537  2.772343  1.819461  1.997667     0               1
886  4.810296  2.917624  2.365434  2.678131     0               1
887  3.814070  2.744130  2.529900  2.127945     0               1
888  4.740960  2.435926  2.100364  2.925200     0               1

[889 rows x 6 columns],             a1        a2        a3        a4  load  gearbox_status
0    10.138598  4.113320  4.783741  4.474848    10               1
1     7.550895  6.508534  3.872753  4.031962    

In [None]:
# Per-file standard deviations over 100-row windows for every broken-tooth run.
# Iterating over the list itself avoids assuming exactly ten files were loaded.
broken_stdev = [get_group_stdevs(run_df, 100) for run_df in broken]
broken_stdev[0].describe()

In [None]:
# NOTE(review): this repeats the healthy_stdev computation from the cell that
# defines get_group_stdevs -- confirm whether both copies are needed.
# Windowed (100-row) standard deviations for each healthy run, one frame per file.
healthy_stdev = [get_group_stdevs(run_df, 100) for run_df in healthy]
print(healthy_stdev)

In [None]:
# Stack the first broken-tooth and first healthy feature frames into one table.
# ignore_index=True gives the combined frame unique row labels; without it both
# halves would keep their own 0..k-1 labels and every label would appear twice.
dataset = pd.concat([broken_stdev[0], healthy_stdev[0]], axis=0, ignore_index=True)
dataset.describe()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from pylab import rcParams

# Sensor columns to compare between the two gearbox conditions.
feature_cols = ['a1', 'a2', 'a3', 'a4']

# One subplot row per feature, 4 inches tall each, so the figure has no
# empty rows below the last histogram.
plt.figure(figsize=(15, 4*len(feature_cols)))
gs = gridspec.GridSpec(len(feature_cols), 1)
for i, cn in enumerate(feature_cols):
    ax = plt.subplot(gs[i])
    # gearbox_status is 0 for broken-tooth runs and 1 for healthy runs (set
    # when the files are loaded). Labels are attached to each plot call so
    # the legend cannot get out of step with the colours.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True, stat="density") once the
    # installed version is confirmed.
    sns.distplot(dataset[cn][dataset.gearbox_status == 0], bins=50, color="red", label='broken', ax=ax)
    sns.distplot(dataset[cn][dataset.gearbox_status == 1], bins=50, color="lime", label='healthy', ax=ax)
    ax.set_xlabel('')
    ax.legend()
    ax.set_title('histogram for ' + str(cn))
plt.show()