In [98]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# the libraries imported above - consider narrowing the filter.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [99]:
def load_sample(data_dir):
    """Read one ``.npy`` file and wrap its contents in a DataFrame.

    Parameters
    ----------
    data_dir : str or path-like
        Passed straight to ``np.load``. Despite the name, the caller in this
        notebook (``load_all_samples``) passes the path of a single file,
        not a directory.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built from the loaded array.
    """
    return pd.DataFrame(np.load(data_dir))

def load_all_samples(data_dir):
    """Load every feature ``.npy`` file in a directory into one DataFrame.

    Files whose name ends in ``.labels.npy`` are skipped. Every other
    ``.npy`` file is read with ``load_sample`` and the resulting frames are
    concatenated row-wise.

    File names are processed in sorted (lexicographic) order. ``os.listdir``
    returns entries in arbitrary order, so without sorting the row order of
    the result could differ between machines or runs.

    Parameters
    ----------
    data_dir : str or path-like
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all loaded frames. Row labels are not
        reset, so each file's own index values are kept.

    Raises
    ------
    ValueError
        If the directory contains no feature ``.npy`` files.
    """
    sample_files = [
        name
        for name in sorted(os.listdir(data_dir))
        if name.endswith(".npy") and not name.endswith(".labels.npy")
    ]
    if not sample_files:
        # pd.concat([]) would also raise ValueError, but its message does not
        # say which directory was empty.
        raise ValueError(f"No feature .npy files found in {data_dir!r}")
    frames = [load_sample(os.path.join(data_dir, name)) for name in sample_files]
    return pd.concat(frames, axis=0)

def load_data(data_dir):
    """Return the combined feature frame for one dataset folder.

    Thin alias for ``load_all_samples``: ``data_dir`` is forwarded unchanged
    and the result is returned as-is.
    """
    combined = load_all_samples(data_dir)
    return combined

In [100]:
# Build one frame per sub-folder of ../dataset and stack them into one array.
# Folder names are sorted because os.listdir returns entries in arbitrary
# order; without sorting, which folder sits at which index along axis 0 of
# master_data could change between machines or runs.
data = []
for folder in sorted(os.listdir('../dataset')):
    folder_path = os.path.join('../dataset', folder)
    # Plain files stored next to the folders (e.g. feature_names.txt) are
    # not sample folders.
    if os.path.isfile(folder_path):
        continue
    data.append(load_data(folder_path))
# np.stack requires every folder to contribute a frame of identical shape.
master_data = np.stack(data, axis=0)

In [101]:
# Sanity check on the stacked array: one entry along axis 0 per loaded folder,
# then that folder's rows and columns.
print(master_data.shape)

(6, 20000, 548)


In [102]:
# Collapse the first two axes of master_data so that every row of every
# folder becomes one row of a single 2-D table.
master_data_pd = pd.DataFrame(
    master_data.reshape(master_data.shape[0] * master_data.shape[1], -1)
)

# Column labels come from feature_names.txt, one name per line.
with open('../dataset/feature_names.txt', 'r') as f:
    feature_names = f.read().splitlines()

# pandas raises ValueError here if the number of names does not match the
# number of columns.
master_data_pd.columns = feature_names

In [103]:
# Preview the first five rows of the labelled feature table.
master_data_pd.head()

Unnamed: 0,zcr_mean,zcr_std,yin_0,yin_1,yin_2,yin_3,yin_4,yin_5,yin_6,yin_7,...,cln_contrast_mean_4,cln_contrast_mean_5,cln_contrast_mean_6,cln_contrast_std_0,cln_contrast_std_1,cln_contrast_std_2,cln_contrast_std_3,cln_contrast_std_4,cln_contrast_std_5,cln_contrast_std_6
0,0.166783,0.07944,490.957062,491.752991,488.983582,485.077576,469.738678,513.687073,512.657349,245.042892,...,13.432678,13.885087,17.060001,6.169077,3.001384,3.211213,4.549636,3.261581,3.175519,1.371739
1,0.24142,0.02663,467.413574,516.481689,515.562134,514.210266,509.392456,510.750336,511.419983,511.064209,...,11.027208,14.805115,17.158812,4.463205,3.956305,1.96252,2.206298,2.37836,1.921748,2.072176
2,0.264509,0.02355,511.845062,508.178223,509.490753,509.445099,513.549133,512.35553,514.466064,515.264587,...,12.740061,13.21579,16.725847,1.774137,3.76616,2.463165,1.830873,3.058117,1.740199,1.977649
3,0.29541,0.016382,514.2547,512.569824,4576.394043,4567.560547,4576.487793,4655.340332,4689.938477,4697.403809,...,11.951676,13.193249,16.734816,3.23022,3.042296,3.633888,2.285815,2.923266,1.529872,1.633861
4,0.292899,0.026179,462.359528,4722.730469,4718.769043,4696.359375,4698.638672,4731.375488,4714.80957,4737.364258,...,10.747338,12.740884,16.750946,3.083272,2.659685,3.981223,3.348599,2.550901,1.909587,1.636644


In [104]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
master_data_pd.describe()

Unnamed: 0,zcr_mean,zcr_std,yin_0,yin_1,yin_2,yin_3,yin_4,yin_5,yin_6,yin_7,...,cln_contrast_mean_4,cln_contrast_mean_5,cln_contrast_mean_6,cln_contrast_std_0,cln_contrast_std_1,cln_contrast_std_2,cln_contrast_std_3,cln_contrast_std_4,cln_contrast_std_5,cln_contrast_std_6
count,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,...,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0,120000.0
mean,0.130285,0.023294,1763.902222,1750.269897,1740.552002,1744.570435,1740.652222,1738.122681,1746.456055,1743.41626,...,13.084869,14.375112,16.739674,3.176971,2.618193,2.780379,3.070741,2.875529,2.219403,1.737573
std,0.11235,0.025121,2760.121582,2738.184326,2731.456543,2732.496338,2730.286133,2726.077148,2734.907227,2735.542725,...,2.496332,2.843038,3.988505,1.211233,0.722041,0.78283,1.097991,0.876289,0.793816,1.202259
min,0.0,0.0,99.773758,99.773758,99.773758,99.773758,99.773758,99.773758,99.773758,99.773758,...,6.629853,6.629853,6.629853,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.045619,0.008031,194.957733,193.84877,192.016838,191.566219,191.436676,190.203506,192.186684,192.137497,...,11.812891,12.581925,14.09956,2.339905,2.113249,2.24059,2.355032,2.284546,1.713805,1.028646
50%,0.097447,0.01543,511.840607,512.085052,510.567902,511.650146,510.045486,510.735931,510.970535,510.724991,...,12.479323,13.399871,15.328071,2.984243,2.530173,2.678236,2.850663,2.740575,2.057766,1.333858
75%,0.194824,0.028841,2174.313599,2157.410767,2124.02124,2145.046265,2137.107971,2140.767761,2152.302185,2135.778015,...,13.420479,15.246063,17.977098,3.786228,3.0252,3.200193,3.498945,3.301398,2.521881,2.039211
max,0.794364,0.32145,11025.0,11025.0,11025.0,11025.0,11025.0,11025.0,11025.0,11025.0,...,47.035965,48.849712,62.293606,19.133646,15.536793,14.77196,13.534113,14.318196,14.467334,22.148211


In [105]:
# Report the index range, column count, dtypes and memory footprint.
master_data_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Columns: 548 entries, zcr_mean to cln_contrast_std_6
dtypes: float32(548)
memory usage: 250.9 MB


In [106]:
# Prints True if at least one cell anywhere in the table is missing,
# False if the table has no missing values.
print(f'Null check: {master_data_pd.isnull().values.any()}')


Null check: False
