In [0]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

%matplotlib inline

In [0]:
# Load the CIS-PD label table and keep only the rows that have no missing value
# in any column.
LABELS_CSV = '/content/drive/My Drive/Beat-PD/Train_Dataset/Data_Train_CIS-PD/Data_Train_CIS-PD/cis-pd.data_labels/data_labels/CIS-PD_Training_Data_IDs_Labels.csv'
label_df = pd.read_csv(LABELS_CSV).dropna(axis=0)
label_df.head()

Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor
0,cc7b822c-e310-46f0-a8ea-98c95fdb67a1,1004,1.0,1.0,1.0
1,5163afe8-a6b0-4ea4-b2ba-9b4501dd5912,1004,0.0,0.0,0.0
2,5cf68c8e-0b7a-4b73-ad4f-015c7a20fb5a,1004,1.0,1.0,1.0
3,fb188ae2-2173-4137-9236-19a137a402c2,1004,3.0,3.0,3.0
4,19a3e9ea-fce1-40b7-9457-2618970beb7b,1004,1.0,1.0,1.0


# Split Dataset

In [0]:
# Ordered hold-out: with shuffle=False the first 80% of the label rows become
# the training split and the remaining 20% the test split.
train_df, test_df = train_test_split(
    label_df,
    test_size=0.2,
    shuffle=False,
)
train_df.head()

Unnamed: 0,measurement_id,subject_id,on_off,dyskinesia,tremor
0,cc7b822c-e310-46f0-a8ea-98c95fdb67a1,1004,1.0,1.0,1.0
1,5163afe8-a6b0-4ea4-b2ba-9b4501dd5912,1004,0.0,0.0,0.0
2,5cf68c8e-0b7a-4b73-ad4f-015c7a20fb5a,1004,1.0,1.0,1.0
3,fb188ae2-2173-4137-9236-19a137a402c2,1004,3.0,3.0,3.0
4,19a3e9ea-fce1-40b7-9457-2618970beb7b,1004,1.0,1.0,1.0


# Get Train-Test dataset

In [0]:
TRAIN_PATH = '/content/drive/My Drive/Beat-PD/Train_Dataset/Data_Train_CIS-PD/Data_Train_CIS-PD/cis-pd.training_data/training_data/'


def get_data(i, row, train_path):
    """Load one recording and pair its magnitude signal with its labels.

    Parameters
    ----------
    i : any
        Unused. Accepted so the function can be fed the ``(index, row)``
        pairs produced by ``DataFrame.iterrows()``.
    row : pandas.Series
        One row of the label table; must provide ``measurement_id``,
        ``subject_id``, ``on_off``, ``dyskinesia`` and ``tremor``.
    train_path : str
        Directory prefix of the recordings. It is concatenated as-is with
        the measurement id, so it has to end with a path separator.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` holds the ``Timestamp`` column of the recording and ``3D``,
        the Euclidean norm of its ``X``/``Y``/``Z`` columns. ``y`` is
        ``row`` restricted to the subject id and the three label columns.
    """
    recording = pd.read_csv(train_path + row['measurement_id'] + '.csv')

    squared_norm = recording['X'] ** 2 + recording['Y'] ** 2 + recording['Z'] ** 2
    X = pd.DataFrame({
        'Timestamp': recording['Timestamp'],
        '3D': squared_norm ** 0.5,
    })
    y = row[['subject_id', 'on_off', 'dyskinesia', 'tremor']]
    return X, y

In [0]:
# Load every training recording together with its labels.
# iterrows() is a generator with no length, so `total` is passed explicitly
# to let tqdm display a percentage and a remaining-time estimate.
X_train = []
y_train = []
for i, row in tqdm(train_df.iterrows(), total=len(train_df)):
    X, y = get_data(i, row, TRAIN_PATH)
    X_train.append(X)
    y_train.append(y)


770it [00:48, 15.98it/s]


In [0]:
# Load every held-out recording together with its labels. The files live in
# the same directory as the training recordings, hence TRAIN_PATH.
# iterrows() is a generator with no length, so `total` is passed explicitly
# to let tqdm display a percentage and a remaining-time estimate.
X_test = []
y_test = []
for i, row in tqdm(test_df.iterrows(), total=len(test_df)):
    X, y = get_data(i, row, TRAIN_PATH)
    X_test.append(X)
    y_test.append(y)

193it [00:11, 16.34it/s]


In [0]:
# Sanity check: number of recordings loaded into each split.
n_train, n_test = len(X_train), len(X_test)
print('size of train_data: {}; size of test_data: {}'.format(n_train, n_test))

size of train_data: 770; size of test_data: 193


# Detect and remove Noise using DBSCAN

In [0]:
# Hyper-parameters shared by every DBSCAN fit in this notebook.
# NOTE(review): DBSCAN is fitted on the raw (Timestamp, 3D) columns without
# scaling, so `epsilon` is expressed in those mixed units — confirm the value.
epsilon = 0.8 # for DBSCAN: passed as `eps`
min_samples = 5 # for DBSCAN: passed as `min_samples`

In [0]:
def remove_noise(X_df):
    """Keep only the rows that DBSCAN classifies as core samples.

    DBSCAN is fitted on every column of ``X_df`` using the module-level
    ``epsilon`` and ``min_samples`` settings.

    Parameters
    ----------
    X_df : pandas.DataFrame
        One recording (numeric columns only).

    Returns
    -------
    pandas.DataFrame
        The core-sample rows of ``X_df`` in their original order, with a
        fresh 0..n-1 index so that positional and label-based slicing agree
        for downstream feature extraction.
    """
    db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X_df)

    # core_sample_indices_ are row *positions* in the fitted data, so they are
    # applied with iloc; this stays correct even if X_df has a non-default index.
    # NOTE(review): border points (clustered but not core) are dropped together
    # with the noise points — confirm this is intended.
    X_df_removed = X_df.iloc[db.core_sample_indices_]

    # reset_index returns a new frame; the result must be used, and drop=True
    # avoids adding the old labels as an extra 'index' column.
    return X_df_removed.reset_index(drop=True)

In [0]:
def view_DBSCAN(X_df, y):
    """Scatter-plot one recording coloured by its DBSCAN cluster label.

    DBSCAN is fitted on every column of ``X_df`` with the module-level
    ``epsilon`` and ``min_samples`` settings. Each cluster gets its own
    colour from the Spectral colormap; points labelled ``-1`` (noise) are
    drawn in black.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Recording with ``Timestamp`` and ``3D`` columns.
    y : any
        Labels of the recording; only printed above the chart.
    """
    print(y)
    db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X_df)
    labels = db.labels_

    # Boolean mask over rows: True where the row is a core sample.
    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True

    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

    plt.figure(figsize=(20, 5))

    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        class_member_mask = (labels == k)

        # Core samples of this label.
        xy = X_df.loc[class_member_mask & core_samples_mask]
        plt.scatter(xy['Timestamp'], xy['3D'], s=50, c=[col], marker=u'.', alpha=1)

        # Non-core members of this label: border points of a cluster, or the
        # noise points when k == -1. They are drawn with the same marker style.
        xy = X_df.loc[class_member_mask & ~core_samples_mask]
        plt.scatter(xy['Timestamp'], xy['3D'], s=50, c=[col], marker=u'.', alpha=1)

    plt.xlabel('Timestamp')
    plt.ylabel('3D')
    plt.show()

View some charts and compare the data before and after noise removal.

In [0]:
# kk = 0
# view_DBSCAN(X_train[kk], y_train[kk])

In [0]:
# vs = remove_noise(X_train[kk])
# view_DBSCAN(vs, y_train[kk])

Now it is time to remove the noise from every recording.

In [0]:
# Replace each training recording by its de-noised version, slot by slot, so
# the original frame can be released as soon as its replacement is stored.
for position, recording in enumerate(tqdm(X_train)):
    X_train[position] = remove_noise(recording)

100%|██████████| 770/770 [1:07:39<00:00,  4.97s/it]


In [0]:
# Replace each held-out recording by its de-noised version, slot by slot, so
# the original frame can be released as soon as its replacement is stored.
for position, recording in enumerate(tqdm(X_test)):
    X_test[position] = remove_noise(recording)

100%|██████████| 193/193 [16:13<00:00,  5.45s/it]


# Extract Features

## Get mean and std of the first 50 and last 50 items of each record.

In [0]:
def get_mean_std_head_tail(X_df):
    """Mean and population std of the first and the last 50 ``3D`` values.

    Rows are selected by position, so the result does not depend on the
    index labels of ``X_df`` (e.g. labels with gaps left after rows were
    filtered out). If the record has fewer than 50 rows, both slices simply
    contain every available row.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Recording with a ``3D`` column.

    Returns
    -------
    tuple
        ``(mean_head, std_head, mean_tail, std_tail)``; std uses ``ddof=0``.
    """
    signal = X_df['3D']
    head = signal.iloc[:50]
    tail = signal.iloc[-50:]
    return head.mean(), head.std(ddof=0), tail.mean(), tail.std(ddof=0)

## Get mean and std of the 8 smallest values.

In [0]:
def get_mean_std_minvalues(X_df, n_values=8):
    """Mean and population std of the smallest ``3D`` values of a record.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Recording with a ``3D`` column.
    n_values : int, optional
        How many of the smallest values to summarise (default 8). If the
        record has fewer rows, all of them are used.

    Returns
    -------
    tuple
        ``(mean, std)`` of the selected values; std uses ``ddof=0``.
    """
    smallest = np.sort(np.asarray(X_df['3D']))[:n_values]
    return smallest.mean(), smallest.std()

(0.5071053246678111, 0.04574199564580611)

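## Get mean & std of the sliding windows with minimum/maximum spread (this heading originally said "2s (100 items)"; the code below uses a window of `RANGER = 200` items)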


Because scanning a whole recording takes too long, only the rows from 0 to min(len_X_df, 20 000) are scanned.

In [0]:
RANGER = 200
def get_minmax_std_values(X_df, ranger, max_rows=20000):
    """Find the sliding windows of ``3D`` with the largest and smallest std.

    Every window of ``ranger`` consecutive rows (by position) that ends
    within the first ``max_rows`` rows is evaluated. Ties keep the earliest
    window because the comparisons are strict.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Recording with a ``3D`` column.
    ranger : int
        Window length in rows; must be at least 1.
    max_rows : int, optional
        Only the first ``max_rows`` rows are scanned (default 20000) to
        bound the running time.

    Returns
    -------
    tuple
        ``(max_mean, max_std, min_mean, min_std)``: mean and population std
        of the window with the largest std, then of the window with the
        smallest std. The starting values ``(0, 0, 10, 10)`` are returned
        unchanged for a side that no window improves on, e.g. when the
        record is shorter than ``ranger``.

    Raises
    ------
    ValueError
        If ``ranger`` is smaller than 1.
    """
    if ranger < 1:
        raise ValueError('ranger must be >= 1, got {}'.format(ranger))

    # Work on a plain array: slicing is positional (independent of the index
    # labels of X_df) and far cheaper than slicing a Series per window.
    # A window containing NaN yields a NaN std and is never selected.
    values = np.asarray(X_df['3D'], dtype=float)

    max_mean, max_std, min_mean, min_std = 0, 0, 10, 10 # init values
    for end in range(ranger - 1, min(len(values), max_rows)):
        window = values[end - ranger + 1:end + 1]
        mean_, std_ = window.mean(), window.std()

        if std_ > max_std:
            max_mean = mean_
            max_std = std_

        if std_ < min_std:
            min_mean = mean_
            min_std = std_
    return max_mean, max_std, min_mean, min_std

## Extract features to DataFrame

In [0]:
def get_X_features(X_df):
    """Build the feature table for a list of recordings.

    Parameters
    ----------
    X_df : list of pandas.DataFrame
        One frame per recording, each with a ``3D`` column.

    Returns
    -------
    pandas.DataFrame
        One row per recording (index 0..n-1, same order as the input) with
        the columns ``mean_head``, ``std_head``, ``mean_tail``, ``std_tail``,
        ``mean_minvalues``, ``std_minvalues``, ``mean_maxwidth``,
        ``std_maxwidth``, ``mean_minwidth`` and ``std_minwidth``.
    """
    feature_names = [
        'mean_head', 'std_head', 'mean_tail', 'std_tail',
        'mean_minvalues', 'std_minvalues',
        'mean_maxwidth', 'std_maxwidth', 'mean_minwidth', 'std_minwidth',
    ]

    # Collect plain rows and build the frame once at the end instead of
    # enlarging a DataFrame with .loc on every iteration.
    rows = []
    for recording in tqdm(X_df):
        head_tail = get_mean_std_head_tail(recording)
        min_values = get_mean_std_minvalues(recording)
        window_stats = get_minmax_std_values(recording, RANGER)
        rows.append([*head_tail, *min_values, *window_stats])

    return pd.DataFrame(rows, columns=feature_names)

In [0]:
# Feature table for the training recordings (one row per recording).
X_features_train = get_X_features(X_train)

In [0]:
# Feature table for the held-out recordings (one row per recording).
X_features_test = get_X_features(X_test)

100%|██████████| 193/193 [16:53<00:00,  5.15s/it]


# Save to csv

In [0]:
# Persist the training feature table to Drive.
X_features_train.to_csv('/content/drive/My Drive/Beat-PD/DecisionTree/PreProcessing.csv')

In [0]:
# Persist the held-out feature table to Drive.
X_features_test.to_csv('/content/drive/My Drive/Beat-PD/DecisionTree/Test_preproc.csv')

In [0]:
# Stack the per-recording label rows into a 2-D array, then rebuild a frame
# column by column so the saved table gets a fresh 0..n-1 index.
train_label_columns = ["subject_id", "on_off", "dyskinesia", "tremor"]
y_train_np = np.array(y_train)
y_train_df = pd.DataFrame({
    column: y_train_np[:, position]
    for position, column in enumerate(train_label_columns)
})
y_train_df.to_csv('/content/drive/My Drive/Beat-PD/DecisionTree/PreProcessing_labels.csv')

In [0]:
# Stack the per-recording label rows into a 2-D array, then rebuild a frame
# column by column so the saved table gets a fresh 0..n-1 index.
test_label_columns = ["subject_id", "on_off", "dyskinesia", "tremor"]
y_test_np = np.array(y_test)
y_test_df = pd.DataFrame({
    column: y_test_np[:, position]
    for position, column in enumerate(test_label_columns)
})
y_test_df.to_csv('/content/drive/My Drive/Beat-PD/DecisionTree/Test_preproc_labels.csv')