In [25]:
import synapseclient
import pandas as pd
import numpy as np
import sklearn
import json
# Seed NumPy's global random generator so NumPy-based randomness in this notebook is repeatable.
np.random.seed(42)

In [125]:
from sklearn.model_selection import train_test_split

In [26]:
# Open a Synapse session. No credentials are passed here, so the client has to
# locate them itself (presumably from a local config or cache — confirm).
syn = synapseclient.login()

Welcome, Hemanth Kumar Tirupati!



In [27]:
# Synapse table IDs used by the queries in this notebook.
WALKING_TABLE = 'syn5511449'  # walking-activity records (accel / deviceMotion / pedometer file handles)
DEMG_TABLE = 'syn5511429'  # demographics survey (age, professional-diagnosis, ...)

In [28]:
# Fetch at most 5000 rows of the walking table; this same query result is reused
# further down to download the JSON files attached to its columns.
wk_query = syn.tableQuery(f'SELECT * FROM {WALKING_TABLE} limit 5000')

In [29]:
# Materialise the walking query and list every column it returned.
wk_df = wk_query.asDataFrame()
print(wk_df.columns)

# One selection step: keep only the participant id, the three deviceMotion file
# handles and the medication timepoint, and drop every record taken right
# after medication (rows with a missing timepoint are kept).
wk_df = wk_df.loc[
    wk_df['medTimepoint'].ne('Just after Parkinson medication (at your best)'),
    [
        'healthCode',
        'deviceMotion_walking_outbound.json.items',
        'deviceMotion_walking_return.json.items',
        'deviceMotion_walking_rest.json.items',
        'medTimepoint',
    ],
]
wk_df.shape

Index(['recordId', 'healthCode', 'createdOn', 'appVersion', 'phoneInfo',
       'accel_walking_outbound.json.items',
       'deviceMotion_walking_outbound.json.items',
       'pedometer_walking_outbound.json.items',
       'accel_walking_return.json.items',
       'deviceMotion_walking_return.json.items',
       'pedometer_walking_return.json.items', 'accel_walking_rest.json.items',
       'deviceMotion_walking_rest.json.items', 'medTimepoint'],
      dtype='object')


(4696, 5)

In [30]:
# Query each diagnosis class separately, capped at 1000 rows apiece, so the
# combined demographics sample holds at most 1000 rows per class.
dgtrue_query = syn.tableQuery(f'SELECT * FROM {DEMG_TABLE} WHERE "professional-diagnosis" IS True limit 1000')
dgfalse_query = syn.tableQuery(f'SELECT * FROM {DEMG_TABLE} WHERE "professional-diagnosis" IS False limit 1000')

In [31]:
# Materialise both query results and stack them into one demographics frame.
# The original row labels are kept (concat is called without ignore_index).
dgtrue_df = dgtrue_query.asDataFrame()
dgfalse_df = dgfalse_query.asDataFrame()
dg_df = pd.concat([dgtrue_df, dgfalse_df])

In [32]:
# Show everything the demographics table offers before narrowing it down.
print(dg_df.columns)

# Keep only the join key, the age and the diagnosis label.
dg_df = dg_df.loc[:, ['healthCode', 'age', 'professional-diagnosis']]
print(dg_df.head())
print(dg_df.shape)

# Class balance of the demographics sample (displayed as the cell result).
dg_df.loc[:, 'professional-diagnosis'].value_counts()

Index(['recordId', 'healthCode', 'createdOn', 'appVersion', 'phoneInfo', 'age',
       'are-caretaker', 'deep-brain-stimulation', 'diagnosis-year',
       'education', 'employment', 'gender', 'health-history',
       'healthcare-provider', 'home-usage', 'last-smoked', 'maritalStatus',
       'medical-usage', 'medical-usage-yesterday', 'medication-start-year',
       'onset-year', 'packs-per-day', 'past-participation', 'phone-usage',
       'professional-diagnosis', 'race', 'smartphone', 'smoked', 'surgery',
       'video-usage', 'years-smoking'],
      dtype='object')
                                    healthCode   age  professional-diagnosis
20597_21  2257e233-5815-4211-b0f2-7f135d1604b0  65.0                    True
20602_21  4a318f75-f290-44cf-9bb3-5080e546b13f  59.0                    True
20604_21  84cdec84-5148-47d3-822d-b829ef24d11f  69.0                    True
20607_21  9d478389-f02d-47a0-bb1e-c226bac23030  43.0                    True
20615_21  945bb372-bfbe-4ef0-87d9-d268bc

professional-diagnosis
True     1000
False    1000
Name: count, dtype: int64

In [304]:
MAX_PER_CLASS = 1000  # upper bound on rows kept for each diagnosis class

print(dg_df.shape, wk_df.shape)

# Attach every walking record to its participant's demographics. The inner join
# keeps only participants present in both tables; rows with any missing value
# are then dropped.
pd_df = pd.merge(dg_df, wk_df, on='healthCode', how='inner').dropna()
print(pd_df.shape)
print(pd_df.head())

# Class balance before capping. This used to be a bare expression in the middle
# of the cell, whose result is silently discarded (only a cell's last expression
# is displayed), so it is printed explicitly now.
print(pd_df['professional-diagnosis'].value_counts())

# NOTE(review): these are the FIRST rows of each class, not a random sample.
# Because one participant contributes many consecutive rows, each class may be
# dominated by a handful of participants — consider .sample(..., random_state=...).
true_sample = pd_df[pd_df['professional-diagnosis'] == True].head(MAX_PER_CLASS)
false_sample = pd_df[pd_df['professional-diagnosis'] == False].head(MAX_PER_CLASS)


# Concatenate the sampled values
pd_df = pd.concat([true_sample, false_sample])
print(pd_df.shape)
print(pd_df['professional-diagnosis'].value_counts())

(2000, 3) (4696, 5)
(2140, 7)
                             healthCode   age  professional-diagnosis  \
0  2257e233-5815-4211-b0f2-7f135d1604b0  65.0                    True   
1  2257e233-5815-4211-b0f2-7f135d1604b0  65.0                    True   
2  2257e233-5815-4211-b0f2-7f135d1604b0  65.0                    True   
3  2257e233-5815-4211-b0f2-7f135d1604b0  65.0                    True   
4  2257e233-5815-4211-b0f2-7f135d1604b0  65.0                    True   

   deviceMotion_walking_outbound.json.items  \
0                                   5400565   
1                                   5476404   
2                                   5481901   
3                                   5444867   
4                                   5653760   

   deviceMotion_walking_return.json.items  \
0                                 5400615   
1                                 5476689   
2                                 5482218   
3                                 5445148   
4                      

In [35]:
# Download (or reuse from the local cache) the JSON files referenced by each of
# the three deviceMotion columns. Each call yields the mapping that is later
# used to look up a local path by file handle id.
fileMap_walk_out, fileMap_walk_in, fileMap_walk_rest = (
    syn.downloadTableColumns(wk_query, [column_name])
    for column_name in (
        'deviceMotion_walking_outbound.json.items',
        'deviceMotion_walking_return.json.items',
        'deviceMotion_walking_rest.json.items',
    )
)

Downloading 0 files, 5000 cached locally
Downloading 0 files, 5000 cached locally
Downloading 0 files, 5000 cached locally


In [36]:
def read_json_file(file_handle_id, fileMap):
    """Load the JSON document that belongs to a file handle id.

    Parameters
    ----------
    file_handle_id : int, float or str
        Identifier to look up. Whole-number floats (e.g. ``5400565.0``, which
        is what pandas yields for an id column that ever contained NaN) are
        treated like the corresponding integer.
    fileMap : mapping of str -> path
        Maps the string form of a file handle id to a local file path.

    Returns
    -------
    The parsed JSON content, or ``None`` when the id is missing (``None``/NaN)
    or has no entry in ``fileMap``.
    """
    if file_handle_id is None:
        return None
    if isinstance(file_handle_id, float):
        # NaN is the only float that differs from itself: a missing id.
        if file_handle_id != file_handle_id:
            return None
        # str(5400565.0) == '5400565.0' would never match the map's keys.
        if file_handle_id.is_integer():
            file_handle_id = int(file_handle_id)

    path = fileMap.get(str(file_handle_id))
    if path is None:
        return None

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [103]:
def accel_array_padded(data):
    """Extract the user-acceleration samples of a recording as an ``(n, 3)`` array.

    Parameters
    ----------
    data : iterable of dict
        Records that each hold a ``'userAcceleration'`` dict with ``'x'``,
        ``'y'`` and ``'z'`` entries.

    Returns
    -------
    numpy.ndarray
        Float array with one row per record and the columns in x, y, z order.
        An empty input gives shape ``(0, 3)``. Despite the function's name, no
        padding is applied here.
    """
    axes = ('x', 'y', 'z')
    # Read each axis by key: the key order inside the JSON dicts is not
    # guaranteed, so dict.values() could silently permute the columns.
    rows = [
        [record['userAcceleration'][axis] for axis in axes]
        for record in data
    ]
    # reshape keeps the result 2-D even when there are no records at all.
    return np.asarray(rows, dtype=float).reshape(-1, len(axes))

In [75]:
# Smoke test on a single rest recording: load it, convert it to an array, then
# inspect the array shape and the first raw sample.
data = read_json_file(5654332, fileMap_walk_rest)
ls = accel_array_padded(data)
print(ls.shape)
data[0]

(3001, 3)


{'attitude': {'y': 0.02805523755830615,
  'w': 0.6448983010414894,
  'z': 0.003125350360080418,
  'x': -0.7637468933779529},
 'timestamp': 2593.75614725,
 'rotationRate': {'x': 0.1270270943641663,
  'y': -1.846529006958008,
  'z': -0.2416433840990067},
 'userAcceleration': {'x': 0.001780364778824151,
  'y': 0.05194719135761261,
  'z': 0.009953535161912441},
 'gravity': {'x': 0.04095950350165367,
  'y': 0.9849027991294861,
  'z': 0.1681928336620331},
 'magneticField': {'y': 0, 'z': 0, 'x': 0, 'accuracy': -1}}

In [121]:
SEQ_LEN = 4000  # fixed number of time steps per example; shorter recordings are zero-padded

X = []
y = []
n_skipped = 0

for idx, row in pd_df.iterrows():
    fileId = row['deviceMotion_walking_outbound.json.items']
    label = row['professional-diagnosis']
    data = read_json_file(fileId, fileMap_walk_out)
    # No local file for this handle, or a recording without samples: skip the
    # row entirely so X and y stay aligned instead of crashing mid-loop.
    if not data:
        n_skipped += 1
        continue
    # Clip over-long recordings first: np.pad rejects negative pad widths.
    ts = accel_array_padded(data)[:SEQ_LEN]
    ts = np.pad(ts, ((0, SEQ_LEN - ts.shape[0]), (0, 0)))
    X.append(ts)
    # Encode the diagnosis as 1 (diagnosed) / 0 (not diagnosed).
    y.append(1 if label else 0)

if n_skipped:
    print(f'Skipped {n_skipped} record(s) with missing or empty motion data')

In [122]:
# Stack the per-record arrays and labels into NumPy arrays; every record array
# was padded to the same length, so X_np is 3-D (see the shapes printed below).
X_np = np.asarray(X)
y_np = np.asarray(y)

In [131]:
# Hold out 10% of the examples for testing (shuffled, fixed seed).
# NOTE(review): the split is per row, while pd_df holds several rows per
# healthCode — the same participant may end up in both train and test. Consider
# a group-aware split on healthCode if that leakage matters.
X_train, X_test, y_train, y_test = train_test_split(X_np, y_np, test_size=0.1, random_state=42, shuffle=True)

In [132]:
# Sanity check: sizes of the train and test partitions.
print(X_train.shape, X_test.shape)

(1789, 4000, 3) (199, 4000, 3)


In [137]:
# Persist the four split arrays as .npy files in the current working directory.
for _target, _array in (
    ('X_train.npy', X_train),
    ('X_test.npy', X_test),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(_target, _array)