## dev pain questionnaire

In [1]:
import os, sys
import numpy as np
import pandas as pd

#### Check timing: imaging visit vs. pain questionnaire

In [2]:
# Root directory of the pain-questionnaire exports. The original cluster path
# stays the default; set PAIN_QUESTIONNAIRE_DIR to point the notebook at a
# different copy of the data without editing this cell.
data_dir = os.environ.get(
    'PAIN_QUESTIONNAIRE_DIR',
    '/vols/Data/pain/asdahl/uk_biobank/suyi_extend/pain_questionnaire',
)

In [28]:
# Load the pain-questionnaire table (tab-separated) from data_dir.
pq_filename = 'pain_questionnaire_without_condition.tsv'
pq_path = os.path.join(data_dir, pq_filename)
df_pq = pd.read_csv(pq_path, delimiter='\t')

In [29]:
df_pq.shape

(26485, 148)

In [30]:
df_pq.columns

Index(['eid', '120000-0.0', '120001-0.0', '120002-0.0', '120003-0.0',
       '120004-0.0', '120005-0.0', '120006-0.0', '120007-0.0', '120008-0.0',
       ...
       '120119-0.0', '120120-0.0', '120121-0.0', '120122-0.0', '120123-0.0',
       '120124-0.0', '120125-0.0', '120126-0.0', '120127-0.0', '120128-0.0'],
      dtype='object', length=148)

In [32]:
# Index the questionnaire table by subject id. The guard keeps this cell
# re-runnable: once 'eid' has become the index it is no longer a column, and
# an unguarded second set_index('eid') would raise KeyError.
if 'eid' in df_pq.columns:
    df_pq = df_pq.set_index('eid')

In [58]:
df_pq['120128-0.0']

eid
1000025    2019-01-31T15:40:44
1000108                    NaN
1000149    2019-05-23T17:08:51
1000441    2019-02-27T08:53:33
1000755    2019-02-07T18:48:26
                  ...         
6023887                    NaN
6023923    2019-02-08T17:15:42
6024480                    NaN
6024739    2019-01-30T17:42:58
6025233    2019-02-17T20:07:25
Name: 120128-0.0, Length: 26485, dtype: object

In [59]:
# Parse the string timestamps in column 120128-0.0 into datetimes; missing
# entries (NaN) become NaT.
# NOTE(review): presumably this is the pain-questionnaire completion time — confirm.
df_pq_time = pd.to_datetime(df_pq['120128-0.0'])

In [60]:
df_pq_time

eid
1000025   2019-01-31 15:40:44
1000108                   NaT
1000149   2019-05-23 17:08:51
1000441   2019-02-27 08:53:33
1000755   2019-02-07 18:48:26
                  ...        
6023887                   NaT
6023923   2019-02-08 17:15:42
6024480                   NaT
6024739   2019-01-30 17:42:58
6025233   2019-02-17 20:07:25
Name: 120128-0.0, Length: 26485, dtype: datetime64[ns]

In [61]:
idp_path = os.path.join(data_dir, 'qsidp_subjs_control_allvisits_extended.tsv')
# This very wide table (thousands of columns) emitted a warning on load in an
# earlier run — presumably pandas' mixed-type DtypeWarning (TODO confirm).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
df_idp = pd.read_csv(idp_path, sep='\t', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [62]:
# Index the imaging table by subject id. The guard keeps this cell
# re-runnable: when 'eid' is already the index and no longer a column, an
# unguarded set_index('eid') would raise KeyError.
if 'eid' in df_idp.columns:
    df_idp = df_idp.set_index('eid')

In [63]:
df_idp.shape

(26490, 2860)

In [64]:
# Parse column 53-2.0 into datetimes (the values are dates with no time of day).
# NOTE(review): presumably the imaging-visit date — confirm against the data dictionary.
df_img_time = pd.to_datetime(df_idp['53-2.0'])

In [65]:
df_img_time

eid
1000025   2019-09-17
1000108   2019-07-16
1000149   2018-08-06
1000441   2019-09-12
1000755   2018-12-13
             ...    
6023887   2016-05-17
6023923   2019-04-28
6024480   2018-06-25
6024739   2018-03-22
6025233   2019-08-09
Name: 53-2.0, Length: 26490, dtype: datetime64[ns]

In [68]:
# Place the imaging and questionnaire timestamps side by side, aligned on eid.
# Subjects missing from one source get NaT in that column.
df_time = pd.concat([df_img_time, df_pq_time], axis=1)

In [69]:
df_time.shape

(26490, 2)

In [73]:
# Keep only subjects that have both timestamps.
df_tn = df_time.dropna()

In [74]:
df_tn.shape

(21634, 2)

In [75]:
df_tn.columns

Index(['53-2.0', '120128-0.0'], dtype='object')

In [76]:
# calculate diff: questionnaire timestamp minus imaging date, per subject
pq_minus_img = df_tn['120128-0.0'] - df_tn['53-2.0']
diff = pq_minus_img.values
# view the timedelta64[ns] values as integer nanoseconds so the sign can be tested
dff = diff.astype('int64')

In [80]:
# subjects completing pq after imaging
# (the imaging column carries a date only, so a questionnaire filled in later
# on the imaging day also counts as "after")
df_valid = df_tn.loc[dff > 0]

In [81]:
df_valid.shape

(16788, 2)

#### check pain

In [82]:
from subject_select import cwp_positive

In [83]:
from clean_questions import disease_label

In [85]:
# find no pain
# Copy the eid index into a regular column before calling the helper; this adds
# an 'eid' column to df_idp in place.
# NOTE(review): presumably cwp_positive expects an 'eid' column, and
# positive=False selects subjects without pain — confirm in subject_select.
df_idp['eid'] = df_idp.index
df_idp_nopain = cwp_positive(df_idp, positive=False)

In [86]:
df_idp_nopain.shape

(19593, 2861)

In [87]:
# find no disease
# NOTE(review): disease_label comes from clean_questions; judging by the
# displayed result it returns one 0/1 indicator column per condition, indexed
# by eid — confirm the meaning of visits=[2] and grouping='detailed' there.
disease_status = disease_label(df_idp_nopain, visits=[2], grouping='detailed')

In [89]:
disease_status.head()

Unnamed: 0_level_0,irritable bowel syndrome,migraine,headaches (not migraine),back problem,cervical spondylosis,spine arthritis/spondylitis,prolapsed disc/slipped disc,disc problem,disc degeneration,back pain,ankylosing spondylitis,psoriatic arthropathy,rheumatoid arthritis,synovitis,osteoarthritis,joint pain,arthritis (nos),joint disorder,fibromyalgia,chronic fatigue syndrome
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1000025,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000108,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000149,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1000441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1000923,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [90]:
# eid of no pain and no disease: rows whose condition indicators sum to zero
has_no_disease = disease_status.sum(axis=1) == 0
nd_eid = disease_status.loc[has_no_disease].index

In [92]:
nd_eid

Int64Index([1000025, 1000108, 1000149, 1000923, 1001121, 1001143, 1001192,
            1001289, 1001632, 1001680,
            ...
            6022107, 6022257, 6022279, 6022913, 6023505, 6023724, 6023887,
            6023923, 6024739, 6025233],
           dtype='int64', name='eid', length=17173)

In [94]:
# eid of no pain, no disease, imaging before pain questionnaire
# (i.e. the subjects present in both nd_eid and df_valid)
eid_npd_valid = nd_eid.intersection(df_valid.index)

In [95]:
len(eid_npd_valid)

10859

In [97]:
df_time[df_time.index.isin(eid_npd_valid)].head()

Unnamed: 0_level_0,53-2.0,120128-0.0
eid,Unnamed: 1_level_1,Unnamed: 2_level_1
1000149,2018-08-06,2019-05-23 17:08:51
1001143,2016-01-30,2019-01-30 11:08:20
1001192,2017-07-25,2019-01-24 19:54:36
1001289,2017-08-18,2019-02-13 18:32:34
1002039,2015-05-08,2019-01-22 15:31:21


#### Check who reports pain at the time of the pain questionnaire

In [117]:
# 120019,Troubled by pain or discomfort present for more than 3 months
# Restrict the questionnaire table to the selected subjects. .copy() makes
# df_pq_valid an independent frame, so later column assignments on it (such as
# the integer cast of 120019-0.0) do not write into a slice of df_pq and do not
# raise SettingWithCopyWarning.
df_pq_valid = df_pq[df_pq.index.isin(eid_npd_valid)].copy()

In [121]:
# Cast the 120019-0.0 answers to integers.
# NOTE(review): astype(int) raises if the column contains NaN — this assumes
# every subject in this subset has an answer recorded.
df_pq_valid['120019-0.0'] = df_pq_valid['120019-0.0'].astype(int)

In [130]:
df_pq_valid.shape

(10859, 147)

In [137]:
np.unique(df_pq_valid['120019-0.0'].values)

array([-818, -121,    0,    1])

In [131]:
# those with chronic pain
# (vectorised count of True values rather than summing the Series element by element)
(df_pq_valid['120019-0.0'] == 1).sum()

3929

In [132]:
# those with no pain
# (vectorised count of True values rather than summing the Series element by element)
(df_pq_valid['120019-0.0'] == 0).sum()

6908

In [133]:
3929+6908

10837

In [149]:
# Keep only subjects with a definite answer to 120019 (1 = chronic pain,
# 0 = no pain); the other codes present in the column are dropped.
# .copy() makes eid_out an independent frame, so adding columns to it afterwards
# does not trigger SettingWithCopyWarning.
eid_out = df_pq_valid[df_pq_valid['120019-0.0'].isin([0, 1])].copy()

In [152]:
# Expose the eid index as a regular column so it can be exported by itself.
eid_out['eid'] = eid_out.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eid_out['eid'] = eid_out.index


In [154]:
# Write the selected subject ids as a single 'eid' column. index=False is the
# documented way to omit the index (which here would just duplicate the eid).
eid_out['eid'].to_csv('./subjs_bmrc/subjs_painquestion.csv', index=False)

#### check age/gender

In [188]:
# Demographic columns used for the age/gender check.
# NOTE(review): presumably 31-0.0 is sex and 21003-2.0 is age at the imaging
# visit — confirm against the data dictionary.
df_dem = df_idp[['31-0.0', '21003-2.0']]

In [195]:
# Demographics and timestamps for the exported subjects, joined column-wise on eid.
dem_selected = df_dem[df_dem.index.isin(eid_out.index)]
time_selected = df_time[df_time.index.isin(eid_out.index)]
df_demt = pd.concat([dem_selected, time_selected], axis=1)

In [217]:
df_demt.shape

(10837, 4)

In [197]:
# Drop any subject with a missing demographic or timestamp value.
df_demt_clean = df_demt.dropna()

In [198]:
df_demt_clean.shape

(10837, 4)

In [218]:
# Attach the 120019-0.0 answer as the label. The assignment aligns on the eid
# index, so the two frames do not need to share row order.
df_demt_clean['label'] = df_pq_valid['120019-0.0']

In [222]:
df_demt_clean['label'].value_counts()

0    6908
1    3929
Name: label, dtype: int64

In [223]:
# Export the labels; the eid index is written as the first column.
df_demt_clean['label'].to_csv('./subjs_bmrc/label_painquestion.csv')

In [209]:
# Elapsed time from imaging date to questionnaire timestamp, per subject.
elapsed = df_demt_clean['120128-0.0'] - df_demt_clean['53-2.0']
diff = elapsed.values
# Stored as integer nanoseconds (the raw int64 view of timedelta64[ns]).
df_demt_clean['diff'] = diff.astype('int64')

In [210]:
# Subjects who answered 1 (chronic pain) to 120019.
cp_eids = df_pq_valid.loc[df_pq_valid['120019-0.0'] == 1].index
df_cp = df_demt_clean.loc[df_demt_clean.index.isin(cp_eids)]

In [211]:
df_cp.shape

(3929, 5)

In [212]:
# Subjects who answered 0 (no pain) to 120019.
np_eids = df_pq_valid.loc[df_pq_valid['120019-0.0'] == 0].index
df_np = df_demt_clean.loc[df_demt_clean.index.isin(np_eids)]

In [213]:
df_np.shape

(6908, 5)

In [214]:
df_cp.describe()

Unnamed: 0,31-0.0,21003-2.0,diff
count,3929.0,3929.0,3929.0
mean,0.471876,63.574192,5.873035e+16
std,0.499272,7.215278,4.302101e+16
min,0.0,46.0,35348000000000.0
25%,0.0,58.0,2.095715e+16
50%,0.0,64.0,4.843038e+16
75%,1.0,69.0,9.481767e+16
max,1.0,82.0,1.697339e+17


In [216]:
df_np.describe()

Unnamed: 0,31-0.0,21003-2.0,diff
count,6908.0,6908.0,6908.0
mean,0.507817,63.32542,5.439153e+16
std,0.499975,7.359656,4.165322e+16
min,0.0,46.0,28982000000000.0
25%,0.0,58.0,1.847319e+16
50%,1.0,64.0,4.454021e+16
75%,1.0,69.0,8.529097e+16
max,1.0,80.0,1.653249e+17
