In [1]:
import pandas as pd
import numpy as np

## Importa dataset

In [2]:
dados_onda1 = pd.read_csv('BASES-ELSI/ELSI_Portugues_1a_onda.csv')

In [3]:
cognitive = {
    'temporal orientation': ['q7', 'q8', 'q9', 'q10'],
    'memory recall': ['q13'],
    'sematinc memory': ['q18', 'q19', 'q20', 'q21'],
    'verbal fluency': ['q14']
}

psychological = {
    'depression': ['r2', 'r3', 'r4', 'r5', 'r6', 'r7', 'r8', 'r9'],
    'sleep quality': ['n74', 'n75'],
}

sensorial = {
    'hearing deficit': ['n16'],
    'distance vision': ['n6'],
    'near vision': ['n7']
}

locomotor = {
    'gait speed': ['mf33', 'mf34', 'mf35', 'mf36', 'mf37', 'mf38'],
    'balance': ['mf30', 'mf31', 'mf32'] #held more then 10 seconds
}


vitality =  {
    'handgrip': ['mf27', 'mf28', 'mf29'], #media
    'peso': ['mf22', 'mf24'], # media do peso e pedo informada
    'altura': ['mf13', 'mf15'] # media da altura e altura  informada
}

## Clean data

### Cognitive

In [4]:
cognitive_dfs = []

#### temporal orientation

In [5]:
for q in cognitive['temporal orientation']:
    print(dados_onda1[q].value_counts())

1    6950
0    2134
8     328
Name: q7, dtype: int64
1    8105
0     979
8     328
Name: q8, dtype: int64
1    7723
0    1361
8     328
Name: q9, dtype: int64
1    8599
0     485
8     328
Name: q10, dtype: int64


In [6]:
cognitive_to = dados_onda1[cognitive['temporal orientation']].replace(8, np.nan)

for q in cognitive['temporal orientation']:
    print(cognitive_to[q].value_counts())


1.0    6950
0.0    2134
Name: q7, dtype: int64
1.0    8105
0.0     979
Name: q8, dtype: int64
1.0    7723
0.0    1361
Name: q9, dtype: int64
1.0    8599
0.0     485
Name: q10, dtype: int64


In [7]:
teporal_orientation = cognitive_to.apply(sum, axis=1)
teporal_orientation.name = 'temporal_orientation'
teporal_orientation

0       4.0
1       4.0
2       4.0
3       4.0
4       1.0
       ... 
9407    4.0
9408    3.0
9409    4.0
9410    4.0
9411    4.0
Name: temporal_orientation, Length: 9412, dtype: float64

In [8]:
cognitive_dfs.append(teporal_orientation)

#### memory recall

In [9]:
for q in cognitive['memory recall']:
    print(dados_onda1[q].value_counts())

4     2215
5     2027
3     1608
6     1284
2      787
7      488
88     328
1      295
0      184
8      160
9       27
10       9
Name: q13, dtype: int64


In [10]:
cognitive_mr = dados_onda1[cognitive['memory recall']].replace(88, np.nan)

for q in cognitive['memory recall']:
    print(cognitive_mr[q].value_counts().sort_values())


10.0       9
9.0       27
8.0      160
0.0      184
1.0      295
7.0      488
2.0      787
6.0     1284
3.0     1608
5.0     2027
4.0     2215
Name: q13, dtype: int64


In [11]:
memory_recall = cognitive_mr
memory_recall.columns = ['memory_recall']
memory_recall

Unnamed: 0,memory_recall
0,6.0
1,2.0
2,4.0
3,4.0
4,3.0
...,...
9407,7.0
9408,4.0
9409,3.0
9410,6.0


In [12]:
cognitive_dfs.append(memory_recall)

#### semantic memory

In [13]:
for q in cognitive['sematinc memory']:
    print(dados_onda1[q].value_counts())

1     8830
8      328
2      194
9       52
10       8
Name: q18, dtype: int64
1     4599
2     3989
9      453
8      328
10      43
Name: q19, dtype: int64
1     7917
9      758
8      328
2      301
10     108
Name: q20, dtype: int64
9     5005
1     3288
2      654
8      328
10     137
Name: q21, dtype: int64


In [14]:
substitutions = {8: np.nan, 9: 0, 10: np.nan, 2: 0}


cognitive_sm = dados_onda1[cognitive['sematinc memory']].replace(substitutions)

for q in cognitive['sematinc memory']:
    print(cognitive_sm[q].value_counts().sort_values())


0.0     246
1.0    8830
Name: q18, dtype: int64
0.0    4442
1.0    4599
Name: q19, dtype: int64
0.0    1059
1.0    7917
Name: q20, dtype: int64
1.0    3288
0.0    5659
Name: q21, dtype: int64


In [15]:
cognitive_sm.isna().sum()

q18    336
q19    371
q20    436
q21    465
dtype: int64

In [16]:
semantic_memory = cognitive_sm.apply(sum, axis=1)
semantic_memory.name = 'semantic_memory'
semantic_memory

0       3.0
1       3.0
2       2.0
3       1.0
4       1.0
       ... 
9407    4.0
9408    4.0
9409    1.0
9410    1.0
9411    4.0
Name: semantic_memory, Length: 9412, dtype: float64

In [17]:
cognitive_dfs.append(semantic_memory)

#### verbal fluency

In [18]:
for q in cognitive['verbal fluency']:
    print(dados_onda1[q].value_counts().sort_index())

0       10
1       12
2       35
3       59
4      141
5      213
6      379
7      466
8      618
9      780
10     961
11     919
12     831
13     774
14     642
15     526
16     414
17     381
18     287
19     180
20     123
21     104
22      61
23      47
24      25
25      24
26      22
27      13
28       7
29       3
30       6
31       2
32       1
33       1
35       1
38       1
888    328
999     15
Name: q14, dtype: int64


In [19]:
substitutions = {888: np.nan, 999: np.nan}


cognitive_vf = dados_onda1[cognitive['verbal fluency']].replace(substitutions)

for q in cognitive['verbal fluency']:
    print(cognitive_vf[q].value_counts().sort_values())


38.0      1
35.0      1
33.0      1
32.0      1
31.0      2
29.0      3
30.0      6
28.0      7
0.0      10
1.0      12
27.0     13
26.0     22
25.0     24
24.0     25
2.0      35
23.0     47
3.0      59
22.0     61
21.0    104
20.0    123
4.0     141
19.0    180
5.0     213
18.0    287
6.0     379
17.0    381
16.0    414
7.0     466
15.0    526
8.0     618
14.0    642
13.0    774
9.0     780
12.0    831
11.0    919
10.0    961
Name: q14, dtype: int64


In [20]:
verbal_fluency = cognitive_vf
verbal_fluency.columns = ['verbal_fluency']
verbal_fluency

Unnamed: 0,verbal_fluency
0,15.0
1,8.0
2,11.0
3,10.0
4,9.0
...,...
9407,16.0
9408,10.0
9409,11.0
9410,12.0


In [21]:
cognitive_dfs.append(verbal_fluency)

#### final df


In [22]:
cognitive_df = pd.concat(cognitive_dfs, axis=1)
cognitive_df

Unnamed: 0,temporal_orientation,memory_recall,semantic_memory,verbal_fluency
0,4.0,6.0,3.0,15.0
1,4.0,2.0,3.0,8.0
2,4.0,4.0,2.0,11.0
3,4.0,4.0,1.0,10.0
4,1.0,3.0,1.0,9.0
...,...,...,...,...
9407,4.0,7.0,4.0,16.0
9408,3.0,4.0,4.0,10.0
9409,4.0,3.0,1.0,11.0
9410,4.0,6.0,1.0,12.0


### Psychological

In [23]:
psy_dfs = []

#### depression

In [24]:
for q in psychological['depression']:
    print(dados_onda1[q].value_counts())

0    6032
1    2438
8     910
9      32
Name: r2, dtype: int64
1    4711
0    3767
8     910
9      24
Name: r3, dtype: int64
1    4381
0    4104
8     910
9      17
Name: r4, dtype: int64
1    6526
0    1925
8     910
9      51
Name: r5, dtype: int64
0    5506
1    2984
8     910
9      12
Name: r6, dtype: int64
1    6564
0    1862
8     910
9      76
Name: r7, dtype: int64
0    5242
1    3244
8     910
9      16
Name: r8, dtype: int64
0    5790
1    2677
8     910
9      35
Name: r9, dtype: int64


In [25]:
psy_dep = dados_onda1[psychological['depression']].replace(8, np.nan).replace(9, np.nan)

for q in psychological['depression']:
    print(psy_dep[q].value_counts())


0.0    6032
1.0    2438
Name: r2, dtype: int64
1.0    4711
0.0    3767
Name: r3, dtype: int64
1.0    4381
0.0    4104
Name: r4, dtype: int64
1.0    6526
0.0    1925
Name: r5, dtype: int64
0.0    5506
1.0    2984
Name: r6, dtype: int64
1.0    6564
0.0    1862
Name: r7, dtype: int64
0.0    5242
1.0    3244
Name: r8, dtype: int64
0.0    5790
1.0    2677
Name: r9, dtype: int64


In [26]:
depression = psy_dep.apply(sum, axis=1)
depression.name = 'depression_scale'
depression

0       2.0
1       2.0
2       5.0
3       3.0
4       5.0
       ... 
9407    1.0
9408    4.0
9409    4.0
9410    7.0
9411    5.0
Name: depression_scale, Length: 9412, dtype: float64

In [27]:
psy_dfs.append(depression)

#### sleep quality

In [28]:
for q in psychological['sleep quality']:
    print(dados_onda1[q].value_counts())

2    4219
3    2547
4    1227
1     872
5     529
9      18
Name: n74, dtype: int64
1    7824
4    1201
3     264
2     112
9      11
Name: n75, dtype: int64


In [29]:
psy_sleep = dados_onda1[psychological['sleep quality']].replace(9, np.nan)

for q in psychological['sleep quality']:
    print(psy_sleep[q].value_counts())


2.0    4219
3.0    2547
4.0    1227
1.0     872
5.0     529
Name: n74, dtype: int64
1.0    7824
4.0    1201
3.0     264
2.0     112
Name: n75, dtype: int64


In [30]:
sleep_quality = psy_sleep.apply(sum, axis=1)
sleep_quality.name = 'sleep_quality'
sleep_quality

0       3.0
1       2.0
2       5.0
3       3.0
4       6.0
       ... 
9407    3.0
9408    4.0
9409    4.0
9410    3.0
9411    2.0
Name: sleep_quality, Length: 9412, dtype: float64

In [31]:
sleep_quality.max(), sleep_quality.min()

(9.0, 2.0)

In [32]:
psy_dfs.append(sleep_quality)

#### final df

In [33]:
psychological_df = pd.concat(psy_dfs, axis=1)
psychological_df

Unnamed: 0,depression_scale,sleep_quality
0,2.0,3.0
1,2.0,2.0
2,5.0,5.0
3,3.0,3.0
4,5.0,6.0
...,...,...
9407,1.0,3.0
9408,4.0,4.0
9409,4.0,4.0
9410,7.0,3.0


### Sensory

In [34]:
sensory_dfs = []

#### hearing deficit

In [35]:
for q in sensorial['hearing deficit']:
    print(dados_onda1[q].value_counts())

2    5395
3    2308
1    1107
4     489
5     100
9      13
Name: n16, dtype: int64


In [36]:
sensorial_hearing = dados_onda1[sensorial['hearing deficit']].replace(9, np.nan)

for q in sensorial['hearing deficit']:
    print(sensorial_hearing[q].value_counts())


2.0    5395
3.0    2308
1.0    1107
4.0     489
5.0     100
Name: n16, dtype: int64


In [37]:
sensorial_hearing.columns = ['hearing_deficit']
sensorial_hearing

Unnamed: 0,hearing_deficit
0,2.0
1,4.0
2,2.0
3,3.0
4,3.0
...,...
9407,2.0
9408,3.0
9409,2.0
9410,2.0


In [38]:
sensory_dfs.append(sensorial_hearing)

#### Distance vision

In [39]:
for q in sensorial['distance vision']:
    print(dados_onda1[q].value_counts())

2    5002
3    2154
4    1037
1     865
5     329
9      25
Name: n6, dtype: int64


In [40]:
sens_distance = dados_onda1[ sensorial['distance vision']].replace(9, np.nan)

for q in  sensorial['distance vision']:
    print(sens_distance[q].value_counts())


2.0    5002
3.0    2154
4.0    1037
1.0     865
5.0     329
Name: n6, dtype: int64


In [41]:
sens_distance.columns = ['distance_vision']
sens_distance

Unnamed: 0,distance_vision
0,2.0
1,3.0
2,3.0
3,2.0
4,2.0
...,...
9407,2.0
9408,2.0
9409,3.0
9410,2.0


In [42]:
sensory_dfs.append(sens_distance)

#### Near vision

In [43]:
for q in sensorial['near vision']:
    print(dados_onda1[q].value_counts())

2    4978
3    2263
4    1264
1     576
5     312
9      19
Name: n7, dtype: int64


In [44]:
sens_near = dados_onda1[ sensorial['near vision']].replace(9, np.nan)

for q in  sensorial['near vision']:
    print(sens_near[q].value_counts())


2.0    4978
3.0    2263
4.0    1264
1.0     576
5.0     312
Name: n7, dtype: int64


In [45]:
sens_near.columns = ['near_vision']
sens_near

Unnamed: 0,near_vision
0,2.0
1,3.0
2,4.0
3,2.0
4,3.0
...,...
9407,2.0
9408,2.0
9409,2.0
9410,2.0


In [46]:
sensory_dfs.append(sens_near)

#### final df

In [47]:
sensory_df = pd.concat(sensory_dfs, axis=1)
sensory_df

Unnamed: 0,hearing_deficit,distance_vision,near_vision
0,2.0,2.0,2.0
1,4.0,3.0,3.0
2,2.0,3.0,4.0
3,3.0,2.0,2.0
4,3.0,2.0,3.0
...,...,...,...
9407,2.0,2.0,2.0
9408,3.0,2.0,2.0
9409,2.0,3.0,2.0
9410,2.0,2.0,2.0


### Locomotor

In [48]:
locomotor_dfs = []

#### gait speed

In [49]:
locomotor['gait speed']

['mf33', 'mf34', 'mf35', 'mf36', 'mf37', 'mf38']

In [50]:
for q in locomotor['gait speed']:
    print(q)
    print(dados_onda1[q].value_counts())

mf33
0       8848
9888     304
9666     258
1          2
Name: mf33, dtype: int64
mf34
3       2452
4       2323
5       1420
2        933
6        709
7        345
9888     304
9666     258
8        200
9        104
1        103
10        85
12        31
11        27
13        18
14        11
16        10
22        10
15         9
19         5
18         5
33         4
38         4
17         3
23         3
28         3
21         3
59         2
40         2
27         2
25         2
34         2
48         2
55         2
44         2
58         2
45         1
57         1
39         1
32         1
0          1
49         1
24         1
29         1
35         1
30         1
41         1
37         1
Name: mf34, dtype: int64
mf35
9888    304
9666    258
28      214
50      178
78      172
       ... 
26       41
95       41
73       39
83       32
86       31
Name: mf35, Length: 102, dtype: int64
mf36
0       8839
8888     562
9666       7
9888       3
1          1
Name: mf36, dtype: 

In [51]:
locomotor_gait_speed = dados_onda1[locomotor['gait speed']].replace(9666, np.nan).replace(9888, np.nan)

for q in locomotor['gait speed']:
    print(locomotor_gait_speed[q].value_counts())


0.0    8848
1.0       2
Name: mf33, dtype: int64
3.0     2452
4.0     2323
5.0     1420
2.0      933
6.0      709
7.0      345
8.0      200
9.0      104
1.0      103
10.0      85
12.0      31
11.0      27
13.0      18
14.0      11
16.0      10
22.0      10
15.0       9
19.0       5
18.0       5
33.0       4
38.0       4
17.0       3
21.0       3
28.0       3
23.0       3
59.0       2
27.0       2
58.0       2
44.0       2
48.0       2
55.0       2
25.0       2
34.0       2
40.0       2
0.0        1
41.0       1
30.0       1
29.0       1
24.0       1
49.0       1
57.0       1
32.0       1
45.0       1
35.0       1
39.0       1
37.0       1
Name: mf34, dtype: int64
28.0    214
50.0    178
78.0    172
97.0    155
38.0    151
       ... 
95.0     41
26.0     41
73.0     39
83.0     32
86.0     31
Name: mf35, Length: 100, dtype: int64
0.0       8839
8888.0     562
1.0          1
Name: mf36, dtype: int64
3.0       2854
4.0       2209
2.0       1202
5.0       1200
8888.0     562
6.0        54

In [52]:
medida1 = (locomotor_gait_speed['mf33'] * 60 + locomotor_gait_speed['mf34'] + locomotor_gait_speed['mf35']/100)

medida2 = (locomotor_gait_speed['mf36'] * 60 + locomotor_gait_speed['mf37'] + locomotor_gait_speed['mf38']/100)

gait_speed = (medida1 + medida2)/2

gait_speed.name = 'gait_speed'
gait_speed

0        3.500
1        3.300
2        4.595
3        5.610
4       38.145
         ...  
9407     4.265
9408     5.800
9409       NaN
9410       NaN
9411     5.270
Name: gait_speed, Length: 9412, dtype: float64

In [53]:
locomotor_gait_speed['mf38'].isna().sum(), locomotor_gait_speed['mf37'].isna().sum(), locomotor_gait_speed['mf36'].isna().sum()

(10, 10, 10)

In [54]:
locomotor_gait_speed['mf33'].isna().sum(), locomotor_gait_speed['mf34'].isna().sum(), locomotor_gait_speed['mf35'].isna().sum()

(562, 562, 562)

In [55]:
locomotor_dfs.append(gait_speed)

#### Balance Test

In [56]:
for q in locomotor['balance']:
    print(dados_onda1[q].value_counts())

10.00      8700
9999.00     342
9888.00     151
9666.00      89
4.00          5
           ... 
3.80          1
1.73          1
9.10          1
2.38          1
2.41          1
Name: mf30, Length: 107, dtype: int64
10.00      7930
9999.00     395
9666.00     173
9888.00     160
6.00         39
           ... 
8.28          1
9.85          1
8.79          1
3.56          1
1.29          1
Name: mf31, Length: 404, dtype: int64
30.00      4505
10.00      1470
9999.00     457
9666.00     331
9888.00     187
           ... 
20.06         1
8.01          1
18.13         1
2.46          1
15.03         1
Name: mf32, Length: 1056, dtype: int64


In [57]:
loc_balance = dados_onda1[locomotor['balance']].replace(9666, np.nan).replace(9888, np.nan).replace(9999, np.nan)

for q in locomotor['balance']:
    print(loc_balance[q].value_counts().sort_values())


4.71        1
9.84        1
5.51        1
7.98        1
9.50        1
         ... 
3.00        3
7.00        4
5.00        4
4.00        5
10.00    8700
Name: mf30, Length: 104, dtype: int64
5.51        1
2.64        1
3.16        1
2.68        1
3.21        1
         ... 
2.00       27
5.00       27
8.00       37
6.00       39
10.00    7930
Name: mf31, Length: 401, dtype: int64
24.06       1
14.87       1
15.88       1
24.50       1
17.28       1
         ... 
5.00       64
2.00       66
3.00       70
10.00    1470
30.00    4505
Name: mf32, Length: 1053, dtype: int64


In [58]:
def rule(x):
    if x < 3:
        return 0
    elif x < 10:
        return 1
    elif x >= 10:
        return 2
    else: 
        return np.nan

foot1 = (loc_balance['mf30'] >= 10).astype(int)
foot2 = (loc_balance['mf31'] >= 10).astype(int) 
foot3 = loc_balance['mf32'].map(rule)

balance = foot1 + foot2 + foot3

balance.name = 'balance'
balance

0       4.0
1       4.0
2       4.0
3       4.0
4       NaN
       ... 
9407    3.0
9408    NaN
9409    NaN
9410    NaN
9411    4.0
Name: balance, Length: 9412, dtype: float64

In [59]:
balance.max(), balance.min()

(4.0, 0.0)

In [60]:
locomotor_dfs.append(balance)

#### final df

In [61]:
locomotor_df = pd.concat(locomotor_dfs, axis=1)
locomotor_df

Unnamed: 0,gait_speed,balance
0,3.500,4.0
1,3.300,4.0
2,4.595,4.0
3,5.610,4.0
4,38.145,
...,...,...
9407,4.265,3.0
9408,5.800,
9409,,
9410,,


### Vitality

In [62]:
vitality_dfs = []

#### Hadgrip

In [63]:
vitality['handgrip']

['mf27', 'mf28', 'mf29']

In [64]:
for q in vitality['handgrip']:
    print(q)
    print(dados_onda1[q].value_counts().sort_index().tail(10))

mf27
60        7
62        2
67        1
68        1
70        1
8888    180
9555     20
9666     28
9777    166
9888    145
Name: mf27, dtype: int64
mf28
63        3
64        1
65        2
66        1
73        1
8888    180
9555     27
9666     34
9777    169
9888    150
Name: mf28, dtype: int64
mf29
62        1
64        1
65        1
66        1
73        1
8888    180
9555     27
9666     34
9777    170
9888    152
Name: mf29, dtype: int64


In [65]:
substitute = {
    '8888': np.nan,
    '9555': np.nan,
    '9666': np.nan,
    '9777': np.nan,
    '9888': np.nan,
    8888: np.nan,
    9555: np.nan,
    9666: np.nan,
    9777: np.nan,
    9888: np.nan
}

vitality_handgrip = dados_onda1[vitality['handgrip']].replace(substitute)

for q in vitality['handgrip']:
    print(vitality_handgrip[q].value_counts().sort_index().tail(10))


55.0    12
56.0     5
57.0     2
58.0     2
59.0     4
60.0     7
62.0     2
67.0     1
68.0     1
70.0     1
Name: mf27, dtype: int64
58.0    3
59.0    1
60.0    2
61.0    3
62.0    1
63.0    3
64.0    1
65.0    2
66.0    1
73.0    1
Name: mf28, dtype: int64
57.0    5
58.0    5
59.0    4
60.0    6
61.0    1
62.0    1
64.0    1
65.0    1
66.0    1
73.0    1
Name: mf29, dtype: int64


In [66]:
handgrip = vitality_handgrip.apply(lambda x: x.mean(), axis=1)

handgrip.name = 'handgrip'

handgrip

0       24.333333
1       25.666667
2       14.666667
3       24.000000
4             NaN
          ...    
9407    22.666667
9408    20.333333
9409    26.666667
9410    38.333333
9411    18.333333
Name: handgrip, Length: 9412, dtype: float64

In [67]:
vitality_dfs.append(handgrip)

#### IMC

In [68]:
for q in vitality['peso']:
    print(q)
    print(dados_onda1[q].value_counts().sort_index())

mf22
28.299999         1
29.150000         1
29.200001         1
29.799999         2
29.900000         1
               ... 
148.500000        1
148.899990        1
149.000000        1
173.649990        1
99999.000000    385
Name: mf22, Length: 1450, dtype: int64
mf24
8.0      9213
35.0        3
38.0        1
39.0        1
40.0        2
         ... 
105.0       1
110.0       2
113.0       1
126.0       1
140.0       1
Name: mf24, Length: 63, dtype: int64


In [69]:
for q in vitality['altura']:
    print(dados_onda1[q].value_counts().sort_index())

1.245          1
1.250          1
1.260          1
1.270          1
1.290          1
            ... 
1.910          3
1.920          1
1.930          1
1.950          2
99999.000    369
Name: mf13, Length: 603, dtype: int64
1.30       2
1.39       1
1.40       2
1.44       1
1.45       4
1.47       3
1.48       3
1.49       1
1.50      13
1.51       1
1.52       2
1.53       3
1.54       2
1.55       6
1.56       8
1.57       5
1.58       4
1.59       5
1.60      17
1.61       2
1.62       8
1.63       3
1.65      21
1.66       3
1.67       2
1.68       5
1.69       3
1.70      12
1.71       3
1.72       4
1.73       6
1.75       6
1.76       2
1.78       3
1.79       1
1.80       4
1.83       2
1.85       1
8.00    9238
Name: mf15, dtype: int64


In [70]:
vit_peso = dados_onda1[vitality['peso']].replace(99999.000000, np.nan).replace(8, np.nan)

for q in vitality['peso']:
    print(vit_peso[q].value_counts().sort_index())


28.299999     1
29.150000     1
29.200001     1
29.799999     2
29.900000     1
             ..
143.899990    1
148.500000    1
148.899990    1
149.000000    1
173.649990    1
Name: mf22, Length: 1449, dtype: int64
35.0     3
38.0     1
39.0     1
40.0     2
40.9     1
        ..
105.0    1
110.0    2
113.0    1
126.0    1
140.0    1
Name: mf24, Length: 62, dtype: int64


In [71]:
vit_altura = dados_onda1[vitality['altura']].replace(99999.000000, np.nan).replace(8, np.nan)

for q in vitality['altura']:
    print(vit_altura[q].value_counts().sort_index())


1.245    1
1.250    1
1.260    1
1.270    1
1.290    1
        ..
1.900    1
1.910    3
1.920    1
1.930    1
1.950    2
Name: mf13, Length: 602, dtype: int64
1.30     2
1.39     1
1.40     2
1.44     1
1.45     4
1.47     3
1.48     3
1.49     1
1.50    13
1.51     1
1.52     2
1.53     3
1.54     2
1.55     6
1.56     8
1.57     5
1.58     4
1.59     5
1.60    17
1.61     2
1.62     8
1.63     3
1.65    21
1.66     3
1.67     2
1.68     5
1.69     3
1.70    12
1.71     3
1.72     4
1.73     6
1.75     6
1.76     2
1.78     3
1.79     1
1.80     4
1.83     2
1.85     1
Name: mf15, dtype: int64


In [72]:
vit_altura.loc[vit_altura['mf13'].isna(), 'mf13'] = vit_altura.loc[vit_altura['mf13'].isna(), 'mf15']

vit_peso.loc[vit_peso['mf22'].isna(), 'mf22'] = vit_peso.loc[vit_peso['mf22'].isna(), 'mf24']

In [73]:
imc = vit_peso['mf22']/(vit_altura['mf13'])**2
imc.name = 'imc'
imc


0       23.343753
1       31.863105
2       23.236321
3       27.860572
4       22.600775
          ...    
9407    29.231439
9408    26.710508
9409    29.811097
9410    20.716794
9411    34.547347
Name: imc, Length: 9412, dtype: float64

In [74]:
vitality_dfs.append(imc)

#### final df

In [75]:
vitality_df = pd.concat(vitality_dfs, axis=1)
vitality_df

Unnamed: 0,handgrip,imc
0,24.333333,23.343753
1,25.666667,31.863105
2,14.666667,23.236321
3,24.000000,27.860572
4,,22.600775
...,...,...
9407,22.666667,29.231439
9408,20.333333,26.710508
9409,26.666667,29.811097
9410,38.333333,20.716794


## Variáveis para pintar

In [76]:
dados_onda1.columns

Index(['id', 'iddom', 'upa', 'peso_calibrado', 'estrato', 'regiao', 'zona',
       'ar4', 'ar6', 'ar8',
       ...
       'ci4', 'ci5', 'ci6', 'ci7', 'ci8', 'ci9', 'rendadom', 'rendadompc',
       'rendaind', 'propriedades'],
      dtype='object', length=1082)

In [85]:
sexo = dados_onda1['sexo']
etnia = dados_onda1['e9']
regiao = dados_onda1['regiao']
etnia.name = 'etinia'
sexo, etinia, regiao

(0       0
 1       1
 2       0
 3       0
 4       0
        ..
 9407    0
 9408    0
 9409    1
 9410    1
 9411    0
 Name: sexo, Length: 9412, dtype: int64,
 0       3
 1       9
 2       9
 3       3
 4       2
        ..
 9407    3
 9408    3
 9409    1
 9410    3
 9411    1
 Name: etinia, Length: 9412, dtype: int64,
 0       3
 1       3
 2       3
 3       3
 4       3
        ..
 9407    3
 9408    3
 9409    3
 9410    3
 9411    3
 Name: regiao, Length: 9412, dtype: int64)

In [86]:
sexo_moradores  = dados_onda1[[f'ar12_{x}' for x in range (1, 15)]]
sexo_moradores.columns = [f'sexo_morador_{x}' for x in range (1, 15)]
idade_moradores = dados_onda1[[f'ar14_{x}' for x in range (1, 15)]]
idade_moradores.columns = [f'idade_morador_{x}' for x in range (1, 15)]
papel_moradores = dados_onda1[[f'ar15_{x}' for x in range (1, 15)]]
papel_moradores.columns = [f'papel_morador_{x}' for x in range (1, 15)]

In [87]:
papel_moradores

Unnamed: 0,papel_morador_1,papel_morador_2,papel_morador_3,papel_morador_4,papel_morador_5,papel_morador_6,papel_morador_7,papel_morador_8,papel_morador_9,papel_morador_10,papel_morador_11,papel_morador_12,papel_morador_13,papel_morador_14
0,2.0,3.0,,,,,,,,,,,,
1,2.0,,,,,,,,,,,,,
2,2.0,,,,,,,,,,,,,
3,3.0,6.0,,,,,,,,,,,,
4,12.0,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9407,2.0,3.0,,,,,,,,,,,,
9408,3.0,6.0,,,,,,,,,,,,
9409,,,,,,,,,,,,,,
9410,,,,,,,,,,,,,,


In [88]:
idade_moradores

Unnamed: 0,idade_morador_1,idade_morador_2,idade_morador_3,idade_morador_4,idade_morador_5,idade_morador_6,idade_morador_7,idade_morador_8,idade_morador_9,idade_morador_10,idade_morador_11,idade_morador_12,idade_morador_13,idade_morador_14
0,51.0,19.0,,,,,,,,,,,,
1,60.0,,,,,,,,,,,,,
2,57.0,,,,,,,,,,,,,
3,48.0,23.0,,,,,,,,,,,,
4,88.0,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9407,69.0,33.0,,,,,,,,,,,,
9408,42.0,11.0,,,,,,,,,,,,
9409,,,,,,,,,,,,,,
9410,,,,,,,,,,,,,,


In [89]:
sexo_moradores

Unnamed: 0,sexo_morador_1,sexo_morador_2,sexo_morador_3,sexo_morador_4,sexo_morador_5,sexo_morador_6,sexo_morador_7,sexo_morador_8,sexo_morador_9,sexo_morador_10,sexo_morador_11,sexo_morador_12,sexo_morador_13,sexo_morador_14
0,0.0,1.0,,,,,,,,,,,,
1,0.0,,,,,,,,,,,,,
2,1.0,,,,,,,,,,,,,
3,0.0,1.0,,,,,,,,,,,,
4,0.0,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9407,1.0,0.0,,,,,,,,,,,,
9408,0.0,1.0,,,,,,,,,,,,
9409,,,,,,,,,,,,,,
9410,,,,,,,,,,,,,,


In [90]:
socio_dem = pd.concat([sexo, regiao, etnia, sexo_moradores, papel_moradores, idade_moradores], axis=1)

## Final FINAL DF

In [None]:
df = pd.concat([vitality_df, locomotor_df, sensory_df, psychological_df, cognitive_df, socio_dem], axis=1)
df

Unnamed: 0,handgrip,imc,gait_speed,balance,hearing_deficit,distance_vision,near_vision,depression_scale,sleep_quality,temporal_orientation,...,idade_morador_5,idade_morador_6,idade_morador_7,idade_morador_8,idade_morador_9,idade_morador_10,idade_morador_11,idade_morador_12,idade_morador_13,idade_morador_14
0,24.333333,23.343753,3.500,4.0,2.0,2.0,2.0,2.0,3.0,4.0,...,,,,,,,,,,
1,25.666667,31.863105,3.300,4.0,4.0,3.0,3.0,2.0,2.0,4.0,...,,,,,,,,,,
2,14.666667,23.236321,4.595,4.0,2.0,3.0,4.0,5.0,5.0,4.0,...,,,,,,,,,,
3,24.000000,27.860572,5.610,4.0,3.0,2.0,2.0,3.0,3.0,4.0,...,,,,,,,,,,
4,,22.600775,38.145,,3.0,2.0,3.0,5.0,6.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9407,22.666667,29.231439,4.265,3.0,2.0,2.0,2.0,1.0,3.0,4.0,...,,,,,,,,,,
9408,20.333333,26.710508,5.800,,3.0,2.0,2.0,4.0,4.0,3.0,...,,,,,,,,,,
9409,26.666667,29.811097,,,2.0,3.0,2.0,4.0,4.0,4.0,...,,,,,,,,,,
9410,38.333333,20.716794,,,2.0,2.0,2.0,7.0,3.0,4.0,...,,,,,,,,,,


In [None]:
df.describe()

Unnamed: 0,handgrip,imc,gait_speed,balance,hearing_deficit,distance_vision,near_vision,depression_scale,sleep_quality,temporal_orientation,...,idade_morador_5,idade_morador_6,idade_morador_7,idade_morador_8,idade_morador_9,idade_morador_10,idade_morador_11,idade_morador_12,idade_morador_13,idade_morador_14
count,8873.0,9168.0,8840.0,8437.0,9399.0,9387.0,9393.0,8303.0,9385.0,9084.0,...,720.0,332.0,163.0,76.0,36.0,18.0,8.0,6.0,1.0,1.0
mean,25.421729,27.697987,4.782973,3.651653,2.263751,2.463407,2.548387,3.955197,4.060416,3.454095,...,16.416667,13.762048,12.662577,12.473684,10.805556,13.055556,14.75,13.333333,23.0,25.0
std,9.977432,5.271347,3.220193,0.759109,0.773109,0.929669,0.915875,1.762156,1.623462,0.962918,...,14.937748,13.69612,13.69384,16.152171,10.358763,15.975983,13.604096,14.706008,,
min,1.0,11.787508,1.01,0.0,1.0,1.0,1.0,0.0,2.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,23.0,25.0
25%,18.0,24.138893,3.44,4.0,2.0,2.0,2.0,2.0,3.0,3.0,...,7.0,4.0,4.0,2.0,1.75,3.0,6.25,2.5,23.0,25.0
50%,24.0,27.242172,4.23,4.0,2.0,2.0,2.0,4.0,4.0,4.0,...,13.0,11.0,8.0,8.5,8.5,6.5,8.5,7.0,23.0,25.0
75%,31.666667,30.676569,5.3,4.0,3.0,3.0,3.0,5.0,5.0,4.0,...,21.0,18.0,18.0,16.25,13.75,12.0,20.25,25.75,23.0,25.0
max,71.333333,57.981211,73.64,4.0,5.0,5.0,5.0,8.0,9.0,4.0,...,87.0,86.0,78.0,95.0,38.0,57.0,36.0,32.0,23.0,25.0


In [None]:
df.to_csv('Data/processed_df.csv', index=False)

  values = values.astype(str)
