In [1]:
"""
Expected data path and required files: 

DATA_DIR
├── UK_65
│    ├── *_PERSON.csv
│    ├── *_CONTACT_DIAGNOSTICS.csv
│    ├── *_CONTACT_PRESCRIPTIONS.csv
│    ├── *_MEASURE_CHANGES.csv
│    ├── *_SOCIAL_CHANGES.csv
│    ├── *_ATTRIBUTES_CHANGES.csv
│    └── *_PERSON_MEASURE_CHANGES.csv
│
├── UK_70
├── FR_65
└── FR_70

MAPPING_DIR
├── mapping.csv
├── complementary_icd10_mapping_UK.pkl # dictionay mapping labels to icd10 code
└── complementary_icd10_mapping_FR.pkl # dictionay mapping labels to icd10 code
"""

DATA_DIR = '/localdrive10TB/datasets/thin-data/Extractions_EU'
MAPPING_DIR = '/localdrive10TB/users/karim.zaidi/mapping'
OUTPUT_DIR = '/localdrive10TB/users/karim.zaidi/datasets'

In [2]:
from functools import reduce

# ICD-10 codes grouped by disease of interest. Insertion order matters: it is
# the order of DISEASES_OF_INTEREST further down.
CODES = {
    'alzheimer': {
        'F00',  # Dementia in Alzheimer disease (G30.-†)
        'G30',  # Alzheimer disease
    },
    'parkinson': {
        'G20',  # Parkinson disease
        # Commented out (not included):
        #   'G23.2' - Multiple system atrophy, parkinsonian type [MSA-P]
        #   'G23.3' - Multiple system atrophy, cerebellar type [MSA-C]
    },
    'vascular_dementias': {
        'F01',  # Vascular dementia
    },
    # Mild cognitive impairment
    'mci': {
        'F06.7',  # Mild cognitive disorder
        'R41.8',  # Other and unspecified symptoms and signs involving cognitive functions and awareness
        'R41',    # Other symptoms and signs involving cognitive functions and awareness
    },
    'alcohol_dementias': {
        'F10.6',  # Amnesic syndrome due to alcohol
        # Commented out (not included):
        #   'F02.8' - Dementia in other diseases classified elsewhere (+ alcohol intoxication)
        'G31.2',  # Degeneration of nervous system due to alcohol
        'E51.2',  # Wernicke encephalopathy
    },
    'frontotemporal_dementias': {
        'G31.0',  # Circumscribed brain atrophy
        'F02.0',  # Dementia in Pick disease (G31.0†)
    },
    'other_dementias': {
        'F02',    # Dementia in other diseases classified elsewhere
        'F03',    # Unspecified dementia
        'F04',    # Organic amnesic syndrome, not induced by alcohol and other psychoactive substances
        'F05.1',  # Delirium superimposed on dementia
        'G31.01',  # Pick disease
        'G31.09',  # Other frontotemporal neurocognitive disorder
        'G31.1',   # Senile degeneration of brain, not elsewhere classified
    },
    'parkinson_dementias': {
        'F02.3',  # Dementia in Parkinson disease
        'G31.8',  # Other specified degenerative diseases of nervous system
                  #     Lewy body (disease) (dementia) (F02.8 *)
        # Commented out (not included):
        #   'F02.8' - Dementia in other diseases classified elsewhere
    },
}

# 'all_dementias' is the union of every group above except Parkinson disease,
# alcohol-related dementias and MCI.
all_dementias_codes = [
    group_codes
    for disease, group_codes in CODES.items()
    if disease not in {'parkinson', 'alcohol_dementias', 'mci'}
]
CODES['all_dementias'] = reduce(lambda merged, group_codes: merged | group_codes, all_dementias_codes)

# Every disease label, including the aggregated 'all_dementias' entry.
DISEASES_OF_INTEREST = list(CODES)

# Utils

### Imports

In [3]:
from functools import partial

import pandas as pd

from utils.dataset import get_dataset, processing_nans, save_dataset

# Silence pandas' chained-assignment warning (treated as a false positive here).
pd.options.mode.chained_assignment = None

# Pre-bind the notebook-wide configuration onto the project helpers so the
# cohort cells below only pass what differs per cohort. The imported names are
# rebound to the partials on purpose; re-running this cell re-imports them first.
get_dataset = partial(
    get_dataset,
    data_dir=DATA_DIR,
    mapping_dir=MAPPING_DIR,
    disease2codes=CODES,
)
processing_nans = partial(
    processing_nans,
    diseases_of_interest=DISEASES_OF_INTEREST,
)
save_dataset = partial(
    save_dataset,
    base_dir=OUTPUT_DIR,
)

### Params

In [23]:
# Parameters forwarded unchanged to every get_dataset call below.
# NOTE(review): min_* are presumably minimum record/activity requirements
# before, during and after the period of interest, and start_year/end_year
# presumably bound that period - confirm in utils.dataset.get_dataset.
min_before, min_during, min_after = 1, 0, 1
start_year, end_year = 2008, 2010

n_most_frequent = 50

# Accepted (low, high) bounds for height and weight; value from paper.
valid_height_range = (100, 250)
valid_weight_range = (30, 250)

extraction_date = '2023-01-01'

### UK_65

In [24]:
%%time
# Build the UK / age-65 cohort. get_dataset returns four values: the dataset,
# the "had another disease before" information, the inactive patient ids and
# the statistics, all of which are consumed by the following cells.
(
    dataset_UK_65,
    has_another_disease_before_UK_65,
    inactive_ids_UK_65,
    stats_UK_65,
) = get_dataset(
    country='UK',
    age=65,
    valid_height_range=valid_height_range,
    valid_weight_range=valid_weight_range,
    min_before=min_before,
    min_during=min_during,
    min_after=min_after,
    n_most_frequent=n_most_frequent,
    start_year=start_year,
    end_year=end_year,
    extraction_date=extraction_date,
)

* Begin datasets loadings:

  > Loading: CONTACT...
	[INFO] 12,835,764 samples for 58,104 unique patients

  > Loading: CONTACT DIAGNOSTICS...
	[INFO] 3,557,475 samples for 57,940 unique patients

  > Loading: PERSON...
	[INFO] 58,104 samples for 58,104 unique patients

  > Loading: CONTACT PRESCRIPTIONS...
	[INFO] 15,244,661 samples for 57,998 unique patients

  > Loading: SOCIAL CHANGES...
	[INFO] 74,563 samples for 58,104 unique patients

  > Loading: MEASURE CHANGES...
	[INFO] 4,903,990 samples for 57,894 unique patients

  > Loading: ATTRIBUTES CHANGES...
	[INFO] 786,165 samples for 57,801 unique patients

  > Loading: PERSON MEASURE CHANGES...
	[INFO] 4,903,990 samples for 57,894 unique patients

* Filtering out 3,605 inactive patients (6.20% of 58104)

* Begin extraction of interesting features:
Note: statistics display using [INFO] flag is from the current table only and are not combined with info from other tables

  > GENDER:
	[INFO] We have 54,499 unique patients:
		- 52.12%

In [25]:
# Run the NaN-processing step on the UK / age-65 dataset; the report printed
# below this cell is emitted by processing_nans.
# NOTE(review): the result is bound back to the input name, so re-running this
# cell on its own passes the already-processed dataset in again - confirm that
# is harmless, or re-run the get_dataset cell first.
dataset_UK_65 = processing_nans(
    dataset_UK_65,
    has_another_disease_before_UK_65,
)

* Original dataset:


Unnamed: 0,person_id,diseases,gender_code,person_state_code,duration (days),avg. Alcohol (glasses/day),avg. Tobaco (cigarettes/day),avg. BMI,avg. CHARLSON,J01
# NaN,0.0,1143.0,0.0,41.0,41.0,13986.0,38514.0,2714.0,533.0,40.0
NaN (%),0.0,2.1,0.0,0.08,0.08,25.66,70.67,4.98,0.98,0.07


54,497 patients

* Processing NaNs:
	> Excluding 41 patients due to being either dead before the 
	period of interest or always Temporaire / Inactive.

	> Excluding 1,162 patients who had a disease before the end of
	the period of interest.

		- 91 patients having alzheimer
		- 225 patients having parkinson
		- 27 patients having vascular_dementias
		- 850 patients having mci
		- 26 patients having alcohol_dementias
		- 1 patients having frontotemporal_dementias
		- 40 patients having other_dementias
		- 2 patients having parkinson_dementias
		- 144 patients having all_dementias

	> Filling 104 patients' diseases column for patient having no
	diagnostics (assuming no neurodegenerative diseases).

	> Filling 40 patients' medications columns for patients having
	no prescriptions (assuming no medications were taken).

* After processing:


Unnamed: 0,person_id,diseases,gender_code,person_state_code,duration (days),avg. Alcohol (glasses/day),avg. Tobaco (cigarettes/day),avg. BMI,avg. CHARLSON,J01
# NaN,0.0,0.0,0.0,0.0,0.0,13690.0,37671.0,2655.0,514.0,0.0
NaN (%),0.0,0.0,0.0,0.0,0.0,25.69,70.69,4.98,0.96,0.0


53,294 patients

Diseases:
	- 548 patients having alzheimer
	- 292 patients having parkinson
	- 279 patients having vascular_dementias
	- 1,530 patients having mci
	- 13 patients having alcohol_dementias
	- 4 patients having frontotemporal_dementias
	- 201 patients having other_dementias
	- 47 patients having parkinson_dementias
	- 958 patients having all_dementias


In [26]:
# Persist the UK / age-65 results; base_dir is already bound to OUTPUT_DIR on
# save_dataset. The trailing positional arguments are the country and the age.
save_dataset(
    dataset_UK_65,
    has_another_disease_before_UK_65,
    inactive_ids_UK_65,
    stats_UK_65,
    'UK',
    65,
)

  df.to_csv(dataset_path, index=False)


### UK_70

In [27]:
%%time
# Build the UK / age-70 cohort. get_dataset returns four values: the dataset,
# the "had another disease before" information, the inactive patient ids and
# the statistics, all of which are consumed by the following cells.
(
    dataset_UK_70,
    has_another_disease_before_UK_70,
    inactive_ids_UK_70,
    stats_UK_70,
) = get_dataset(
    country='UK',
    age=70,
    valid_height_range=valid_height_range,
    valid_weight_range=valid_weight_range,
    min_before=min_before,
    min_during=min_during,
    min_after=min_after,
    n_most_frequent=n_most_frequent,
    start_year=start_year,
    end_year=end_year,
    extraction_date=extraction_date,
)

* Begin datasets loadings:

  > Loading: CONTACT...
	[INFO] 12,449,963 samples for 48,738 unique patients

  > Loading: CONTACT DIAGNOSTICS...
	[INFO] 3,282,257 samples for 48,600 unique patients

  > Loading: PERSON...
	[INFO] 48,738 samples for 48,738 unique patients

  > Loading: CONTACT PRESCRIPTIONS...
	[INFO] 15,544,939 samples for 48,660 unique patients

  > Loading: SOCIAL CHANGES...
	[INFO] 64,377 samples for 48,738 unique patients

  > Loading: MEASURE CHANGES...
	[INFO] 4,722,086 samples for 48,586 unique patients

  > Loading: ATTRIBUTES CHANGES...
	[INFO] 749,614 samples for 48,460 unique patients

  > Loading: PERSON MEASURE CHANGES...
	[INFO] 4,722,086 samples for 48,586 unique patients

* Filtering out 3,016 inactive patients (6.19% of 48738)

* Begin extraction of interesting features:
Note: statistics display using [INFO] flag is from the current table only and are not combined with info from other tables

  > GENDER:
	[INFO] We have 45,722 unique patients:
		- 52.87%

In [28]:
# Run the NaN-processing step on the UK / age-70 dataset; the report printed
# below this cell is emitted by processing_nans.
# NOTE(review): the result is bound back to the input name, so re-running this
# cell on its own passes the already-processed dataset in again - confirm that
# is harmless, or re-run the get_dataset cell first.
dataset_UK_70 = processing_nans(
    dataset_UK_70,
    has_another_disease_before_UK_70,
)

* Original dataset:


Unnamed: 0,person_id,diseases,gender_code,person_state_code,duration (days),avg. Alcohol (glasses/day),avg. Tobaco (cigarettes/day),avg. BMI,avg. CHARLSON,J01
# NaN,0.0,1274.0,0.0,35.0,35.0,11894.0,33486.0,1986.0,465.0,21.0
NaN (%),0.0,2.79,0.0,0.08,0.08,26.01,73.24,4.34,1.02,0.05


45,722 patients

* Processing NaNs:
	> Excluding 35 patients due to being either dead before the 
	period of interest or always Temporaire / Inactive.

	> Excluding 1,437 patients who had a disease before the end of
	the period of interest.

		- 225 patients having alzheimer
		- 272 patients having parkinson
		- 78 patients having vascular_dementias
		- 995 patients having mci
		- 18 patients having alcohol_dementias
		- 2 patients having frontotemporal_dementias
		- 76 patients having other_dementias
		- 12 patients having parkinson_dementias
		- 351 patients having all_dementias

	> Filling 97 patients' diseases column for patient having no
	diagnostics (assuming no neurodegenerative diseases).

	> Filling 20 patients' medications columns for patients having
	no prescriptions (assuming no medications were taken).

* After processing:


Unnamed: 0,person_id,diseases,gender_code,person_state_code,duration (days),avg. Alcohol (glasses/day),avg. Tobaco (cigarettes/day),avg. BMI,avg. CHARLSON,J01
# NaN,0.0,0.0,0.0,0.0,0.0,11495.0,32423.0,1919.0,454.0,0.0
NaN (%),0.0,0.0,0.0,0.0,0.0,25.98,73.27,4.34,1.03,0.0


44,250 patients

Diseases:
	- 1,091 patients having alzheimer
	- 350 patients having parkinson
	- 528 patients having vascular_dementias
	- 2,198 patients having mci
	- 15 patients having alcohol_dementias
	- 2 patients having frontotemporal_dementias
	- 358 patients having other_dementias
	- 64 patients having parkinson_dementias
	- 1,851 patients having all_dementias


In [29]:
# Persist the UK / age-70 results; base_dir is already bound to OUTPUT_DIR on
# save_dataset. The trailing positional arguments are the country and the age.
save_dataset(
    dataset_UK_70,
    has_another_disease_before_UK_70,
    inactive_ids_UK_70,
    stats_UK_70,
    'UK',
    70,
)

  df.to_csv(dataset_path, index=False)


### FR_65

In [30]:
%%time
# Build the FR / age-65 cohort. get_dataset returns four values: the dataset,
# the "had another disease before" information, the inactive patient ids and
# the statistics, all of which are consumed by the following cells.
(
    dataset_FR_65,
    has_another_disease_before_FR_65,
    inactive_ids_FR_65,
    stats_FR_65,
) = get_dataset(
    country='FR',
    age=65,
    valid_height_range=valid_height_range,
    valid_weight_range=valid_weight_range,
    min_before=min_before,
    min_during=min_during,
    min_after=min_after,
    n_most_frequent=n_most_frequent,
    start_year=start_year,
    end_year=end_year,
    extraction_date=extraction_date,
)

* Begin datasets loadings:

  > Loading: CONTACT...
	[INFO] 1,829,146 samples for 26,266 unique patients

  > Loading: CONTACT DIAGNOSTICS...
	[INFO] 6,063,963 samples for 26,102 unique patients

  > Loading: PERSON...
	[INFO] 26,266 samples for 26,266 unique patients

  > Loading: CONTACT PRESCRIPTIONS...
	[INFO] 5,773,545 samples for 25,390 unique patients

  > Loading: SOCIAL CHANGES...
	[INFO] 65,135 samples for 26,266 unique patients

  > Loading: MEASURE CHANGES...
	[INFO] 3,179,812 samples for 23,633 unique patients

  > Loading: ATTRIBUTES CHANGES...
	[INFO] 289,671 samples for 26,266 unique patients

  > Loading: PERSON MEASURE CHANGES...
	[INFO] 3,179,812 samples for 23,633 unique patients

* Filtering out 3,736 inactive patients (14.22% of 26266)

* Begin extraction of interesting features:
Note: statistics display using [INFO] flag is from the current table only and are not combined with info from other tables

  > GENDER:
	[INFO] We have 22,530 unique patients:
		- 52.05% 

In [31]:
# Run the NaN-processing step on the FR / age-65 dataset; the report printed
# below this cell is emitted by processing_nans.
# NOTE(review): the result is bound back to the input name, so re-running this
# cell on its own passes the already-processed dataset in again - confirm that
# is harmless, or re-run the get_dataset cell first.
dataset_FR_65 = processing_nans(
    dataset_FR_65,
    has_another_disease_before_FR_65,
)

* Original dataset:


Unnamed: 0,person_id,diseases,gender_code,person_state_code,duration (days),avg. Alcohol (glasses/day),avg. Tobaco (cigarettes/day),avg. BMI,avg. CHARLSON,N02
# NaN,0.0,594.0,0.0,0.0,0.0,21649.0,21520.0,9320.0,447.0,380.0
NaN (%),0.0,2.64,0.0,0.0,0.0,96.09,95.52,41.37,1.98,1.69


22,530 patients

* Processing NaNs:
	> Excluding 0 patients due to being either dead before the 
	period of interest or always Temporaire / Inactive.

	> Excluding 491 patients who had a disease before the end of
	the period of interest.

		- 49 patients having alzheimer
		- 69 patients having parkinson
		- 38 patients having vascular_dementias
		- 331 patients having mci
		- 1 patients having alcohol_dementias
		- 8 patients having frontotemporal_dementias
		- 12 patients having other_dementias
		- 7 patients having parkinson_dementias
		- 105 patients having all_dementias

	> Filling 123 patients' diseases column for patient having no
	diagnostics (assuming no neurodegenerative diseases).

	> Filling 380 patients' medications columns for patients having
	no prescriptions (assuming no medications were taken).

* After processing:


Unnamed: 0,person_id,diseases,gender_code,person_state_code,duration (days),avg. Alcohol (glasses/day),avg. Tobaco (cigarettes/day),avg. BMI,avg. CHARLSON,N02
# NaN,0.0,0.0,0.0,0.0,0.0,21181.0,21051.0,9166.0,446.0,0.0
NaN (%),0.0,0.0,0.0,0.0,0.0,96.11,95.52,41.59,2.02,0.0


22,039 patients

Diseases:
	- 62 patients having alzheimer
	- 106 patients having parkinson
	- 20 patients having vascular_dementias
	- 367 patients having mci
	- 3 patients having alcohol_dementias
	- 3 patients having frontotemporal_dementias
	- 29 patients having other_dementias
	- 1 patients having parkinson_dementias
	- 105 patients having all_dementias


In [32]:
# Persist the FR / age-65 results; base_dir is already bound to OUTPUT_DIR on
# save_dataset. The trailing positional arguments are the country and the age.
save_dataset(
    dataset_FR_65,
    has_another_disease_before_FR_65,
    inactive_ids_FR_65,
    stats_FR_65,
    'FR',
    65,
)

  df.to_csv(dataset_path, index=False)


### FR_70

In [33]:
%%time
# Build the FR / age-70 cohort. get_dataset returns four values: the dataset,
# the "had another disease before" information, the inactive patient ids and
# the statistics, all of which are consumed by the following cells.
(
    dataset_FR_70,
    has_another_disease_before_FR_70,
    inactive_ids_FR_70,
    stats_FR_70,
) = get_dataset(
    country='FR',
    age=70,
    valid_height_range=valid_height_range,
    valid_weight_range=valid_weight_range,
    min_before=min_before,
    min_during=min_during,
    min_after=min_after,
    n_most_frequent=n_most_frequent,
    start_year=start_year,
    end_year=end_year,
    extraction_date=extraction_date,
)

* Begin datasets loadings:

  > Loading: CONTACT...
	[INFO] 1,619,511 samples for 21,600 unique patients

  > Loading: CONTACT DIAGNOSTICS...
	[INFO] 5,561,116 samples for 21,438 unique patients

  > Loading: PERSON...
	[INFO] 21,600 samples for 21,600 unique patients

  > Loading: CONTACT PRESCRIPTIONS...
	[INFO] 5,353,553 samples for 20,853 unique patients

  > Loading: SOCIAL CHANGES...
	[INFO] 54,587 samples for 21,600 unique patients

  > Loading: MEASURE CHANGES...
	[INFO] 2,872,457 samples for 19,508 unique patients

  > Loading: ATTRIBUTES CHANGES...
	[INFO] 265,326 samples for 21,600 unique patients

  > Loading: PERSON MEASURE CHANGES...
	[INFO] 2,872,457 samples for 19,508 unique patients

* Filtering out 2,886 inactive patients (13.36% of 21600)

* Begin extraction of interesting features:
Note: statistics display using [INFO] flag is from the current table only and are not combined with info from other tables

  > GENDER:
	[INFO] We have 18,714 unique patients:
		- 52.33% 

In [34]:
# Run the NaN-processing step on the FR / age-70 dataset; the report printed
# below this cell is emitted by processing_nans.
# NOTE(review): the result is bound back to the input name, so re-running this
# cell on its own passes the already-processed dataset in again - confirm that
# is harmless, or re-run the get_dataset cell first.
dataset_FR_70 = processing_nans(
    dataset_FR_70,
    has_another_disease_before_FR_70,
)

* Original dataset:


Unnamed: 0,person_id,diseases,gender_code,person_state_code,duration (days),avg. Alcohol (glasses/day),avg. Tobaco (cigarettes/day),avg. BMI,avg. CHARLSON,N02
# NaN,0.0,728.0,0.0,1.0,1.0,18035.0,18074.0,7809.0,0.0,337.0
NaN (%),0.0,3.89,0.0,0.01,0.01,96.37,96.58,41.73,0.0,1.8


18,714 patients

* Processing NaNs:
	> Excluding 1 patients due to being either dead before the 
	period of interest or always Temporaire / Inactive.

	> Excluding 684 patients who had a disease before the end of
	the period of interest.

		- 83 patients having alzheimer
		- 119 patients having parkinson
		- 37 patients having vascular_dementias
		- 457 patients having mci
		- 2 patients having alcohol_dementias
		- 0 patients having frontotemporal_dementias
		- 23 patients having other_dementias
		- 7 patients having parkinson_dementias
		- 137 patients having all_dementias

	> Filling 106 patients' diseases column for patient having no
	diagnostics (assuming no neurodegenerative diseases).

	> Filling 337 patients' medications columns for patients having
	no prescriptions (assuming no medications were taken).

* After processing:


Unnamed: 0,person_id,diseases,gender_code,person_state_code,duration (days),avg. Alcohol (glasses/day),avg. Tobaco (cigarettes/day),avg. BMI,avg. CHARLSON,N02
# NaN,0.0,0.0,0.0,0.0,0.0,17378.0,17419.0,7608.0,0.0,0.0
NaN (%),0.0,0.0,0.0,0.0,0.0,96.39,96.62,42.2,0.0,0.0


18,029 patients

Diseases:
	- 117 patients having alzheimer
	- 124 patients having parkinson
	- 28 patients having vascular_dementias
	- 518 patients having mci
	- 0 patients having alcohol_dementias
	- 1 patients having frontotemporal_dementias
	- 76 patients having other_dementias
	- 6 patients having parkinson_dementias
	- 203 patients having all_dementias


In [35]:
# Persist the FR / age-70 results; base_dir is already bound to OUTPUT_DIR on
# save_dataset. The trailing positional arguments are the country and the age.
save_dataset(
    dataset_FR_70,
    has_another_disease_before_FR_70,
    inactive_ids_FR_70,
    stats_FR_70,
    'FR',
    70,
)

  df.to_csv(dataset_path, index=False)


In [36]:
# Sanity check: list the entries now present in OUTPUT_DIR (IPython shell
# escape; {OUTPUT_DIR} is interpolated from the Python variable).
!ls {OUTPUT_DIR}

FR_65  FR_70  UK_65  UK_70
