# Cleaning up Data

Goals:

* Load multiple CSVs with glob
* Examine columns and types
* Create a function
* Refactor to ease reading
* Fix lambda/comprehension issue

Data license:
...that any publications resulting from the use of the data include the 
names of the principal investigator responsible for the data collection
at each institution.  They would be:

 1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D.
 2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D.
 3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D.
 4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation:
Robert Detrano, M.D., Ph.D.

In [None]:
# https://archive.ics.uci.edu/ml/datasets/heart+disease
import numpy as np
import pandas as pd

import glob

In [None]:
names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 
         'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

files = glob.glob('data/processed*.data')
df = pd.concat([pd.read_csv(f, sep=',', names=names,
                           dtype_backend='pyarrow', engine='pyarrow')
                for f in files], axis='index', ignore_index=True)
df

## Cleanup

In [None]:
df.describe()

In [None]:
df.select_dtypes('string')

In [None]:
df.select_dtypes(object)

In [None]:
df.dtypes

In [None]:
!cat data/heart-disease.names

## Age

In [None]:
df

In [None]:
df.age.describe()

In [None]:
df.age.hist(figsize=(8,3))

In [None]:
df.age.dtype

In [None]:
(df
 .astype({'age': 'int8[pyarrow]'})
)

## Sex

In [None]:
(df
 .astype({'age': 'int8[pyarrow]'})
 .sex
 .value_counts()
)

In [None]:
# Expect an error here (for demonstration purposes)
# ArrowInvalid: Could not convert 'male' with type str: tried to convert to double

(df
 .astype({'age': 'int8[pyarrow]'})
 .assign(sex=df.sex.replace({1.0: 'male', 0.0:'female'})) 
)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}))
)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}))
 .sex
 .value_counts()
)

## CP - Chest Pain
- Value 1: typical angina
- Value 2: atypical angina
- Value 3: non-anginal pain
- Value 4: asymptomatic

In [None]:
(df
 .astype({'age': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}))
 .cp
)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}))
)

## trestbps - Resting Blood Pressure

In [None]:
(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}))
 .trestbps
 .describe()
)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}))
 .trestbps
 .value_counts()
)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
         trestbps=df.trestbps.replace('?', None).astype('uint8[pyarrow]'))
)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
         trestbps=df.trestbps.replace('?', None).astype('string[pyarrow]').astype('uint8[pyarrow]'))
)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=df.trestbps
         .replace('?', np.nan)
         .astype('string[pyarrow]')
         .astype('float[pyarrow]')
         .astype('int16[pyarrow]')
        ) 
)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=df.trestbps
         .replace('?', np.nan)
         .astype('string[pyarrow]')
         .astype('float[pyarrow]')
         .astype('int16[pyarrow]')
        )
 .trestbps
 .describe()
)

## chol - serum cholesterol (mg/dl)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=df.trestbps
         .replace('?', np.nan)
         .astype('string[pyarrow]')
         .astype('float[pyarrow]')
         .astype('int16[pyarrow]')
        )
 .chol
 .describe()
)

In [None]:
(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=df.trestbps
         .replace('?', np.nan)
         .astype('string[pyarrow]')
         .astype('float[pyarrow]')
         .astype('int16[pyarrow]')
        )
 .chol
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]')
        )
 .chol
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]')
        )
 .chol
 .hist(bins=100, figsize=(8,3))
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]')
        )
 .chol
 .isna()
 .sum()
)

## fbs - Fasting blood sugar (> 120 mg/dl)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]')
        )
 .fbs
 .describe()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]')
        )
 .fbs
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]')         
        )
 .fbs
 .value_counts(dropna=False)
)

## restecg - resting electrocardiographic results
- Value 0: normal
- Value 1: having ST-T wave abnormality (T wave inversions and/or ST 
          elevation or depression of > 0.05 mV)
- Value 2: showing probable or definite left ventricular hypertrophy
          by Estes' criteria

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]')         
        )
 .restecg
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]')         
        )
 .restecg
 .value_counts()
)

## thalach - maximum heart rate achieved

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]')         
        )
 .thalach
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
         
        )
 .thalach
 .value_counts()
)

## exang - exercise induced angina (1 = yes; 0 = no)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
         
        )
 .exang
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
         
         
        )
 .exang
 .value_counts()
)

## oldpeak - ST depression induced by exercise relative to rest

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    return (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
         
         
        )
 .oldpeak
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),           
        )
 .oldpeak
 .value_counts()
)

## slope - the slope of the peak exercise ST segment
- Value 1: upsloping
- Value 2: flat
- Value 3: downsloping

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),           
        )
 .slope
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),   
        slope=lambda df_: remove_question(df_, 
                            'slope', dtype='int8[pyarrow]'),
        )
 .slope
 .value_counts()
)

## ca - number of major vessels (0-3) colored by fluoroscopy

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),   
        slope=lambda df_: remove_question(df_, 
                            'slope', dtype='int8[pyarrow]'),
        )
 .ca
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),   
        slope=lambda df_: remove_question(df_, 
                            'slope', dtype='int8[pyarrow]'),
        ca=lambda df_: remove_question(df_, 
                            'ca', dtype='int8[pyarrow]'),
        )
 .ca
 .value_counts()
)

## thal 

- 3 = normal; 6 = fixed defect; 7 = reversible defect

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),   
        slope=lambda df_: remove_question(df_, 
                            'slope', dtype='int8[pyarrow]'),
        ca=lambda df_: remove_question(df_, 
                            'ca', dtype='int8[pyarrow]'),
        )
 .thal
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),   
        slope=lambda df_: remove_question(df_, 
                            'slope', dtype='int8[pyarrow]'),
        ca=lambda df_: remove_question(df_, 
                            'ca', dtype='int8[pyarrow]'),
        thal=lambda df_: (df_.thal.replace({'3.0': '3',
                                            '7.0': '7', '6.0': '6'})
                             .replace({'?': 'missing', '3': 'normal',
                                       '6': 'fixed', '7': 'reversible'}))
        )
 .thal
 .value_counts()
)

## num - diagnosis of heart disease (angiographic disease status)
- Value 0: < 50% diameter narrowing
- Value 1: > 50% diameter narrowing
(in any major vessel: attributes 59 through 68 are vessels)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),   
        slope=lambda df_: remove_question(df_, 
                            'slope', dtype='int8[pyarrow]'),
        ca=lambda df_: remove_question(df_, 
                            'ca', dtype='int8[pyarrow]'),
        thal=lambda df_: (df_.thal.replace({'3.0': '3',
                                            '7.0': '7', '6.0': '6'})
                             .replace({'?': 'missing', '3': 'normal',
                                       '6': 'fixed', '7': 'reversible'}))
        )
 .num
 .value_counts()
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),   
        slope=lambda df_: remove_question(df_, 
                            'slope', dtype='int8[pyarrow]'),
        ca=lambda df_: remove_question(df_, 
                            'ca', dtype='int8[pyarrow]'),
        thal=lambda df_: (df_.thal.replace({'3.0': '3',
                                            '7.0': '7', '6.0': '6'})
                             .replace({'?': 'missing', '3': 'normal',
                                       '6': 'fixed', '7': 'reversible'}))
        )
 .num
 .dtype
)

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

(df
 .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]',
         'num': 'int8[pyarrow]'})
 .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
        trestbps=lambda df_: remove_question(df_, 
                            'trestbps', dtype='int16[pyarrow]'),
        chol=lambda df_: remove_question(df_, 
                            'chol', dtype='int16[pyarrow]'),
        fbs=lambda df_: remove_question(df_, 
                            'fbs', dtype='bool[pyarrow]'),         
        restecg=lambda df_: remove_question(df_, 
                            'restecg', dtype='int8[pyarrow]'),
        thalach=lambda df_: remove_question(df_, 
                            'thalach', dtype='int16[pyarrow]'),
        exang=lambda df_: remove_question(df_, 
                            'exang', dtype='bool[pyarrow]'),         
        oldpeak=lambda df_: remove_question(df_, 
                            'oldpeak', dtype='float[pyarrow]'),   
        slope=lambda df_: remove_question(df_, 
                            'slope', dtype='int8[pyarrow]'),
        ca=lambda df_: remove_question(df_, 
                            'ca', dtype='int8[pyarrow]'),
        thal=lambda df_: (df_.thal.replace({'3.0': '3',
                                            '7.0': '7', '6.0': '6'})
                             .replace({'?': 'missing', '3': 'normal',
                                       '6': 'fixed', '7': 'reversible'}))
        )
 .num
 .dtype
)

## Memory Usage

In [None]:
df.memory_usage(deep=True).sum()

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

# Same conversions as before, but finish by summing memory_usage(deep=True)
# of the converted frame.
(
    df
    .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]', 'num': 'int8[pyarrow]'})
    .assign(
        sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0': 'female'}),
        trestbps=lambda d: remove_question(d, 'trestbps', dtype='int16[pyarrow]'),
        chol=lambda d: remove_question(d, 'chol', dtype='int16[pyarrow]'),
        fbs=lambda d: remove_question(d, 'fbs', dtype='bool[pyarrow]'),
        restecg=lambda d: remove_question(d, 'restecg', dtype='int8[pyarrow]'),
        thalach=lambda d: remove_question(d, 'thalach', dtype='int16[pyarrow]'),
        exang=lambda d: remove_question(d, 'exang', dtype='bool[pyarrow]'),
        oldpeak=lambda d: remove_question(d, 'oldpeak', dtype='float[pyarrow]'),
        slope=lambda d: remove_question(d, 'slope', dtype='int8[pyarrow]'),
        ca=lambda d: remove_question(d, 'ca', dtype='int8[pyarrow]'),
        # Collapse the '3.0'-style spellings to '3' first so that a single
        # mapping to text labels covers both spellings.
        thal=lambda d: (
            d.thal
            .replace({'3.0': '3', '7.0': '7', '6.0': '6'})
            .replace({'?': 'missing', '3': 'normal', '6': 'fixed', '7': 'reversible'})
        ),
    )
    .memory_usage(deep=True)
    .sum()
)

In [None]:
# Ratio of two byte totals -- presumably the deep memory usage of the raw
# frame over that of the converted frame, copied by hand from the output of
# the two cells above (TODO confirm the literals still match that output).
391_668 / 34_614

## Make a Function

In [None]:
import pandas as pd
import numpy as np

import glob

# Column names handed to read_csv through `names=` for every input file.
names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
         'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']

# glob.glob returns matches in arbitrary order; sort them so the row order of
# the concatenated frame (and the index regenerated by ignore_index=True) is
# the same on every run and every machine.
files = sorted(glob.glob('data/processed*.data'))
df = pd.concat([pd.read_csv(f, sep=',', names=names,
                           dtype_backend='pyarrow', engine='pyarrow')
                for f in files], axis='index', ignore_index=True)
df

In [None]:
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

def tweak_heart(df):
    """Return a new frame with the heart-disease columns of ``df`` cleaned up.

    ``age``, ``cp`` and ``num`` are cast to int8[pyarrow]; the ``sex`` and
    ``thal`` codes are replaced by text labels; nine further columns are
    each passed through ``remove_question`` with their own target dtype.
    """
    def label_thal(frame):
        # Collapse the '3.0'-style spellings to '3' first so that a single
        # mapping to text labels covers both spellings.
        codes = frame.thal.replace({'3.0': '3', '7.0': '7', '6.0': '6'})
        return codes.replace({'?': 'missing', '3': 'normal',
                              '6': 'fixed', '7': 'reversible'})

    typed = df.astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]',
                       'num': 'int8[pyarrow]'})
    sex_labels = (df.sex
                  .astype('string[pyarrow]')
                  .replace({'1.0': 'male', '0.0': 'female'}))
    return typed.assign(
        sex=sex_labels,
        trestbps=lambda frame: remove_question(frame, 'trestbps', dtype='int16[pyarrow]'),
        chol=lambda frame: remove_question(frame, 'chol', dtype='int16[pyarrow]'),
        fbs=lambda frame: remove_question(frame, 'fbs', dtype='bool[pyarrow]'),
        restecg=lambda frame: remove_question(frame, 'restecg', dtype='int8[pyarrow]'),
        thalach=lambda frame: remove_question(frame, 'thalach', dtype='int16[pyarrow]'),
        exang=lambda frame: remove_question(frame, 'exang', dtype='bool[pyarrow]'),
        oldpeak=lambda frame: remove_question(frame, 'oldpeak', dtype='float[pyarrow]'),
        slope=lambda frame: remove_question(frame, 'slope', dtype='int8[pyarrow]'),
        ca=lambda frame: remove_question(frame, 'ca', dtype='int8[pyarrow]'),
        thal=label_thal,
    )

# Run the cleanup on the loaded frame; as the last expression of the cell,
# the returned frame is what the notebook displays.
tweak_heart(df)

In [None]:
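# This doesn't work! The lambdas built in the dict comprehension look up
# `col` and `dtype` when they are called, not when they are created, so every
# one of them ends up using the last (col, dtype) pair from `types`.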
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

# Target pyarrow dtype for each column that is run through remove_question.
types = {
    'trestbps': 'int16[pyarrow]',
    'chol': 'int16[pyarrow]',
    'fbs': 'bool[pyarrow]',
    'restecg': 'int8[pyarrow]',
    'thalach': 'int16[pyarrow]',
    'exang': 'bool[pyarrow]',
    'oldpeak': 'float[pyarrow]',
    'slope': 'int8[pyarrow]',
    'ca': 'int8[pyarrow]',
}
    
def tweak_heart(df):
    """Clean the heart-disease frame -- deliberately broken demonstration.

    The per-column converters are created by a dict comprehension over
    ``types``. Each lambda refers to the comprehension variables ``col`` and
    ``dtype`` by name and only reads them when ``assign`` calls it. By then
    the comprehension has finished, so every lambda sees the last pair in
    ``types`` rather than the pair it was created for.
    """
    return (df
     .astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]',
             'num': 'int8[pyarrow]'})
     .assign(sex=df.sex.astype('string[pyarrow]').replace({'1.0': 'male', '0.0':'female'}),
            # Late binding: `col` and `dtype` are looked up at call time,
            # not captured per iteration.
            **{col: lambda df_: remove_question(df_, col, dtype)
              for col, dtype in types.items()},
            thal=lambda df_: (df_.thal.replace({'3.0': '3',
                                                '7.0': '7', '6.0': '6'})
                                 .replace({'?': 'missing', '3': 'normal',
                                           '6': 'fixed', '7': 'reversible'}))
            )
    )

# Run the version defined in this cell and display its result, so the effect
# of the comprehension-built lambdas can be inspected.
tweak_heart(df)

In [None]:
# Fix by adding default params to lambda
def remove_question(df, col, dtype='int8[pyarrow]'):
    res = (df
            [col]
            .replace('?', np.nan)
            .astype('string[pyarrow]')
            .astype('float[pyarrow]')
          )
    if dtype == 'float[pyarrow]':
        return res
    else:
        return (res
            .astype(dtype)   
           )

# Target pyarrow dtype for each column that is run through remove_question.
types = {
    'trestbps': 'int16[pyarrow]',
    'chol': 'int16[pyarrow]',
    'fbs': 'bool[pyarrow]',
    'restecg': 'int8[pyarrow]',
    'thalach': 'int16[pyarrow]',
    'exang': 'bool[pyarrow]',
    'oldpeak': 'float[pyarrow]',
    'slope': 'int8[pyarrow]',
    'ca': 'int8[pyarrow]',
}
    
def tweak_heart(df):
    """Return a new frame with the heart-disease columns of ``df`` cleaned up.

    ``age``, ``cp`` and ``num`` are cast to int8[pyarrow]; the ``sex`` and
    ``thal`` codes are replaced by text labels; every column listed in
    ``types`` is passed through ``remove_question`` with its target dtype.
    """
    def label_thal(frame):
        # Collapse the '3.0'-style spellings to '3' first so that a single
        # mapping to text labels covers both spellings.
        codes = frame.thal.replace({'3.0': '3', '7.0': '7', '6.0': '6'})
        return codes.replace({'?': 'missing', '3': 'normal',
                              '6': 'fixed', '7': 'reversible'})

    typed = df.astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]',
                       'num': 'int8[pyarrow]'})
    sex_labels = (df.sex
                  .astype('string[pyarrow]')
                  .replace({'1.0': 'male', '0.0': 'female'}))
    # Binding col/dtype as lambda defaults freezes the current pair into each
    # converter, instead of reading the loop variables when assign calls it.
    converters = {
        name: (lambda frame, col=name, dtype=kind: remove_question(frame, col, dtype))
        for name, kind in types.items()
    }
    return typed.assign(sex=sex_labels, **converters, thal=label_thal)

# Run the corrected version on the loaded frame; as the last expression of
# the cell, the returned frame is what the notebook displays.
tweak_heart(df)

In [None]:
# Convert ?'s at once
# Final pyarrow dtype for each column whose '?' placeholders are converted.
types = {
    'trestbps': 'int16[pyarrow]',
    'chol': 'int16[pyarrow]',
    'fbs': 'bool[pyarrow]',
    'restecg': 'int8[pyarrow]',
    'thalach': 'int16[pyarrow]',
    'exang': 'bool[pyarrow]',
    'oldpeak': 'float[pyarrow]',
    'slope': 'int8[pyarrow]',
    'ca': 'int8[pyarrow]',
}
    
def tweak_heart(df):
    """Return a new frame with the heart-disease columns of ``df`` cleaned up.

    ``age``, ``cp`` and ``num`` are cast to int8[pyarrow]; the ``sex`` and
    ``thal`` codes are replaced by text labels. The columns listed in
    ``types`` are converted together: '?' becomes NaN, the values are cast
    through string[pyarrow] to float[pyarrow], and a final ``astype(types)``
    applies each column's target dtype.
    """
    def label_thal(frame):
        # Collapse the '3.0'-style spellings to '3' first so that a single
        # mapping to text labels covers both spellings.
        codes = frame.thal.replace({'3.0': '3', '7.0': '7', '6.0': '6'})
        return codes.replace({'?': 'missing', '3': 'normal',
                              '6': 'fixed', '7': 'reversible'})

    typed = df.astype({'age': 'int8[pyarrow]', 'cp': 'int8[pyarrow]',
                       'num': 'int8[pyarrow]'})
    sex_labels = (df.sex
                  .astype('string[pyarrow]')
                  .replace({'1.0': 'male', '0.0': 'female'}))
    # Handle every column in `types` with one frame-level chain; this takes
    # the place of building one remove_question lambda per column.
    as_float = (df.loc[:, list(types)]
                .replace('?', np.nan)
                .astype('string[pyarrow]')
                .astype('float[pyarrow]'))
    # `**as_float` unpacks the frame column by column into keyword arguments.
    labelled = typed.assign(sex=sex_labels, **as_float, thal=label_thal)
    return labelled.astype(types)

# Run the bulk-conversion version on the loaded frame; as the last expression
# of the cell, the returned frame is what the notebook displays.
tweak_heart(df)