diff --git a/setup.py b/setup.py index 27bc4ce..fb408a7 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ # Versions should comply with PEP440. For a discussion on single-sourcing # the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='0.4.2', + version='0.4.3', description='TableOne', long_description=long_description, diff --git a/tableone.py b/tableone.py index 79b2c0d..ba27f60 100644 --- a/tableone.py +++ b/tableone.py @@ -4,7 +4,7 @@ """ __author__ = "Tom Pollard , Alistair Johnson" -__version__ = "0.4.2" +__version__ = "0.4.3" import pandas as pd import csv @@ -161,30 +161,27 @@ def _create_cat_describe(self,data): d_slice = data cats = {} - for v in self.categorical: - ds = d_slice[v].astype('category') - levels = ds[ds.notnull()].unique().categories.sort_values() - df = pd.DataFrame(index = levels) - # clean later - # add descriptive details - df['n'] = ds.count() - df['isnull'] = data[v].isnull().sum() - df['level'] = levels - df = df.merge(ds.value_counts(dropna=True).to_frame().rename(columns= {v:'freq'}), - left_on='level',right_index=True, how='left') - df['freq'].fillna(0,inplace=True) - df['percent'] = (df['freq'] / df['n']) * 100 - # set level as index to df - df.set_index('level', inplace=True) - cats[v] = df - - cats_df = pd.concat(cats) - cats_df.index.rename('variable',level=0, inplace=True) - - cats_df['t1_summary'] = cats_df.freq.map(str) \ - + ' (' + cats_df.percent.apply(round, ndigits=2).map(str) + ')' - - group_dict[g] = cats_df + # create a dataframe with freq, proportion + df = d_slice[self.categorical].copy() + df = df.melt().groupby(['variable','value']).size().to_frame(name='freq') + df.index.set_names('level', level=1, inplace=True) + df['percent'] = df['freq'].div(df.freq.sum(level=0),level=0)* 100 + + # add n column, listing total non-null values for each variable + ct = d_slice.count().to_frame(name='n') + ct.index.name = 'variable' + df = df.join(ct) + + # add null count + nulls = d_slice.isnull().sum().to_frame(name='isnull') + nulls.index.name = 'variable' + df = df.join(nulls) + + # add summary column + df['t1_summary'] = df.freq.map(str) + ' (' + df.percent.apply(round, ndigits=2).map(str) + ')' + + # add to dictionary + group_dict[g] = df df_cat = pd.concat(group_dict,axis=1)