replace loop with vector op

tompollard · Feb 2, 2018 · 60ac9d5 · 60ac9d5
1 parent 85f5097
commit 60ac9d5
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 26 deletions.
diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@
     # Versions should comply with PEP440. For a discussion on single-sourcing
     # the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='0.4.2',                         
+    version='0.4.3',                         
 
     description='TableOne',
     long_description=long_description,

diff --git a/tableone.py b/tableone.py
@@ -4,7 +4,7 @@
 """
 
 __author__ = "Tom Pollard <tpollard@mit.edu>, Alistair Johnson"
-__version__ = "0.4.2"
+__version__ = "0.4.3"
 
 import pandas as pd
 import csv
@@ -161,30 +161,27 @@ def _create_cat_describe(self,data):
                 d_slice = data
             cats = {}
 
-            for v in self.categorical:
-                ds = d_slice[v].astype('category')
-                levels = ds[ds.notnull()].unique().categories.sort_values()
-                df = pd.DataFrame(index = levels)
-                # clean later
-                # add descriptive details
-                df['n'] = ds.count()
-                df['isnull'] = data[v].isnull().sum()
-                df['level'] = levels
-                df = df.merge(ds.value_counts(dropna=True).to_frame().rename(columns= {v:'freq'}),
-                    left_on='level',right_index=True, how='left')
-                df['freq'].fillna(0,inplace=True)
-                df['percent'] = (df['freq'] / df['n']) * 100
-                # set level as index to df
-                df.set_index('level', inplace=True)
-                cats[v] = df
-
-            cats_df = pd.concat(cats)
-            cats_df.index.rename('variable',level=0, inplace=True)
-
-            cats_df['t1_summary'] = cats_df.freq.map(str) \
-                + ' (' + cats_df.percent.apply(round, ndigits=2).map(str) + ')'
-
-            group_dict[g] = cats_df
+            # create a dataframe with freq and percent for each level of each variable
+            df = d_slice[self.categorical].copy()
+            df = df.melt().groupby(['variable','value']).size().to_frame(name='freq')
+            df.index.set_names('level', level=1, inplace=True)
+            df['percent'] = df['freq'].div(df.freq.sum(level=0),level=0)* 100
+
+            # add n column, listing total non-null values for each variable
+            ct = d_slice.count().to_frame(name='n')
+            ct.index.name = 'variable'
+            df = df.join(ct)
+
+            # add null count
+            nulls = d_slice.isnull().sum().to_frame(name='isnull')
+            nulls.index.name = 'variable'
+            df = df.join(nulls)
+
+            # add summary column
+            df['t1_summary'] = df.freq.map(str) + ' (' + df.percent.apply(round, ndigits=2).map(str) + ')'
+
+            # add to dictionary
+            group_dict[g] = df
 
         df_cat = pd.concat(group_dict,axis=1)